diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 83006280b127cf75f38488f3709e7b27b4cafc33..c57876f1838eeca8c03ac0b7d7c9df9e03bc7d13 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -206,6 +206,15 @@ stages:
        - pip install requests[security] twine
        # Execute the publish script for real this time
        - MB_PYTHON_TAG=$MB_PYTHON_TAG USE_GPG=True GPG_KEYID=$GPG_KEYID TWINE_PASSWORD=$TWINE_PASSWORD TWINE_USERNAME=$TWINE_USERNAME GPG_EXECUTABLE=$GPG_EXECUTABLE CURRENT_BRANCH=release DEPLOY_BRANCH=release TAG_AND_UPLOAD=yes ./publish.sh
+       # Have the server git-tag the release and push the tags
+       - VERSION=$(python -c "import setup; print(setup.VERSION)")
+       # run sed twice to handle https clones both with and without a read token
+       - URL_HOST=$(git remote get-url origin | sed -e 's|https\?://.*@||g' | sed -e 's|https\?://||g')
+       - echo "URL_HOST = $URL_HOST"
+       - git config user.email "ci@gitlab.kitware.com"
+       - git config user.name "Gitlab-CI"
+       - git tag $VERSION -m "tarball tag $VERSION"
+       - git push --tags "https://${GIT_PUSH_TOKEN}@${URL_HOST}"

    only:
        refs:
@@ -337,17 +346,17 @@ test_full/cp35-cp35m-linux:
# ---------------
# Python 2.7 Jobs

-build/cp27-cp27mu-linux:
-    <<:
-        - *build_template
-    image:
-        python:2.7
+#build/cp27-cp27mu-linux:
+#    <<:
+#        - *build_template
+#    image:
+#        python:2.7

-test_full/cp27-cp27mu-linux:
-    <<:
-        - *test_full_template
-    image:
-        python:2.7
+#test_full/cp27-cp27mu-linux:
+#    <<:
+#        - *test_full_template
+#    image:
+#        python:2.7

 #gpgsign/cp27-cp27mu-linux:
 #    <<:
diff --git a/CHANGELOG.md b/CHANGELOG.md
index f970600fd4a341ac09eff026821360cdf0fc8018..d2aa56e47dbef398d075863e7e97c4533d6ddb39 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,7 +3,31 @@
 This changelog follows the specifications detailed in: [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html), although we have not yet reached a `1.0.0` release.

-## Version 0.5.4
+
+## Version 0.5.5
+
+### Added
+* A timeout option in `FitHarn.preferences`
+* Basic gradient logging
+* Several new functions are now registered with `OutputShapeFor` to support efficientnet (`F.pad`, `F.conv2d`, `torch.sigmoid`)
+* Balanced batch samplers
+
+### Changed
+* Hyperparams "name" can now be specified instead of "nice". We will transition
+  from "nice" to "name"; for now both are supported, but "nice" will eventually be deprecated.
+* `FitHarn.preferences` now uses scriptconfig, which means the "help" sections are coupled with the object.
+* Removed explicit support for Python 2.7
+* Reverted the default of `keyboard_debug` to True.
+* Moved `analytic_for`, `output_shape_for`, and `receptive_field_for` to the netharn.analytic subpackage. The original names are still available, but deprecated, and will be removed in a future version.
+* Moved helpers from `netharn.hyperparams` to `netharn.util`
+* Made pytorch-optimizer optional: https://github.com/jettify/pytorch-optimizer
+* netharn will now time out within an epoch
+
+### Fixed
+* Bug when a value in `harn.intervals` was zero.
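+
+As a hypothetical sketch (not from the docs) of the "nice" to "name"
+transition noted above, both spellings currently construct equivalent
+hyperparameters:
+
+    import netharn
+    hyper_old = netharn.HyperParams(nice='demo')  # deprecated spelling
+    hyper_new = netharn.HyperParams(name='demo')  # preferred going forward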
+
+
+## Version 0.5.4 - Released 2020-02-19

 ### Added
 * EfficientNet backbone and Swish activation
diff --git a/README.rst b/README.rst
index 416e0362e7d55cc5b3a78fcfc9ff0c781b6553ff..944cd487948641db68e90fcb0b92afbf3b55a78c 100644
--- a/README.rst
+++ b/README.rst
@@ -6,6 +6,14 @@ NetHarn - a PyTorch Network Harness

 The main webpage for this project is: https://gitlab.kitware.com/computer-vision/netharn

+If you want a framework for your pytorch training loop that
+(1) chooses directory names based on hashes of hyperparameters,
+(2) can write a single-file deployment of your model by statically auto-extracting the in-code definition of the model topology and zipping it with the weights,
+(3) has brief terminal output and rich logging output,
+(4) has rule-based monitoring of validation loss and can reduce the learning rate or stop early,
+(5) has tensorboard and/or matplotlib visualizations of training statistics, and
+(6) is designed to be extended, then you might be interested in NetHarn.
+
 NAME:
     NetHarn (pronounced "net-harn")
 FRAMEWORK:
@@ -29,22 +37,25 @@ BUILTINS:
     - tensorboard metric visualization (optional)
 DESIGN PHILOSOPHY:
     Avoid boilerplate, build it yourself when you need to, and don't repeat yourself.
+    Experiments should be strongly tied to the choice of hyperparameters, and
+    the framework should be able to construct a directory hierarchy based on
+    these hyperparameters.
 SLOGAN:
     Rein and train.
 USAGE PATTERNS:
     (1) Write code for a torch object (i.e. Dataset, Model, Criterion, Initializer, and Scheduler) just as you normally would.
-    (2) Inherit from the ``nh.FitHarn`` object, define ``run_batch``, ``on_batch``, ``on_epoch``, etc...
-    (3) Create an instance of ``nh.HyperParams`` to specify your dataset, model, criterion, etc...
+    (2) Inherit from the ``netharn.FitHarn`` object, define ``run_batch``, ``on_batch``, ``on_epoch``, etc...
+    (3) Create an instance of ``netharn.HyperParams`` to specify your dataset, model, criterion, etc...
     (4) Create an instance of your ``FitHarn`` object with those hyperparameters.
     (5) Then execute its ``run`` method.
     (6) ???
     (7) profit
 EXAMPLES:
-    * ToyData2d classification with nh.models.ToyNet2d (see doctest in netharn/fit_harn.py:__DOC__:0)
-    * MNIST digit classification with MnistNet (examples/mnist.py)
-    * Cifar10 category classification with ResNet50 / dpn91 (examples/cifar.py)
-    * Voc2007+2012 object detection with YOLOv2 (examples/yolo_voc.py)
-    * IBEIS metric learning with SiameseLP (examples/siam_ibeis.py)
+    * ToyData2d classification with netharn.models.ToyNet2d (see doctest in netharn/fit_harn.py:__DOC__:0)
+    * MNIST digit classification with MnistNet (netharn/examples/mnist.py)
+    * Cifar10 category classification with ResNet50 / dpn91 (netharn/examples/cifar.py)
+    * Voc2007+2012 object detection with YOLOv2 (netharn/examples/yolo_voc.py)
+    * IBEIS metric learning with SiameseLP (netharn/examples/siam_ibeis.py)
 STABILITY:
     Mostly harmless. Most tests pass, the current failures are probably not
     critical. I'm able to use it on my machine (tm). In this early stage of
@@ -61,7 +72,7 @@ AUTHORS COMMENTS:
     this results a few times. You can use the code in examples/cifar.py to see
     if you can too (please tell me if you cannot).
     * The YOLO example is based off of EAVise's excellent lightnet (https://gitlab.com/EAVISE/lightnet/) package.
-    * I reimplemented the CocoAPI (see nh.data.coco_api), because I had some
+    * I reimplemented the CocoAPI (see netharn.data.coco_api), because I had some
      (probably minor) issue with the original implementation. I've extended it
      quite a bit, and I'd recommend using it.
     * The metric-learning example requires the ibeis software:
@@ -74,6 +85,52 @@ DEPENDENCIES:
     * xdoctest
     * ... (see requirements.txt)

+
+Features (continued)
+====================
+
+* Hyperparameter tracking: The hash of your hyperparameters determines the
+  directory data will be written to. We also allow for a "nicer" means to
+  manage directory structures. Given a ``HyperParams`` object, we create the
+  symlink ``{workdir}/fit/nice/{nice}``, which points to
+  ``{workdir}/fit/runs/{nice}/{hashid}``.
+
+* Automatic restarts:
+  Calling ``FitHarn.run`` twice restarts training from where you left off by
+  default (as long as the hyperparams haven't changed).
+
+* "Smart" snapshot cleanup:
+  Maintaining model weight files can eat a lot of disk space. Depending on the
+  settings of ``harn.preferences``, ``netharn.FitHarn`` will periodically
+  remove less-recent and lower-scoring snapshots.
+
+* Deployment files:
+  Model weights and architecture are written together as a single
+  reasonably-portable zip-file. We also package training metadata to maintain
+  data provenance and make reproducing experiments easier.
+
+* Restart from any pretrained state:
+  Use ``netharn.initializers.PretrainedInitializer``.
+
+* Utilities for building networks in torch:
+  Layers like ``netharn.layers.ConvNormNd`` make it easy to build networks for
+  n=1, 2, or 3 dimensional data.
+
+* Analytic output shape and receptive field:
+  Netharn defines a ``netharn.layers.AnalyticModule``, which can automatically
+  define ``forward``, ``output_shape_for``, and ``receptive_field_for`` if users
+  define a special ``_output_for`` method, written with the
+  ``netharn.analytic_for.Output``, ``netharn.analytic_for.Hidden``, and
+  ``netharn.analytic_for.OutputFor`` special callables.
+
+* Example tasks:
+  Baseline code for standard tasks like object segmentation, classification,
+  and detection is defined in ``netharn.examples``. The examples also provide
+  example use cases for ``ndsampler``, ``kwimage``, ``kwannot``, and
+  ``kwplot``.
+
+
+
 Installation
 ============
@@ -87,6 +144,15 @@
 setup instructions, but for now they are the same.

     cd ~/code/netharn
     ./run_developer_setup.sh

+
+While all netharn dependencies should be available on pypi (with manylinux2010
+wheels for binary packages), there are other packages that are developed
+concurrently with netharn. To install the development versions of these
+dependencies, run ``python super_setup.py ensure`` to check out the repos and
+ensure they are on the correct branch, ``python super_setup.py develop`` to
+build everything in development mode, and ``python super_setup.py pull`` to
+update to the latest on the branch.
+
 Description
 ===========
@@ -140,13 +206,13 @@ Example:

 The following example is the doctest in ``netharn/fit_harn.py``. It
 demonstrates how to use NetHarn to train a model to solve a toy problem.

-In this toy problem, we do not extend the nh.FitHarn object, so we are using
+In this toy problem, we do not extend the netharn.FitHarn object, so we are using
 the default behavior of ``run_batch``. The default ``on_batch`` and
 ``on_epoch`` do nothing, so loss will be the only measurement of performance.

 For further examples please see the examples directory. These examples show how
-
These example show how -to extend nh.FitHarn to measure performance wrt a particular problem. The +to extend netharn.FitHarn to measure performance wrt a particular problem. The MNIST and CIFAR examples are the most simple. The YOLO example is more complex. The IBEIS example depends on non-public data / software, but can still be useful to look at. Its complexity is more than CIFAR but less than YOLO. @@ -154,51 +220,52 @@ useful to look at. Its complexity is more than CIFAR but less than YOLO. .. code-block:: python - >>> import netharn as nh - >>> hyper = nh.HyperParams(**{ + >>> import netharn + >>> hyper = netharn.HyperParams(**{ >>> # ================ >>> # Environment Components >>> 'workdir' : ub.ensure_app_cache_dir('netharn/demo'), >>> 'nice' : 'demo', - >>> 'xpu' : nh.XPU.cast('auto'), + >>> 'xpu' : netharn.XPU.cast('auto'), >>> # workdir is a directory where intermediate results can be saved >>> # nice symlinks /fit/nice/ -> ../runs/ >>> # XPU auto select a gpu if idle and VRAM>6GB else a cpu >>> # ================ >>> # Data Components >>> 'datasets' : { # dict of plain ol torch.data.Dataset instances - >>> 'train': nh.data.ToyData2d(size=3, border=1, n=256, rng=0), - >>> 'vali': nh.data.ToyData2d(size=3, border=1, n=128, rng=1), - >>> 'test': nh.data.ToyData2d(size=3, border=1, n=128, rng=2), + >>> 'train': netharn.data.ToyData2d(size=3, border=1, n=256, rng=0), + >>> 'vali': netharn.data.ToyData2d(size=3, border=1, n=128, rng=1), + >>> 'test': netharn.data.ToyData2d(size=3, border=1, n=128, rng=2), >>> }, >>> 'loaders' : {'batch_size': 64}, # DataLoader instances or kw >>> # ================ >>> # Algorithm Components >>> # Note the (cls, kw) tuple formatting - >>> 'model' : (nh.models.ToyNet2d, {}), - >>> 'optimizer' : (nh.optimizers.SGD, { + >>> 'model' : (netharn.models.ToyNet2d, {}), + >>> 'optimizer' : (netharn.optimizers.SGD, { >>> 'lr': 0.0001 >>> }), - >>> # focal loss is usually better than nh.criterions.CrossEntropyLoss - >>> 'criterion' : (nh.criterions.FocalLoss, {}), - >>> 'initializer' : (nh.initializers.KaimingNormal, { + >>> # focal loss is usually better than netharn.criterions.CrossEntropyLoss + >>> 'criterion' : (netharn.criterions.FocalLoss, {}), + >>> 'initializer' : (netharn.initializers.KaimingNormal, { >>> 'param': 0, >>> }), >>> # these may receive an overhaul soon - >>> 'scheduler' : (nh.schedulers.ListedScheduler, { + >>> 'scheduler' : (netharn.schedulers.ListedScheduler, { >>> 'points': {'lr': {0: .0001, 2: .01, 5: .015, 6: .005, 9: .001}}, >>> 'interpolation': 'linear', >>> }), - >>> 'monitor' : (nh.Monitor, { + >>> 'monitor' : (netharn.Monitor, { >>> 'max_epoch': 10, >>> }), >>> # dynamics are a config option that modify the behavior of the main >>> # training loop. These parameters effect the learned model. >>> 'dynamics' : {'batch_step': 4}, >>> }) - >>> harn = nh.FitHarn(hyper) + >>> harn = netharn.FitHarn(hyper) >>> # non-algorithmic behavior configs (do not change learned models) - >>> harn.config['prog_backend'] = 'progiter' # alternative: 'tqdm' + >>> harn.preferences['prog_backend'] = 'progiter' # alternative: 'tqdm' + >>> harn.preferences['num_keep'] = 10 >>> # start training. >>> harn.initialize(reset='delete') >>> harn.run() # note: run calls initialize it hasn't already been called. @@ -269,8 +336,8 @@ Running this code produes the following output: INFO: wrote single-file deployment to: '/home/joncrall/.cache/netharn/demo/fit/runs/demo/lnejaaum/deploy_ToyNet2d_lnejaaum_009_GAEYQT.zip' INFO: exiting fit harness. 
-Furthermore, if you were to run that code when `'--verbose' in sys.argv`, then -it would produce this more detailed description of what it was doing: +Furthermore, if you were to run that code when ``'--verbose' in sys.argv``, +then it would produce this more detailed description of what it was doing: .. code-block:: @@ -474,9 +541,6 @@ it would produce this more detailed description of what it was doing: INFO: exiting fit harness. -] - - .. |Pypi| image:: https://img.shields.io/pypi/v/netharn.svg :target: https://pypi.python.org/pypi/netharn diff --git a/analytic/analytic_for.py b/analytic/analytic_for.py new file mode 100644 index 0000000000000000000000000000000000000000..5ef73f1c5554c060bcbbab8121559822b7fa9564 --- /dev/null +++ b/analytic/analytic_for.py @@ -0,0 +1,2 @@ +# TODO: new api +from netharn.analytic.analytic_for import * diff --git a/analytic/output_shape_for.py b/analytic/output_shape_for.py new file mode 100644 index 0000000000000000000000000000000000000000..b792abb86ef2ef5e2e1ccb7383600a108b593717 --- /dev/null +++ b/analytic/output_shape_for.py @@ -0,0 +1,2 @@ +# TODO: new api +from netharn.analytic.output_shape_for import * diff --git a/analytic/receptive_field_for.py b/analytic/receptive_field_for.py new file mode 100644 index 0000000000000000000000000000000000000000..a8f17dd69bdeef3da21f6375817542467bf6a162 --- /dev/null +++ b/analytic/receptive_field_for.py @@ -0,0 +1,2 @@ +# TODO: new api +from netharn.analytic.receptive_field_for import * diff --git a/dev/cifar_notes.txt b/dev/cifar_notes.txt new file mode 100644 index 0000000000000000000000000000000000000000..611429603c10c5e974bab4e4a4b9809598074dfc --- /dev/null +++ b/dev/cifar_notes.txt @@ -0,0 +1,196 @@ + + + + +https://arxiv.org/pdf/1905.11946.pdf#page=10&zoom=100,0,0 + +We train our EfficientNet models on ImageNet using similar settings as (Tan et al., 2019): RMSProp optimizer with decay 0.9 and momentum 0.9; batch norm momentum 0.99; weight decay 1e-5; initial learning rate 0.256 that decays by 0.97 every 2.4 epochs. We also use swish activation (Ramachandran et al., 2018; Elfwing et al., 2018), fixed AutoAugment policy (Cubuk et al., 2019), and stochastic depth (Huang et al., 2016) with survival probability 0.8. As commonly known that bigger models need more regularization, we linearly increase dropout (Srivastava et al., 2014) ratio from 0.2 for EfficientNet-B0 to 0.5 for EfficientNet-B7. + + +Hello Jon, + +Thanks for the interest. I mostly use the same settings (e.g., optimizer, weight decay, batch size) as ImageNet, except changing the learning rate to be 1/4 of the original ImageNet learning rate. One thing to notice is that I scale up all images to be the same as ImageNet size (i.e., 224 for B0). + +Best, +Mingxing + + + +----- + +Cubuk et al., 2019 +https://arxiv.org/pdf/1805.09501.pdf + +The baseline pre-processing follows the convention for +state-of-the-art CIFAR-10 models: standardizing the data, +using horizontal flips with 50% probability, zero-padding +and random crops, and finally Cutout with 16x16 pixels [17, 65, 48, 72]. 
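+
+
+A sketch of that baseline pipeline in torchvision terms (my translation, not
+the paper's code; padding=4 and the CIFAR-10 mean/std constants are the
+commonly used values, and RandomErasing stands in for the 16x16 Cutout):
+
+    import torchvision.transforms as T
+
+    CIFAR10_MEAN = (0.4914, 0.4822, 0.4465)
+    CIFAR10_STD = (0.2470, 0.2435, 0.2616)
+
+    baseline = T.Compose([
+        T.RandomHorizontalFlip(p=0.5),           # horizontal flips with 50% probability
+        T.RandomCrop(32, padding=4),             # zero-padding and random crops
+        T.ToTensor(),
+        T.Normalize(CIFAR10_MEAN, CIFAR10_STD),  # standardizing the data
+        T.RandomErasing(p=1.0, value=0),         # stand-in for Cutout
+    ])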
+ + +Operation 1 Operation 2 +Sub-policy 0 (Invert,0.1,7) (Contrast,0.2,6) +Sub-policy 1 (Rotate,0.7,2) (TranslateX,0.3,9) +Sub-policy 2 (Sharpness,0.8,1) (Sharpness,0.9,3) +Sub-policy 3 (ShearY,0.5,8) (TranslateY,0.7,9) +Sub-policy 4 (AutoContrast,0.5,8) (Equalize,0.9,2) +Sub-policy 5 (ShearY,0.2,7) (Posterize,0.3,7) +Sub-policy 6 (Color,0.4,3) (Brightness,0.6,7) +Sub-policy 7 (Sharpness,0.3,9) (Brightness,0.7,9) +Sub-policy 8 (Equalize,0.6,5) (Equalize,0.5,1) +Sub-policy 9 (Contrast,0.6,7) (Sharpness,0.6,5) +Sub-policy 10 (Color,0.7,7) (TranslateX,0.5,8) +Sub-policy 11 (Equalize,0.3,7) (AutoContrast,0.4,8) +Sub-policy 12 (TranslateY,0.4,3) (Sharpness,0.2,6) +Sub-policy 13 (Brightness,0.9,6) (Color,0.2,8) +Sub-policy 14 (Solarize,0.5,2) (Invert,0.0,3) +Sub-policy 15 (Equalize,0.2,0) (AutoContrast,0.6,0) +Sub-policy 16 (Equalize,0.2,8) (Equalize,0.6,4) +Sub-policy 17 (Color,0.9,9) (Equalize,0.6,6) +Sub-policy 18 (AutoContrast,0.8,4) (Solarize,0.2,8) +Sub-policy 19 (Brightness,0.1,3) (Color,0.7,0) +Sub-policy 20 (Solarize,0.4,5) (AutoContrast,0.9,3) +Sub-policy 21 (TranslateY,0.9,9) (TranslateY,0.7,9) +Sub-policy 22 (AutoContrast,0.9,2) (Solarize,0.8,3) +Sub-policy 23 (Equalize,0.8,8) (Invert,0.1,3) +Sub-policy 24 (TranslateY,0.7,9) (AutoContrast,0.9,1) +Table 7. AutoAugment policy found on reduced CIFAR-10. + + +On CIFAR-10, AutoAugment picks mostly color-based +transformations. For example, the most commonly picked +transformations on CIFAR-10 are Equalize, AutoContrast, +Color, and Brightness (refer to Table 1 in the Appendix for +their descriptions). Geometric transformations like ShearX +and ShearY are rarely found in good policies. Furthermore, +the transformation Invert is almost never applied in a successful policy. + +----- +Tan et al., 2019 + + +https://arxiv.org/pdf/1807.11626.pdf + +For full ImageNet training, we use RMSProp optimizer +with decay 0.9 and momentum 0.9. Batch norm is added +after every convolution layer with momentum 0.99, and +weight decay is 1e-5. Dropout rate 0.2 is applied to the last +layer. Following [7], learning rate is increased from 0 to +0.256 in the first 5 epochs, and then decayed by 0.97 every +2.4 epochs. We use batch size 4K and Inception preprocessing with image size 224×224. For COCO training, we plug +our learned model into SSD detector [22] and use the same +settings as [29], including input size 320 × 320. 
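+
+
+A quick numeric sketch (mine, not the papers') of that schedule: linear warmup
+from 0 to 0.256 over the first 5 epochs, then a 0.97 staircase decay every 2.4
+epochs:
+
+    def rmsprop_lr(epoch, peak=0.256, warmup=5, gamma=0.97, period=2.4):
+        if epoch < warmup:
+            return peak * epoch / warmup
+        return peak * gamma ** ((epoch - warmup) // period)
+
+    # epoch 0 -> 0.0, epoch 5 -> 0.256, epoch 10 -> 0.256 * 0.97**2 ~= 0.241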
+ + + + + +python -m netharn.examples.cifar --nice=efficientnet_wip-v1 \ + --xpu=0 \ + --arch=efficientnet-b0 --optim=adamw \ + --schedule=onecycle250-p150 \ + --init=cls \ + --batch_size=2048 --lr=0.01 --decay=1e-4 + +python -m netharn.examples.cifar --nice=efficientnet_wip-v1-continue \ + --xpu=0 \ + --arch=efficientnet-b0 --optim=sgd \ + --schedule=onecycle250-p20 \ + --batch_size=128 --lr=0.001 --decay=1e-4 \ + --init=pretrained \ + --pretrained=/home/joncrall/work/cifar/fit/nice/efficientnet_wip-v1/torch_snapshots/_epoch_00000020.pt + +python -m netharn.examples.cifar --nice=efficientnet_wip-v1-continue-alt \ + --xpu=0 \ + --arch=efficientnet-b0 --optim=sgd \ + --schedule=onecycle250-p20 \ + --batch_size=128 --lr=0.001 --decay=1e-4 \ + --init=pretrained \ + --pretrained=/home/joncrall/work/cifar/fit/nice/efficientnet_wip-v1/torch_snapshots/_epoch_00000020.pt + +python -m netharn.examples.cifar --nice=efficientnet_wip-v1-continue-alt4 \ + --xpu=0 \ + --arch=efficientnet-b0 --optim=adamw \ + --schedule=Exponential-g0.98-s1 \ + --batch_size=64 --lr=0.00001 --decay=1e-4 \ + --init=pretrained \ + --pretrained=/home/joncrall/work/cifar/fit/nice/efficientnet_wip-v1/torch_snapshots/_epoch_00000020.pt + +python -m netharn.examples.cifar --nice=efficientnet_wip-v2 \ + --xpu=0 \ + --arch=efficientnet-b0 --optim=adamw \ + --schedule=onecycle250-p15 \ + --init=cls \ + --batch_size=2048 --lr=0.01 --decay=1e-4 + +python -m netharn.examples.cifar --nice=efficientnet_wip-v2 \ + --xpu=0 \ + --arch=efficientnet-b0 --optim=adamw \ + --schedule=onecycle250-p15 \ + --init=cls \ + --batch_size=2048 --lr=0.01 --decay=1e-4 + +python -m netharn.examples.cifar --nice=efficientnet_wip-v3 \ + --xpu=0 \ + --arch=efficientnet-b0 --optim=adamw \ + --schedule=onecycle250-p10 \ + --init=cls \ + --batch_size=512 --lr=0.01 --decay=1e-4 + +python -m netharn.examples.cifar --nice=efficientnet_wip-v4 \ + --xpu=0 \ + --arch=efficientnet-b0 --optim=adamw \ + --schedule=onecycle250-p10 \ + --init=cls \ + --batch_size=1024 --lr=0.001 --decay=1e-4 + +python -m netharn.examples.cifar --nice=efficientnet_wip-v5 \ + --xpu=0 \ + --arch=efficientnet-b0 --optim=adamw \ + --schedule=onecycle100-p10 \ + --init=cls \ + --batch_size=1024 --lr=0.02 --decay=1e-4 + +python -m netharn.examples.cifar --nice=efficientnet_wip-v6 \ + --xpu=0 \ + --arch=efficientnet-b0 --optim=adamw \ + --schedule=onecycle350-p5 \ + --init=cls \ + --batch_size=2048 --lr=0.003 --decay=5e-5 + +python -m netharn.examples.cifar --nice=efficientnet_wip-v8 \ + --xpu=0 \ + --arch=efficientnet-b3 --optim=adamw \ + --schedule=onecycle350-p5 \ + --init=noop \ + --batch_size=128 --lr=0.001 --decay=1e-4 + + + + python -m netharn.examples.cifar --xpu=0 --nice=resnet50_batch128 --arch=resnet50 --optim=sgd --schedule=step-150-250 --lr=0.1 --batch_size=128 + + python -m netharn.examples.cifar --xpu=0 --nice=efficientnet_scratch-v4 --arch=efficientnet-b0 --optim=sgd --schedule=step-150-250 --lr=0.01 --init=noop --decay=1e-5 + python -m netharn.examples.cifar --xpu=0 --nice=efficientnet_scratch-v5 --arch=efficientnet-b0 --optim=sgd --schedule=step-30-200 --lr=0.01 --init=noop --decay=1e-5 + + python -m netharn.examples.cifar --xpu=0 --nice=efficientnet \ + --arch=efficientnet-b0 --optim=rmsprop --lr=0.064 \ + --batch_size=512 --max_epoch=120 --schedule=Exponential-g0.97-s2 + + python -m netharn.examples.cifar --xpu=0 --nice=efficientnet-scratch3 \ + --arch=efficientnet-b0 --optim=adamw --lr=0.016 --init=noop \ + --batch_size=1024 --max_epoch=450 --schedule=Exponential-g0.96-s3 
--decay=1e-5 + + python -m netharn.examples.cifar --xpu=0 --nice=efficientnet-pretrained2 \ + --arch=efficientnet-b0 --optim=adamw --lr=0.0064 --init=cls \ + --batch_size=512 --max_epoch=350 --schedule=Exponential-g0.97-s2 --decay=0 + + python -m netharn.examples.cifar --xpu=0 --nice=efficientnet-pretrained6 \ + --arch=efficientnet-b0 --optim=sgd --lr=0.016 --init=cls \ + --batch_size=1024 --max_epoch=350 --schedule=Exponential-g0.97-s3 --decay=1e-5 + + python -m netharn.examples.cifar --xpu=0 --nice=efficientnet-pretrained7 \ + --arch=efficientnet-b0 --optim=sgd --lr=0.016 --init=cls \ + --batch_size=1024 --max_epoch=350 --schedule=Exponential-g0.97-s3 --decay=1e-5 --bstep=4 + + python -m netharn.examples.cifar --xpu=0 --nice=efficientnet-pretrained7 \ + --arch=efficientnet-b0 --optim=sgd --lr=0.016 --init=cls \ + --batch_size=1024 --max_epoch=350 --schedule=step-30-100 --decay=1e-5 --bstep=4 diff --git a/netharn/__init__.py b/netharn/__init__.py index 1ab2e0aba5eb8cfcaa903b94be0e4849327f84d2..65a9b202c04cdfabd99e4f99c0d6d0e655ad2628 100644 --- a/netharn/__init__.py +++ b/netharn/__init__.py @@ -4,7 +4,7 @@ mkinit netharn --noattrs --dry mkinit netharn --noattrs """ -__version__ = '0.5.4' +__version__ = '0.5.5' try: # PIL 7.0.0 removed PIL_VERSION, which breaks torchvision, monkey patch it @@ -22,6 +22,7 @@ try: except AttributeError: pass + from netharn.api import ( Initializer, Optimizer, Criterion, Loaders, Scheduler, Dynamics, configure_hacks, configure_workdir, @@ -30,10 +31,10 @@ from netharn.device import (XPU,) from netharn.fit_harn import (FitHarn,) from netharn.hyperparams import (HyperParams,) from netharn.monitor import (Monitor,) -from netharn.output_shape_for import (OutputShapeFor, OutputShape, - HiddenShapes) -from netharn.receptive_field_for import (ReceptiveFieldFor, ReceptiveField, - HiddenFields) +from netharn.analytic.output_shape_for import ( + OutputShapeFor, OutputShape, HiddenShapes) +from netharn.analytic.receptive_field_for import ( + ReceptiveFieldFor, ReceptiveField, HiddenFields) __extra_all__ = [ 'Initializer', @@ -62,7 +63,6 @@ __extra_all__ = [ ## AUTOGENERATED AFTER THIS POINT # -from netharn import analytic_for from netharn import api from netharn import criterions from netharn import data @@ -78,11 +78,12 @@ from netharn import mixins from netharn import models from netharn import monitor from netharn import optimizers -from netharn import output_shape_for from netharn import prefit -from netharn import receptive_field_for from netharn import schedulers from netharn import util +from netharn.analytic import analytic_for +from netharn.analytic import output_shape_for +from netharn.analytic import receptive_field_for __all__ = ['Criterion', 'Dynamics', 'FitHarn', 'HiddenFields', 'HiddenShapes', 'HyperParams', 'Initializer', 'Initializer', 'Loaders', 'Monitor', diff --git a/netharn/analytic/__init__.py b/netharn/analytic/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/netharn/analytic/analytic_for.py b/netharn/analytic/analytic_for.py new file mode 100644 index 0000000000000000000000000000000000000000..9c413eb360cde22e31b575e4475047a8cdbe46d8 --- /dev/null +++ b/netharn/analytic/analytic_for.py @@ -0,0 +1,140 @@ +""" +Code for commonalities between "X for" objects that compute analytic properties +of networks like OutputShapeFor and ReceptiveFieldFor +""" +import ubelt as ub +from collections import OrderedDict + + +class Hidden(OrderedDict, ub.NiceRepr): + """ Object 
for storing hidden states of analytic computation """
+
+    def __nice__(self):
+        return ub.repr2(self, nl=0)
+
+    def __str__(self):
+        return ub.NiceRepr.__str__(self)
+
+    def __repr__(self):
+        return ub.NiceRepr.__repr__(self)
+
+    def __setitem__(self, key, value):
+        if getattr(value, 'hidden', None) is not None:
+            # When setting a value to an OutputShape object, if that object has
+            # a hidden shape, then use that instead.
+            value = value.hidden
+        return OrderedDict.__setitem__(self, key, value)
+
+    def shallow(self, n=1):
+        """
+        Grabs only the shallowest n layers of hidden shapes
+        """
+        if n == 0:
+            last = self
+            while hasattr(last, 'shallow'):
+                values = list(last.values())
+                if len(values):
+                    last = values[-1]
+                else:
+                    break
+            return last
+        else:
+            output = OrderedDict()
+            for key, value in self.items():
+                # if isinstance(value, HiddenShapes):
+                if hasattr(value, 'shallow'):
+                    value = value.shallow(n - 1)
+                output[key] = value
+            return output
+
+
+class OutputFor(object):
+    """
+    Analytic base / identity class
+    """
+    def __init__(self, func):
+        self.func = func
+
+    def __call__(self, *args, **kw):
+        return self.func(*args, **kw)
+
+
+class Output(object):
+    """
+    Analytic base / identity class
+    """
+    @classmethod
+    def coerce(cls, data=None, hidden=None):
+        return data
+
+
+class ForwardFor(OutputFor):
+    """
+    Analytic version of forward functions
+    """
+    def __init__(self, func):
+        self.func = func
+
+    def __call__(self, *args, **kw):
+        return self.func(*args, **kw)
+
+    @staticmethod
+    def getitem(arr):
+        """
+        Wraps getitem calls
+
+        Example:
+            >>> import torch
+            >>> arr = torch.rand(2, 16, 2, 2)
+            >>> result = ForwardFor.getitem(arr)[:, 0:4]
+            >>> assert result.shape == (2, 4, 2, 2)
+        """
+        return _ForwardGetItem(arr)
+
+    @staticmethod
+    def view(arr, *args):
+        """
+        Wraps view calls
+
+        Example:
+            >>> import torch
+            >>> arr = torch.rand(2, 16, 2, 2)
+            >>> result = ForwardFor.view(arr, -1)
+        """
+        return arr.view(*args)
+
+    @staticmethod
+    def shape(arr):
+        """
+        Wraps shape calls
+
+        Example:
+            >>> import torch
+            >>> arr = torch.rand(2, 16, 2, 2)
+            >>> result = ForwardFor.shape(arr)
+        """
+        return arr.shape
+
+    @staticmethod
+    def add(arr1, arr2):
+        return arr1 + arr2
+
+    @staticmethod
+    def mul(arr1, arr2):
+        return arr1 * arr2
+
+    @staticmethod
+    def sub(arr1, arr2):
+        return arr1 - arr2
+
+    @staticmethod
+    def div(arr1, arr2):
+        return arr1 / arr2
+
+
+class _ForwardGetItem(object):
+    def __init__(self, inp):
+        self.inp = inp
+
+    def __getitem__(self, slices):
+        return self.inp.__getitem__(slices)
diff --git a/netharn/analytic/output_shape_for.py b/netharn/analytic/output_shape_for.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8398b07525f4d16ac40681346a162d8c6c7feb4
--- /dev/null
+++ b/netharn/analytic/output_shape_for.py
@@ -0,0 +1,1199 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+import ubelt as ub
+import math
+import torch
+import torch.nn.functional as F
+import torch.nn as nn
+import torchvision
+from collections import OrderedDict
+from six.moves import builtins
+from netharn.analytic import analytic_for
+# try:
+from netharn.device import DataSerial
+# except ImportError:
+#     DataSerial = None
+
+REGISTERED_TYPES = []
+
+
+SHAPE_CLS = tuple  # We expect shapes to be specified as this class
+
+
+def compute_type(*types):
+    def _wrap(func):
+        for type in types:
+            if type is not None:
+                REGISTERED_TYPES.append((type, func))
+        return func
+    return _wrap
+
+
+def 
output_shape_of(outputs):
+    """
+    Given a network output, try and find the shape. Works in most standard
+    cases, but not all cases.
+
+    Args:
+        outputs (Tensor | Dict | Tuple): some typical torch network output
+
+    Example:
+        >>> output_shape_of(torch.empty(3, 2))
+        (3, 2)
+        >>> output_shape_of({'a': torch.empty(3, 2)})
+        {'a': (3, 2)}
+        >>> output_shape_of(((torch.empty(3, 2),),))
+        [[(3, 2)]]
+    """
+    if torch.is_tensor(outputs):
+        computed_output_shape = SHAPE_CLS(outputs.shape)
+    elif isinstance(outputs, dict):
+        dict_cls = outputs.__class__  # handle odict
+        computed_output_shape = dict_cls([
+            (k, output_shape_of(v)) for k, v in outputs.items()])
+    elif isinstance(outputs, tuple):
+        # Allow outputs to be a tuple of tensors
+        computed_output_shape = [output_shape_of(o) for o in outputs]
+    else:
+        raise TypeError('Cannot find shape of {!r}'.format(type(outputs)))
+    return computed_output_shape
+
+
+def _brute_force_output_shape_for(self, input_shape):
+    """
+    Computes output shape by actually running the network. Works in most
+    standard cases, but not all cases. If the batch size is None, we attempt
+    to be smart about ensuring that None is propagated in the output.
+
+    Example:
+        >>> module = nn.Conv2d(3, 11, 3, 1, 0)
+        >>> _brute_force_output_shape_for(module, (None, 3, 256, 256))
+        (None, 11, 254, 254)
+    """
+    _input_shape = list(input_shape)
+    unknown_bsize = _input_shape[0] is None
+    if unknown_bsize:
+        bsize = 2
+        _input_shape[0] = bsize
+    device = next(iter(self.state_dict().values())).device
+    dummy_input = torch.rand(*_input_shape).to(device)
+    dummy_output = self(dummy_input)
+    output_shape = output_shape_of(dummy_output)
+    if torch.is_tensor(dummy_output):
+        if unknown_bsize:
+            if output_shape[0] == bsize:
+                output_shape = list(output_shape)
+                output_shape[0] = None
+        output_shape = SHAPE_CLS(output_shape)
+    else:
+        raise NotImplementedError('other output types')
+    return output_shape
+
+
+def _simplify(shape):
+    import sympy
+    if isinstance(shape, (tuple, list)):
+        shape = shape.__class__([_simplify(v) for v in shape])
+    elif isinstance(shape, dict):
+        shape = shape.__class__([(k, _simplify(v)) for k, v in shape.items()])
+    elif isinstance(shape, sympy.Expr):
+        shape = sympy.simplify(shape)
+    return shape
+
+
+class HiddenShapes(analytic_for.Hidden):
+    """
+    Augments normal hidden shape dicts with a convenience setitem
+
+    Doctest:
+        >>> from netharn.analytic.output_shape_for import *
+        >>> shape = OutputShape.coerce([None, 3, 32, 32], 'foo')
+        >>> print(HiddenShapes({'e': shape}))
+        <HiddenShapes({'e': 'foo'})>
+        >>> hidden = HiddenShapes({'a': 1})
+        >>> hidden['b'] = 2
+        >>> hidden['c'] = shape
+        >>> print(hidden)
+        <HiddenShapes({'a': 1, 'b': 2, 'c': 'foo'})>
+    """
+    pass
+
+
+class OutputShape(analytic_for.Output):
+    """
+    Mixin class to extend output shapes with extra information
+
+    Doctest:
+        >>> from netharn.analytic.output_shape_for import *
+        >>> shape = OutputShape.coerce([None, 3, 32, 32], 'foo')
+        >>> print('shape = {!r}'.format(shape))
+        shape = (None, 3, 32, 32)
+        >>> print('shape.hidden = {!r}'.format(shape.hidden))
+        shape.hidden = 'foo'
+    """
+    def __init__(self, data=None, hidden=None):
+        self.data = data
+        self.hidden = hidden
+
+    @classmethod
+    def template(cls, type):
+        """ Get a specific template for a subclass type """
+        if type is tuple:
+            return OutputShapeTuple
+        elif type is OrderedDict:
+            return OutputShapeDict
+        elif type is dict:
+            return OutputShapeDict
+        else:
+            raise TypeError(type)
+
+    @classmethod
+    def coerce(cls, data=None, hidden=None):
+        """
+        Create an OutputShape instance of the appropriate subclass given the
+        type of the input data.
+        """
+        if isinstance(data, cls):
+            if hidden is None:
+                self = data
+            else:
+                self = data.__class__(data, hidden)
+        elif isinstance(data, (tuple, list)):
+            self = cls.template(tuple)(data, hidden)
+        elif isinstance(data, dict):
+            self = cls.template(dict)(data, hidden)
+        else:
+            raise TypeError(type(data))
+        return self
+
+
+class OutputShapeTuple(tuple, OutputShape):
+    """ OutputShape templated as a tuple """
+    def __new__(cls, data=None, hidden=None):
+        # tuple subclass is a bit weird
+        if data is None:
+            data = tuple()
+        self = tuple.__new__(OutputShapeTuple, data)
+        OutputShape.__init__(self, data, hidden)
+        return self
+
+
+class OutputShapeDict(OrderedDict, OutputShape):
+    """ OutputShape templated as a dictionary """
+    def __init__(self, data=None, hidden=None):
+        if data is None:
+            data = OrderedDict()
+        OrderedDict.__init__(self, data)
+        OutputShape.__init__(self, data, hidden)
+
+
+class OutputShapeFor(analytic_for.OutputFor):
+    """
+    Compute the output shape for standard torch modules as well as
+    any custom modules that follow the OutputShapeFor protocol.
+
+    Notes:
+        The OutputShapeFor protocol is simple. For any custom torch module,
+        define the method `output_shape_for(self, input_shape)`, which is
+        typically written to mirror the `forward` function. Instead of calling
+        forward on the custom module's torch members, use `OutputShapeFor`. See
+        netharn.layers for more examples of custom layers that implement this
+        protocol. A simple example is shown below.
+
+    Example:
+        >>> # Example showing how to implement the OutputShapeFor protocol
+        >>> class MyCustomNet(nn.Module):
+        >>>     def __init__(self):
+        >>>         super(MyCustomNet, self).__init__()
+        >>>         self.conv1 = nn.Conv2d(1, 5, 3)
+        >>>         self.pool1 = nn.MaxPool2d(2)
+        >>>         self.conv2 = nn.Conv2d(5, 7, 3)
+        >>>     def forward(self, input):
+        >>>         x = input
+        >>>         x = self.conv1(x)
+        >>>         x = self.pool1(x)
+        >>>         x = self.conv2(x)
+        >>>         return x
+        >>>     def output_shape_for(self, input_shape):
+        >>>         x = input_shape
+        >>>         # Note using hidden shapes is optional, but sometimes useful
+        >>>         hidden = HiddenShapes()
+        >>>         # The basic idea is to simply mirror the forward func,
+        >>>         # but instead of calling the modules use OutputShapeFor
+        >>>         hidden['conv1'] = x = OutputShapeFor(self.conv1)(x)
+        >>>         hidden['pool1'] = x = OutputShapeFor(self.pool1)(x)
+        >>>         hidden['conv2'] = x = OutputShapeFor(self.conv2)(x)
+        >>>         shape = OutputShape.coerce(x, hidden)
+        >>>         return shape
+        >>> net = MyCustomNet()
+        >>> # Now it is very easy and efficient to infer the output shape
+        >>> input_shape = (None, 1, 9, 9)
+        >>> net.output_shape_for(input_shape)
+        (None, 7, 1, 1)
+        >>> # The OutputShapeFor class now recognizes your module as well,
+        >>> # so it can be used to construct more complex modules while
+        >>> # still maintaining the ability to infer the output shape.
+        >>> OutputShapeFor(net)(input_shape)
+        (None, 7, 1, 1)
+        >>> # Note that if you did return a true OutputShape object with
+        >>> # a populated hidden shape attribute, then you can access it
+        >>> # to inspect how the shape changes in the hidden layers of the net
+        >>> print(OutputShapeFor(net)(input_shape).hidden)
+        <HiddenShapes({'conv1': (None, 5, 7, 7), 'pool1': (None, 5, 3, 3), 'conv2': (None, 7, 1, 1)})>
+
+    Example:
+        >>> # Example showing how this class is used on basic torch Modules
+        >>> module = nn.Conv2d(3, 11, 3, 1, 0)
+        >>> OutputShapeFor(module)((1, 3, 256, 256))
+        (1, 11, 254, 254)
+    """
+    math = math  # for hacking in sympy
+
+    def __init__(self, module, force=False):
+        """
+        Args:
+            module (nn.Module): module with an output_shape_for func or
+                with some known registered type (e.g. torch.nn.Conv2d).
+
+            force (bool): if True and no implicit computation is known,
+                try to create a dummy input with input_shape and simply
+                run it through the network to see what shape it produces.
+                (Defaults to False).
+        """
+        self._requires_force = False
+        self.module = module
+        # First try to look up the output_shape_for func
+        self._func = getattr(module, 'output_shape_for', None)
+
+        if self._func is None:
+            # Look up the shape func by registered type if we can't find it
+            found = []
+            for type, _func in REGISTERED_TYPES:
+                try:
+                    if module is type or isinstance(module, type):
+                        found.append(_func)
+                except TypeError:
+                    pass
+            if len(set(found)) == 1:
+                self._func = found[0]
+            elif len(found) == 0:
+                raise TypeError('Unknown (output_shape) module type {}'.format(module))
+            else:
+                raise AssertionError('Ambiguous (output_shape) module {}. 
Found {}'.format(module, found)) + + def __call__(self, *args, **kwargs): + if isinstance(self.module, nn.Module): + # bound methods dont need module + is_bound = hasattr(self._func, '__func__') and getattr(self._func, '__func__', None) is not None + is_bound |= hasattr(self._func, 'im_func') and getattr(self._func, 'im_func', None) is not None + if is_bound: + output_shape = self._func(*args, **kwargs) + else: + # nn.Module with state + output_shape = self._func(self.module, *args, **kwargs) + else: + # a simple pytorch func + output_shape = self._func(*args, **kwargs) + + # Package the output shape up in the appropriate wrapper class + output_shape = OutputShape.coerce(output_shape) + # if self.math.__name__ == 'sympy': + # output_shape = _simplify(output_shape) + # debug = True + # if debug: + # print('{}.output_shape = {}'.format(str(self._func.__name__), output_shape)) + return output_shape + + def _check_consistency(self, input_shape, **kwargs): + """ + Test function to check that expected shape is equal to computed shape. + The kwargs are passed to both output_shape_for and forward, so ensure + that both functions accept the same arguments. + """ + # Run the output shape computation + expected = self(input_shape, **kwargs) + + if isinstance(expected, OutputShape): + expected_output_shape = expected.data + else: + expected_output_shape = expected + + # Create dummy inputs and send them through the network + inputs = torch.randn(input_shape) + with torch.no_grad(): + self.module.eval() + outputs = self.module(inputs, **kwargs) + + if isinstance(outputs, dict): + if not isinstance(expected_output_shape, dict): + raise AssertionError(( + 'if outputs is a dict, then output_shape must also be ' + 'a corresponding dict. Instead we got: ' + 'type(outputs)={} ' + 'type(expected_output_shape)={} ' + ).format(type(outputs), type(expected_output_shape))) + computed_output_shape = output_shape_of(outputs) + + if computed_output_shape != expected_output_shape: + print('expected_output_shape = {}'.format(ub.repr2(expected_output_shape, nl=0))) + print('computed_output_shape = {}'.format(ub.repr2(computed_output_shape, nl=0))) + raise AssertionError( + 'computed shape {!r} != expected shape {!r}'.format( + computed_output_shape, + expected_output_shape, + ) + ) + return expected_output_shape + + @staticmethod + @compute_type(nn.Upsample) + def Upsample(module, input_shape): + r""" + - Input: :math:`(N, C, H_{in}, W_{in})` + - Output: :math:`(N, C, H_{out}, W_{out})` where + :math:`H_{out} = floor(H_{in} * scale\_factor)` + :math:`W_{out} = floor(W_{in} * scale\_factor)` + + Example: + >>> # xdoctest: +SKIP + >>> # There is a torch bug in 1.1.0 that prevents this from working + >>> from netharn.analytic.output_shape_for import * + >>> input_shape = (1, 3, 256, 256, 256) + >>> module = nn.Upsample(scale_factor=(2, 3, 4)) + >>> output_shape = OutputShapeFor(module)(input_shape) + >>> print('output_shape = {!r}'.format(output_shape)) + output_shape = (1, 3, 512, 768, 1024) + >>> module = nn.Upsample(size=100) + >>> output_shape = OutputShapeFor(module)(input_shape) + >>> print('output_shape = {!r}'.format(output_shape)) + output_shape = (1, 3, 100, 100, 100) + >>> input_shape = (1, 3, 256, 256) + >>> module = nn.UpsamplingBilinear2d(scale_factor=2) + >>> output_shape = OutputShapeFor(module)(input_shape) + >>> print('output_shape = {!r}'.format(output_shape)) + output_shape = (1, 3, 512, 512) + """ + math = OutputShapeFor.math + # N, C, *DIMS_in = input_shape + N, C = input_shape[0:2] + DIMS_in = 
input_shape[2:] + + if module.size is None: + scale_factor = ensure_iterablen(module.scale_factor, len(DIMS_in)) + int = builtins.int if math.__name__ == 'math' else ub.identity + DIMS_out = [ + int(math.floor(D_in * scale_factor[i])) + for i, D_in in enumerate(DIMS_in) + ] + else: + DIMS_out = ensure_iterablen(module.size, len(DIMS_in)) + + output_shape = SHAPE_CLS([N, C] + list(DIMS_out)) + if math.__name__ == 'sympy': + output_shape = _simplify(output_shape) + return output_shape + + @staticmethod + @compute_type(torch.nn.functional.interpolate) + def interpolate(input_shape, size=None, scale_factor=None, **kwargs): + """ + Example: + >>> from netharn.analytic.output_shape_for import * + >>> input_shape = (1, 3, 256, 256) + >>> output_shape = OutputShapeFor(torch.nn.functional.interpolate)(input_shape, size=(32, 32)) + >>> print('output_shape = {!r}'.format(output_shape)) + output_shape = (1, 3, 32, 32) + """ + math = OutputShapeFor.math + # N, C, *DIMS_in = input_shape + N, C = input_shape[0:2] + DIMS_in = input_shape[2:] + + if size is None: + scale_factor = ensure_iterablen(scale_factor, len(DIMS_in)) + int = builtins.int if math.__name__ == 'math' else ub.identity + DIMS_out = [ + int(math.floor(D_in * scale_factor[i])) + for i, D_in in enumerate(DIMS_in) + ] + else: + DIMS_out = ensure_iterablen(size, len(DIMS_in)) + + output_shape = SHAPE_CLS([N, C] + list(DIMS_out)) + if math.__name__ == 'sympy': + output_shape = _simplify(output_shape) + return output_shape + + @staticmethod + @compute_type(nn.ConvTranspose1d) + def conv1dT(module, input_shape): + return OutputShapeFor.convndT(module, input_shape, 1) + + @staticmethod + @compute_type(nn.ConvTranspose2d) + def conv2dT(module, input_shape): + return OutputShapeFor.convndT(module, input_shape, 2) + + @staticmethod + @compute_type(nn.ConvTranspose3d) + def conv3dT(module, input_shape): + return OutputShapeFor.convndT(module, input_shape, 3) + + @staticmethod + @compute_type(nn.Conv1d) + def conv1d(module, input_shape): + return OutputShapeFor.convnd(module, input_shape, 1) + + @staticmethod + @compute_type(nn.Conv2d) + def conv2d(module, input_shape): + return OutputShapeFor.convnd(module, input_shape, 2) + + @staticmethod + @compute_type(nn.ZeroPad2d) + def zeropad2d(module, input_shape): + r""" + Shape: + - Input: :math:`(N, C, H_{in}, W_{in})` + - Output: :math:`(N, C, H_{out}, W_{out})` where + + :math:`H_{out} = H_{in} + \text{padding\_top} + \text{padding\_bottom}` + + :math:`W_{out} = W_{in} + \text{padding\_left} + \text{padding\_right}` + + Example: + >>> module = nn.ZeroPad2d([2, 3, 5, 7]) + >>> input_shape = (1, 3, 5, 7) + >>> out = OutputShapeFor(module)(input_shape) + >>> out_want = module(torch.zeros(*input_shape)).shape + >>> assert out == tuple(out_want) + """ + return OutputShapeFor.pad(input_shape, module.padding) + + @staticmethod + @compute_type(F.conv2d) + def f_conv2d(inputs, weight, bias=None, stride=1, padding=0, + dilation=1, groups=1): + """ + Example: + >>> x = inputs = (1, 124, 226, 226) + >>> module = nn.Conv2d(128, 64, kernel_size=(3, 5), groups=8) + >>> weight = module.weight + >>> bias = module.bias is not None + >>> stride = module.stride + >>> padding = module.padding + >>> dilation = module.dilation + >>> groups = module.groups + >>> y = OutputShapeFor(F.conv2d)(x, weight, bias, stride, padding, + >>> dilation, groups) + >>> print(y) + >>> y2 = OutputShapeFor(module)(x) + >>> assert y == y2 + + >>> weight = torch.rand(3, 2, 5, 5) + >>> OutputShapeFor(F.conv2d)((1, 3, 7, 7), weight) + """ + 
out_channels, in_channels, kernel_h, kernel_w = weight.shape + kernel = (kernel_h, kernel_w) + module = nn.Conv2d(in_channels * groups, out_channels, kernel, + bias=bias, stride=stride, padding=padding, + dilation=dilation, groups=groups) + return OutputShapeFor.convnd(module, inputs, 2) + + @staticmethod + @compute_type(nn.Conv3d) + def conv3d(module, input_shape): + return OutputShapeFor.convnd(module, input_shape, 3) + + @staticmethod + @compute_type(nn.MaxPool1d) + def maxpool1d(module, input_shape): + return OutputShapeFor.maxpoolnd(module, input_shape, 1) + + @staticmethod + @compute_type(nn.MaxPool2d) + def maxpool2d(module, input_shape): + return OutputShapeFor.maxpoolnd(module, input_shape, 2) + + @staticmethod + @compute_type(nn.MaxPool3d) + def maxpool3d(module, input_shape): + return OutputShapeFor.maxpoolnd(module, input_shape, 3) + + @staticmethod + @compute_type(nn.AvgPool1d) + def avepool1d(module, input_shape): + return OutputShapeFor.avepoolnd(module, input_shape, 1) + + @staticmethod + @compute_type(nn.AvgPool2d) + def avepool2d(module, input_shape): + return OutputShapeFor.avepoolnd(module, input_shape, 2) + + @staticmethod + @compute_type(nn.AvgPool3d) + def avepool3d(module, input_shape): + return OutputShapeFor.avepoolnd(module, input_shape, 3) + + @staticmethod + @compute_type(nn.modules.pooling._AdaptiveMaxPoolNd, nn.modules.pooling._AdaptiveAvgPoolNd) + def adaptive_poolnd(module, input_shape): + """ + Adaptive pooling is easy because the output-shape is known a-priori + """ + B, C = input_shape[0:2] + in_dims = input_shape[2:] + + n = len(in_dims) + output_dims = ensure_iterablen(module.output_size, n) + for i, d in enumerate(output_dims): + if d is None: + output_dims[i] = in_dims[i] + + output_shape = SHAPE_CLS([B, C] + list(output_dims)) + return output_shape + + @staticmethod + def convndT(module, input_shape, n): + r""" + - Input: :math:`(N, C_{in}, H_{in}, W_{in})` + - Output: :math:`(N, C_{out}, H_{out}, W_{out})` where + :math:`H_{out} = (H_{in} - 1) * stride[0] - 2 * padding[0] + kernel\_size[0] + output\_padding[0]` + :math:`W_{out} = (W_{in} - 1) * stride[1] - 2 * padding[1] + kernel\_size[1] + output\_padding[1]` + + Example: + >>> from netharn.analytic.output_shape_for import * + >>> input_shape = (1, 3, 256, 256) + >>> module = nn.ConvTranspose2d(input_shape[1], 11, kernel_size=2, stride=2) + >>> output_shape = OutputShapeFor(module)(input_shape) + >>> print('output_shape = {!r}'.format(output_shape)) + output_shape = (1, 11, 512, 512) + + Example: + >>> from netharn.analytic.output_shape_for import * + >>> input_shape = (1, 3, 25, 32, 32) + >>> module = nn.Conv3d(in_channels=input_shape[1], out_channels=11, + >>> kernel_size=(3, 3, 3), stride=1, padding=0, + >>> dilation=1, groups=1, bias=True) + >>> output_shape = OutputShapeFor(module)(input_shape) + >>> print('output_shape = {!r}'.format(output_shape)) + output_shape = (1, 11, 23, 30, 30) + """ + # N, C_in, *DIMS_in = input_shape + N, C_in = input_shape[0:2] + DIMS_in = input_shape[2:] + + if len(DIMS_in) != n: + raise ValueError('must have {} dims, but got {} '.format(n, len(DIMS_in))) + + C_out = module.out_channels + stride = module.stride + kernel_size = module.kernel_size + output_padding = module.output_padding + dilation = module.dilation + + padding = module.padding + DIMS_out = [ + # Fix the docs: https://github.com/pytorch/pytorch/issues/14099 + (D_in - 1) * stride[i] - 2 * padding[i] + (kernel_size[i] - 1) * dilation[i] + output_padding[i] + 1 + for i, D_in in enumerate(DIMS_in) + ] + 
output_shape = SHAPE_CLS([N, C_out] + DIMS_out)
+        # reference the class-level math so sympy-mode simplification
+        # works here too (convnd rebinds it locally; convndT did not)
+        if OutputShapeFor.math.__name__ == 'sympy':
+            output_shape = _simplify(output_shape)
+        return output_shape
+
+    @staticmethod
+    def convnd(module, input_shape, n):
+        r"""
+        Notes:
+            - Input: :math:`(N, C_{in}, H_{in}, W_{in})`
+            - Output: :math:`(N, C_{out}, H_{out}, W_{out})` where
+              :math:`H_{out} = floor((H_{in} + 2 * padding[0] - dilation[0] * (kernel\_size[0] - 1) - 1) / stride[0] + 1)`
+              :math:`W_{out} = floor((W_{in} + 2 * padding[1] - dilation[1] * (kernel\_size[1] - 1) - 1) / stride[1] + 1)`
+
+        Example:
+            >>> from netharn.analytic.output_shape_for import *
+            >>> input_shape = (1, 3, 256, 256)
+            >>> module = nn.Conv2d(input_shape[1], 11, 3, 1, 0)
+            >>> output_shape = OutputShapeFor(module)(input_shape)
+            >>> print('output_shape = {!r}'.format(output_shape))
+            output_shape = (1, 11, 254, 254)
+        """
+        math = OutputShapeFor.math
+        # N, C_in, *DIMS_in = input_shape
+        N, C_in = input_shape[0:2]
+        DIMS_in = input_shape[2:]
+
+        if len(DIMS_in) != n:
+            raise ValueError('must have {} dims, but got {}'.format(n, len(DIMS_in)))
+
+        C_out = module.out_channels
+        padding = module.padding
+        stride = module.stride
+        dilation = module.dilation
+        kernel_size = module.kernel_size
+
+        int = builtins.int if math.__name__ == 'math' else ub.identity
+        DIMS_out = [
+            int(math.floor(
+                (D_in + 2 * padding[i] - dilation[i] * (kernel_size[i] - 1) - 1) / stride[i] + 1
+            ))
+            for i, D_in in enumerate(DIMS_in)
+        ]
+        output_shape = SHAPE_CLS([N, C_out] + DIMS_out)
+        if math.__name__ == 'sympy':
+            output_shape = _simplify(output_shape)
+        return output_shape
+
+    @staticmethod
+    def maxpoolnd(module, input_shape, n):
+        r"""
+        CommandLine:
+            python -m xdoctest netharn.analytic.output_shape_for OutputShapeFor.maxpoolnd:0
+
+        Example:
+            >>> from netharn.analytic.output_shape_for import *
+            >>> input_shape = (1, 3, 256, 256)
+            >>> module = nn.MaxPool2d(kernel_size=2, stride=2)
+            >>> output_shape = tuple(OutputShapeFor(module)(input_shape))
+            >>> print('output_shape = {!r}'.format(output_shape))
+            output_shape = (1, 3, 128, 128)
+
+        Example:
+            >>> from netharn.analytic.output_shape_for import *
+            >>> input_shape = (1, 512, 37, 37)
+            >>> module = nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=True)
+            >>> output_shape = tuple(OutputShapeFor(module)(input_shape))
+            >>> print('output_shape = {!r}'.format(output_shape))
+            output_shape = (1, 512, 19, 19)
+
+        Shape:
+            2d Case:
+            Same as the conv2d formula except C2 = C1
+            - Input: :math:`(N, C, H_{in}, W_{in})`
+            - Output: :math:`(N, C, H_{out}, W_{out})` where
+              :math:`H_{out} = floor((H_{in} + 2 * padding[0] - dilation[0] * (kernel\_size[0] - 1) - 1) / stride[0] + 1)`
+              :math:`W_{out} = floor((W_{in} + 2 * padding[1] - dilation[1] * (kernel\_size[1] - 1) - 1) / stride[1] + 1)`
+        """
+        math = OutputShapeFor.math
+        # N, C, *DIMS_in = input_shape
+        N, C = input_shape[0:2]
+        DIMS_in = input_shape[2:]
+
+        padding = ensure_iterablen(module.padding, n)
+        stride = ensure_iterablen(module.stride, n)
+        dilation = ensure_iterablen(module.dilation, n)
+        kernel_size = ensure_iterablen(module.kernel_size, n)
+
+        trunc = math.ceil if module.ceil_mode else math.floor
+
+        int = builtins.int if math.__name__ == 'math' else ub.identity
+
+        DIMS_out = [
+            int(trunc((D_in + 2 * padding[i] - dilation[i] * (kernel_size[i] - 1) - 1) / stride[i] + 1))
+            for i, D_in in enumerate(DIMS_in)
+        ]
+        output_shape = SHAPE_CLS([N, C] + DIMS_out)
+        if math.__name__ == 'sympy':
+            output_shape = _simplify(output_shape)
+        return output_shape
+
+    
@staticmethod + def avepoolnd(module, input_shape, n): + r""" + 2D case: + Shape: + - Input: :math:`(N, C, H_{in}, W_{in})` + - Output: :math:`(N, C, H_{out}, W_{out})` where + :math:`H_{out} = floor((H_{in} + 2 * padding[0] - kernel\_size[0]) / stride[0] + 1)` + :math:`W_{out} = floor((W_{in} + 2 * padding[1] - kernel\_size[1]) / stride[1] + 1)` + """ + math = OutputShapeFor.math + # N, C, *DIMS_in = input_shape + N, C = input_shape[0:2] + DIMS_in = input_shape[2:] + + padding = ensure_iterablen(module.padding, n) + stride = ensure_iterablen(module.stride, n) + kernel_size = ensure_iterablen(module.kernel_size, n) + + int = builtins.int if math.__name__ == 'math' else ub.identity + + DIMS_out = [ + int(math.floor((D_in + 2 * padding[i] - kernel_size[i]) / stride[i] + 1)) + for i, D_in in enumerate(DIMS_in) + ] + output_shape = SHAPE_CLS([N, C] + DIMS_out) + if math.__name__ == 'sympy': + output_shape = _simplify(output_shape) + return output_shape + + @staticmethod + @compute_type(nn.Linear) + def linear(module, input_shape): + r""" + Shape: + - Input: :math:`(N, *, in\_features)` where `*` means any number of + additional dimensions + - Output: :math:`(N, *, out\_features)` where all but the last dimension + are the same shape as the input. + """ + # N, *other, in_feat = input_shape + N = input_shape[0] + other = input_shape[1:-1] + in_feat = input_shape[-1] # NOQA + + output_shape = [N] + list(other) + [module.out_features] + return SHAPE_CLS(output_shape) + + @staticmethod + def identity(input_shape): + return SHAPE_CLS(input_shape) + + @staticmethod + @compute_type(nn.functional.relu) + def relu_func(input_shape): + return SHAPE_CLS(input_shape) + + @staticmethod + @compute_type(nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, + nn.modules.normalization.GroupNorm, + nn.modules.normalization.LocalResponseNorm, + nn.modules.normalization.LayerNorm, nn.CrossMapLRN2d, + nn.InstanceNorm1d, nn.InstanceNorm2d, nn.InstanceNorm3d) + def normalization(module, input_shape): + """ + import redbaron + import torch + source = open(torch.nn.modules.instancenorm.__file__, 'r').read() + baron = redbaron.RedBaron(source) + classes = [item.name for item in baron if item.type == 'class'] + print(', '.join(['nn.{}'.format(c) for c in classes])) + + source = open(torch.nn.modules.normalization.__file__, 'r').read() + baron = redbaron.RedBaron(source) + classes = [item.name for item in baron if item.type == 'class'] + print(', '.join(['nn.{}'.format(c) for c in classes])) + """ + return OutputShapeFor.identity(input_shape) + + @staticmethod + @compute_type(nn.Dropout, nn.Dropout2d, nn.Dropout3d, nn.AlphaDropout, + nn.FeatureAlphaDropout) + def dropout(module, input_shape): + return OutputShapeFor.identity(input_shape) + + @staticmethod + @compute_type(nn.Threshold, nn.RReLU, nn.Hardtanh, nn.ReLU6, nn.ReLU, + nn.Sigmoid, nn.Tanh, nn.ELU, nn.CELU, nn.SELU, nn.GLU, + nn.Hardshrink, nn.LeakyReLU, nn.LogSigmoid, nn.Softplus, + nn.Softshrink, nn.PReLU, nn.Softsign, nn.Tanhshrink, + nn.Softmin, nn.Softmax, nn.Softmax2d, nn.LogSoftmax) + def nonlinearity(module, input_shape): + r""" + Ignore: + import redbaron + import torch + source = open(torch.nn.modules.activation.__file__, 'r').read() + baron = redbaron.RedBaron(source) + classes = [item.name for item in baron if item.type == 'class'] + print(', '.join(['nn.{}'.format(c) for c in classes])) + """ + return OutputShapeFor.identity(input_shape) + + @staticmethod + @compute_type(nn.Sequential) + def sequential(module, input_shape): + """ + CommandLine: + xdoctest 
-m netharn.analytic.output_shape_for OutputShapeFor.sequential + + Example: + >>> from netharn.analytic.output_shape_for import * + >>> self = nn.Sequential( + >>> nn.Conv2d(2, 3, kernel_size=3), + >>> nn.Conv2d(3, 5, kernel_size=3), + >>> nn.Conv2d(5, 7, kernel_size=3), + >>> ) + >>> shape = OutputShapeFor(self)([1, 1, 7, 11]) + >>> print('shape = {}'.format(ub.repr2(shape, nl=0))) + >>> print('shape.hidden = {}'.format(ub.repr2(shape.hidden, nl=1))) + shape = (1, 7, 1, 5) + shape.hidden = { + '0': (1, 3, 5, 9), + '1': (1, 5, 3, 7), + '2': (1, 7, 1, 5), + } + """ + hidden = HiddenShapes() + shape = input_shape + for key, child in module._modules.items(): + hidden[key] = shape = OutputShapeFor(child)(shape) + shape = OutputShape.coerce(shape, hidden=hidden) + return shape + + @staticmethod + @compute_type(torchvision.models.resnet.BasicBlock) + def resent_basic_block(module, input_shape): + residual_shape = input_shape + shape = input_shape + + hidden = HiddenShapes() + hidden['conv1'] = shape = OutputShapeFor(module.conv1)(shape) + hidden['bn1'] = shape = OutputShapeFor(module.bn1)(shape) + hidden['relu1'] = shape = OutputShapeFor(module.relu)(shape) + + hidden['conv2'] = shape = OutputShapeFor(module.conv2)(shape) + hidden['bn2'] = shape = OutputShapeFor(module.bn2)(shape) + hidden['relu2'] = shape = OutputShapeFor(module.relu)(shape) + + if module.downsample is not None: + residual_shape = OutputShapeFor(module.downsample)(residual_shape) + hidden['residual'] = residual_shape + + hidden['join'] = shape + assert residual_shape[-2:] == shape[-2:], ( + 'cannot add residual {} {}'.format(residual_shape, shape)) + shape = OutputShapeFor(module.relu)(shape) + hidden['relu3'] = shape + shape = OutputShape.coerce(shape, hidden=hidden) + return shape + + @staticmethod + @compute_type(torchvision.models.resnet.Bottleneck) + def resent_bottleneck(module, input_shape): + residual_shape = input_shape + shape = input_shape + + hidden = HiddenShapes() + hidden['conv1'] = shape = OutputShapeFor(module.conv1)(shape) + hidden['bn1'] = shape = OutputShapeFor(module.bn1)(shape) + hidden['relu1'] = shape = OutputShapeFor(module.relu)(shape) + + hidden['conv2'] = shape = OutputShapeFor(module.conv2)(shape) + hidden['bn2'] = shape = OutputShapeFor(module.bn2)(shape) + hidden['relu2'] = shape = OutputShapeFor(module.relu)(shape) + + hidden['conv3'] = shape = OutputShapeFor(module.conv3)(shape) + hidden['bn3'] = shape = OutputShapeFor(module.bn3)(shape) + + if module.downsample is not None: + residual_shape = OutputShapeFor(module.downsample)(input_shape) + hidden['residual'] = residual_shape + + assert residual_shape[-2:] == shape[-2:], ( + 'cannot add residual {} {}'.format(residual_shape, shape)) + hidden['join'] = shape + + shape = OutputShapeFor(module.relu)(shape) + hidden['relu3'] = shape + + shape = OutputShape.coerce(shape, hidden=hidden) + return shape + + @staticmethod + @compute_type(torchvision.models.resnet.ResNet) + def resnet_model(module, input_shape): + """ + Example: + >>> # xdoctest: +REQUIRES(--network) + >>> from netharn.analytic.output_shape_for import * + >>> module = torchvision.models.resnet50() + >>> input_shape = (1, 3, 224, 224) + >>> field = OutputShapeFor(module)(input_shape=input_shape) + """ + shape = input_shape + + hidden = HiddenShapes() + hidden['conv1'] = shape = OutputShapeFor(module.conv1)(shape) + hidden['bn1'] = shape = OutputShapeFor(module.bn1)(shape) + hidden['relu1'] = shape = OutputShapeFor(module.relu)(shape) + hidden['maxpool'] = shape = 
OutputShapeFor(module.maxpool)(shape) + + hidden['layer1'] = shape = OutputShapeFor(module.layer1)(shape) + hidden['layer2'] = shape = OutputShapeFor(module.layer2)(shape) + hidden['layer3'] = shape = OutputShapeFor(module.layer3)(shape) + hidden['layer4'] = shape = OutputShapeFor(module.layer4)(shape) + + hidden['avgpool'] = shape = OutputShapeFor(module.avgpool)(shape) + + def prod(args): + result = args[0] + for arg in args[1:]: + result = result * arg + return result + shape = (shape[0], prod(shape[1:])) + hidden['view'] = shape + + hidden['fc'] = shape = OutputShapeFor(module.fc)(shape) + shape = OutputShape.coerce(shape, hidden=hidden) + return shape + + @staticmethod + @compute_type(nn.functional.adaptive_avg_pool2d) + def adaptive_poolnd_func(input_shape, output_shape): + """ + Adaptive pooling is easy because the output-shape is known a-priori + + Example: + >>> from netharn.analytic.output_shape_for import * + >>> input_shape = (1, 3, 256, 256) + >>> output_shape = (7, 7) + >>> output_shape_ = OutputShapeFor(nn.functional.adaptive_avg_pool2d)(input_shape, output_shape) + >>> print('output_shape = {!r}'.format(output_shape_)) + output_shape = (1, 3, 7, 7) + """ + B, C = input_shape[0:2] + in_dims = input_shape[2:] + + n = len(in_dims) + output_dims = ensure_iterablen(output_shape, n) + for i, d in enumerate(output_dims): + if d is None: + output_dims[i] = in_dims[i] + + output_shape_ = SHAPE_CLS([B, C] + list(output_dims)) + return output_shape_ + + @staticmethod + @compute_type(torch.sigmoid) + def sigmoid(input_shape): + return OutputShapeFor.identity(input_shape) + + @staticmethod + @compute_type(F.pad) + def pad(x, pad, mode='constant', value=0): + """ + Example: + >>> t4d = x = (3, 3, 4, 2) + >>> pad = p1d = (1, 1) + >>> out = OutputShapeFor(F.pad)(x, pad) + >>> print(out) + (3, 3, 4, 4) + >>> p2d = (1, 1, 2, 2) # pad last dim by (1, 1) and 2nd to last by (2, 2) + >>> out = OutputShapeFor.pad(t4d, p2d, "constant", 0) + >>> print(out) + (3, 3, 8, 4) + >>> t4d = (3, 3, 4, 2) + >>> p3d = (0, 1, 2, 1, 3, 3) # pad by (0, 1), (2, 1), and (3, 3) + >>> out = OutputShapeFor.pad(t4d, p3d, "constant", 0) + >>> print(out) + (3, 9, 7, 3) + """ + new_x = list(x) + dim = len(new_x) + for idx, dpad in enumerate(ub.chunks(pad, 2), start=1): + dimx = dim - idx + lpad, rpad = dpad + new_x[dimx] = x[dimx] + lpad + rpad + out = SHAPE_CLS(new_x) + return out + + @staticmethod + @compute_type(torch.cat) + def cat(input_shapes, dim=0): + """ + Example: + >>> from netharn.analytic.output_shape_for import * + >>> input_shape1 = (1, 3, 256, 256) + >>> input_shape2 = (1, 4, 256, 256) + >>> input_shapes = [input_shape1, input_shape2] + >>> output_shape = OutputShapeFor(torch.cat)(input_shapes, dim=1) + >>> print('output_shape = {!r}'.format(output_shape)) + output_shape = (1, 7, 256, 256) + """ + n_dims = max(map(len, input_shapes)) + assert n_dims == min(map(len, input_shapes)) + output_shape = [None] * n_dims + for shape in input_shapes: + for i, v in enumerate(shape): + if output_shape[i] is None: + output_shape[i] = v + else: + if i == dim: + output_shape[i] += v + else: + assert output_shape[i] == v, 'inconsistent dims {}'.format(input_shapes) + return SHAPE_CLS(output_shape) + + @staticmethod + @compute_type(DataSerial) + def data_serial(module, *args, **kw): + return OutputShapeFor(module.module)(*args, **kw) + + @staticmethod + @compute_type(torch.nn.DataParallel) + def data_parallel(module, *args, **kw): + return OutputShapeFor(module.module)(*args, **kw) + + @staticmethod + def getitem(arr): + 
""" + Wraps getitem calls + + Example: + >>> arr = (2, 32, 9, 9) + >>> result = OutputShapeFor.getitem(arr)[:, 0:4] + >>> assert result == [2, 4, 9, 9] + """ + return _ShapeGetItem(arr) + + @staticmethod + def view(arr, *args): + """ + Wraps view calls + + Example: + >>> arr = (2, 32, 9, 9) + >>> result = OutputShapeFor.view(arr, -1) + >>> assert result == (5184,) + """ + from netharn import layers + reshape = layers.Reshape(*args) + return reshape.output_shape_for(arr) + + @staticmethod + def shape(arr): + """ + Wraps shape calls + + Example: + >>> arr = (2, 32, 9, 9) + >>> result = OutputShapeFor.shape(arr) + >>> assert result == arr + """ + return arr + + @staticmethod + def add(arr1, arr2): + return _output_shape_broadcast(arr1, arr2) + + @staticmethod + def mul(arr1, arr2): + return _output_shape_broadcast(arr1, arr2) + + @staticmethod + def sub(arr1, arr2): + return _output_shape_broadcast(arr1, arr2) + + @staticmethod + def div(arr1, arr2): + return _output_shape_broadcast(arr1, arr2) + + +def _output_shape_broadcast(arr1, arr2): + """ + Args: + arr1 (Tuple | scalar): shape of arr1 or a scalar + arr2 (Tuple | scalar): shape of arr2 or a scalar + """ + if not ub.iterable(arr1): + return arr2 + if not ub.iterable(arr2): + return arr1 + if tuple(arr1) != tuple(arr2): + + if len(arr1) == len(arr2): + arr3 = [] + for d1, d2 in zip(arr1, arr2): + if d1 is None or d1 < 0: + raise NotImplementedError + if d2 is None or d2 < 0: + raise NotImplementedError + if d1 == d2: + arr3.append(d1) + elif d1 == 1: + arr3.append(d2) + elif d2 == 1: + arr3.append(d1) + else: + raise ValueError('broadcast seems bad') + arr3 = type(arr1)(arr3) + return arr3 + + # TODO: handle broadcast + raise NotImplementedError('Full broadcast not implemented {} != {}'.format(arr1, arr2)) + return arr1 + + +class _ShapeGetItem(object): + def __init__(self, inp): + self.inp = inp + + def __getitem__(self, slices): + ellipsis_type = type(Ellipsis) + oup = list(self.inp) + if isinstance(slices, slice): + slices = (slices,) + + if isinstance(slices, tuple): + for i, sl in enumerate(slices): + if isinstance(sl, ellipsis_type): + assert i == len(slices) - 1 + break + start, stop, step = sl.indices(oup[i]) + oup[i] = (stop - start) // step + return oup + + +def ensure_iterablen(scalar, n): + try: + iter(scalar) + except TypeError: + return [scalar] * n + return scalar diff --git a/netharn/analytic/receptive_field_for.py b/netharn/analytic/receptive_field_for.py new file mode 100644 index 0000000000000000000000000000000000000000..471b95c315811597613c75d816a65752fe108536 --- /dev/null +++ b/netharn/analytic/receptive_field_for.py @@ -0,0 +1,1128 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import, division, print_function, unicode_literals +import torch +import copy +import six # NOQA +import torch.nn as nn +import torchvision +import ubelt as ub +import numpy as np +from collections import OrderedDict +from netharn.analytic.output_shape_for import OutputShapeFor +from netharn.analytic import analytic_for +# try: +# from netharn.device import MountedModel +# except ImportError: +# MountedModel = None + +REGISTERED_TYPES = [] + + +def ensure_array_nd(data, n): + if ub.iterable(data): + return np.array(data) + else: + return np.array([data] * n) + + +def compute_type(*types): + def _wrap(func): + for type in types: + if type is not None: + REGISTERED_TYPES.append((type, func)) + return func + return _wrap + + +class ReceptiveFieldTypeError(TypeError): + pass + + +class ReceptiveField(OrderedDict, 
analytic_for.Output): + """ + container for holding a receptive feild + + Example: + >>> self = ReceptiveField.coerce({ + >>> 'stride': np.array([4]), + >>> 'shape': np.array([1]), + >>> 'crop': np.array([0]), + >>> }) + >>> self_copy = copy.deepcopy(self) + """ + def __init__(self, data, hidden=None): + # Inheriting from an odict consistently between python 2/3 is weird + data2 = OrderedDict(sorted(OrderedDict(data).items())) + OrderedDict.__init__(self, data2) + self.data = data2 + self.hidden = hidden + + def __copy__(self): + self_copy = ReceptiveField(self.data, self.hidden) + return self_copy + + def __deepcopy__(self, memo): + data_copy = copy.deepcopy(self.data, memo) + hidden_copy = copy.deepcopy(self.hidden, memo) + self_copy = ReceptiveField(data_copy, hidden_copy) + return self_copy + + @classmethod + def coerce(cls, data, hidden=None): + """ + Example: + >>> # test weird python2 failure case + >>> from netharn.analytic.receptive_field_for import * + >>> cls = ReceptiveField + >>> data = [(0, ReceptiveFieldFor.input())] + >>> self = cls.coerce(data) + >>> print(ub.repr2(self, with_dtype=False)) + { + 0: { + 'crop': np.array([0., 0.]), + 'shape': np.array([1., 1.]), + 'stride': np.array([1., 1.]), + }, + } + """ + # TODO: make this work like OutputShape + if data is None: + self = ReceptiveFieldFor.input() + self.hidden = hidden + elif isinstance(data, cls): + if hidden is None: + self = data + else: + self = data.__class__(data, hidden) + else: + self = cls(data, hidden) + return self + + # def __getitem__(self, key): + # return self.data[key] + + +class HiddenFields(analytic_for.Hidden): + """ + Augments normal hidden fields dicts with a convinience setitem + """ + pass + + +class _TorchMixin(object): + """ + Receptive field formulas for PyTorch primatives + """ + + @staticmethod + def input(input_field=None, n=2): + """ + Basic input receptive field is just a single pixel. + """ + if input_field is not None: + raise ValueError('nothing can precede the input') + input_field = ReceptiveField.coerce({ + # The input receptive field stride / scale factor is 1. + 'stride': ensure_array_nd(1.0, n), + # The input receptive field shape is 1 pixel. + 'shape': ensure_array_nd(1.0, n), + # Use the coordinate system where the top left corner is 0, 0 ( This is unlike [1], which uses 0.5) + 'crop': ensure_array_nd(0.0, n), + }) + return input_field + + @staticmethod + def _kernelized(module, input_field=None, ndim=None): + """ + Receptive field formula for general sliding kernel based layers + This works for both convolutional and pooling layers. + + Notes: + Baseline formulas are from [1]. Information about how to include + dilation (atrous) convolutions can be found in [2, 3]. Better info + seems to be available in [4]. + + * tensorflow has similar functionality + https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/receptive_field/python/util/receptive_field.py + + * To preserve spatial extent, padding should equal `(k - 1) * d / 2`. 
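+
+            A quick numeric check of that padding rule (a sketch; the
+            particular kernel size and dilation below are arbitrary):
+
+            >>> k, d = 5, 3
+            >>> conv = nn.Conv2d(1, 1, kernel_size=k, dilation=d,
+            >>>                  padding=(k - 1) * d // 2)
+            >>> conv(torch.zeros(1, 1, 32, 32)).shape[-2:]
+            torch.Size([32, 32])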
+
+        References:
+            [1] https://medium.com/mlreview/a-guide-to-receptive-field-arithmetic-for-convolutional-neural-networks-e0f514068807
+            [2] http://www.erogol.com/dilated-convolution/
+            [3] https://stackoverflow.com/questions/35582521/how-to-calculate-receptive-field-shape
+            [4] https://arxiv.org/pdf/1603.07285.pdf
+
+        Example:
+            >>> module = nn.Conv2d(1, 1, kernel_size=5, stride=2, padding=2, dilation=3)
+            >>> field = ReceptiveFieldFor._kernelized(module)
+            >>> print(ub.repr2(field, nl=0, with_dtype=False))
+            {'crop': np.array([4., 4.]), 'shape': np.array([13., 13.]), 'stride': np.array([2., 2.])}
+
+            >>> module = nn.MaxPool2d(kernel_size=3, stride=2, padding=2, dilation=2)
+            >>> field = ReceptiveFieldFor._kernelized(module)
+            >>> print(ub.repr2(field, nl=0, with_dtype=False))
+            {'crop': np.array([0., 0.]), 'shape': np.array([5., 5.]), 'stride': np.array([2., 2.])}
+
+            >>> module = nn.MaxPool2d(kernel_size=3, stride=2, padding=2, dilation=1)
+            >>> field = ReceptiveFieldFor._kernelized(module)
+            >>> print(ub.repr2(field, nl=0, with_dtype=False))
+            {'crop': np.array([-1., -1.]), 'shape': np.array([3., 3.]), 'stride': np.array([2., 2.])}
+
+            >>> module = nn.AvgPool2d(kernel_size=3, stride=2, padding=2)
+            >>> field = ReceptiveFieldFor._kernelized(module)
+            >>> print(ub.repr2(field, nl=0, with_dtype=False))
+            {'crop': np.array([-1., -1.]), 'shape': np.array([3., 3.]), 'stride': np.array([2., 2.])}
+        """
+        # impl = ReceptiveFieldFor.impl
+        if input_field is None:
+            input_field = ReceptiveFieldFor.input()
+
+        # Hack to get the number of space-time dimensions
+        if ndim is None:
+            try:
+                if module.__class__.__name__.endswith('1d'):
+                    ndim = 1
+                elif module.__class__.__name__.endswith('2d'):
+                    ndim = 2
+                elif module.__class__.__name__.endswith('3d'):
+                    ndim = 3
+            except AttributeError:
+                if module.__name__.endswith('1d'):
+                    ndim = 1
+                elif module.__name__.endswith('2d'):
+                    ndim = 2
+                elif module.__name__.endswith('3d'):
+                    ndim = 3
+        if ndim is None:
+            raise ValueError('Cannot infer ndim from {}'.format(module))
+
+        k = ensure_array_nd(module.kernel_size, ndim)
+        s = ensure_array_nd(module.stride, ndim)
+        p = ensure_array_nd(module.padding, ndim)
+        d = ensure_array_nd(getattr(module, 'dilation', 1), ndim)
+
+        # To calculate the receptive field we first need to find the SUPPORT
+        # of this layer. The support is the number/extent of extra surrounding
+        # pixels adding this layer will take into account. Given this, we can
+        # compute the receptive field wrt the original input by combining this
+        # information with the previous receptive field.
+        #
+        # In the normal case (with no dilation, d=1) the support is (k - 1).
+        # This is because the operation is able to see a window of shape k in
+        # the input, and produces a single output pixel (hence the k). The
+        # center input pixel corresponds with the output, so it does not
+        # expand the receptive field (hence the -1), but all other input
+        # pixels do expand the field (thus the k - 1).
+        #
+        # The stride of this layer will not affect the support.
+        #
+        # The dilation of the current layer DOES impact the support.
+        # It expands the effective kernel shape, although it causes the data
+        # each operation sees to become more diffuse. However, even though
+        # what it sees in that extent is more diffuse, the RF is just a bound,
+        # so we can ignore the diffuseness effect and simply scale the input
+        # kernel shape by the dilation amount. Hence we get:
+        support = (k - 1) * d
+
+        """
+        Note the above is correct because:
+
+        import sympy as sym
+        k, d = sym.symbols('k, d')
+
+        # Compute the support from the formula in 5.1 of [4]:
+        # To understand the relationship tying the dilation rate d and the
+        # output shape o, it is useful to think of the impact of d on the
+        # effective kernel shape. A kernel of shape k dilated by a factor d
+        # has an effective shape.
+        effective_kernel_size = k + (k - 1) * (d - 1)
+        support_v1 = sym.expand(effective_kernel_size - 1)
+
+        # Compute the support from our method
+        support_v2 = sym.expand((k - 1) * d)
+
+        # They are equivalent. QED
+        assert sym.Eq(support_v1, support_v2)
+        """
+
+        # Compute how many pixels this layer takes off the side. Note that an
+        # even-shaped kernel results in half pixel crops. This is expected and
+        # correct. To use the crop in practice take the floor / ceil of the
+        # final result, but in this intermediate stage, subpixel crops are
+        # perfectly valid.
+        crop = ((support / 2.0) - p)
+
+        field = ReceptiveField.coerce({
+            # The new stride only depends on the layer stride and the previous
+            # stride.
+            'stride': input_field['stride'] * s,
+
+            # The stride of the current layer does not impact the receptive
+            # field, however the stride of the previous layer does. This is
+            # because each pixel in the incoming layer really corresponds to
+            # `input_field['stride']` pixels in the original input.
+            'shape': input_field['shape'] + support * input_field['stride'],
+
+            # Padding does not influence the RF shape, but it does influence
+            # where the start pixel is (i.e. without the right amount of
+            # padding, the edge of the previous layer is cropped).
+            'crop': input_field['crop'] + crop * input_field['stride'],
+        })
+        return field
+
+    @staticmethod
+    def _unchanged(module, input_field=None):
+        """ Formula for layers that do not change the receptive field """
+        if input_field is None:
+            input_field = ReceptiveFieldFor.input()
+        return input_field
+
+    @staticmethod
+    @compute_type(nn.Linear)
+    def linear(module, input_field=None):
+        # Linear layers (sort-of) don't change the RF
+        return ReceptiveFieldFor._unchanged(module, input_field)
+        # Perhaps we could do this if we knew the input shape
+        # raise NotImplementedError(
+        #     'Cannot compute receptive field shape on a Linear layer')
+
+    @staticmethod
+    def _kernelized_tranpose(module, input_field=None):
+        """
+        Receptive field formula for transposed convolution layers
+
+        Example:
+            >>> from netharn.analytic.receptive_field_for import *
+            >>> from netharn.analytic.output_shape_for import *
+            >>> module = nn.ConvTranspose2d(1, 1, kernel_size=3, stride=2, padding=2)
+            >>> ReceptiveFieldFor(module)()
+
+            >>> # This network should effectively invert itself
+            >>> module = nn.Sequential(ub.odict([
+            >>>     #('a', nn.Conv2d(1, 1, kernel_size=3, stride=1, padding=1)),
+            >>>     ('c1', nn.Conv2d(1, 1, kernel_size=3, stride=2)),
+            >>>     ('c2', nn.Conv2d(1, 1, kernel_size=3, stride=2)),
+            >>>     ('c3', nn.Conv2d(1, 1, kernel_size=3, stride=2)),
+            >>>     ('c3T', nn.ConvTranspose2d(1, 1, kernel_size=3, stride=2)),
+            >>>     ('c2T', nn.ConvTranspose2d(1, 1, kernel_size=3, stride=2)),
+            >>>     ('c1T', nn.ConvTranspose2d(1, 1, kernel_size=3, stride=2)),
+            >>> ]))
+            >>> print(ub.repr2(ReceptiveFieldFor(module)()))
+            >>> ReceptiveFieldFor(module)()
+            >>> OutputShapeFor(module)._check_consistency([1, 1, 32, 32])
+
+            >>> module = nn.Sequential(ub.odict([
+            >>>     #('a', nn.Conv2d(1, 1, kernel_size=3, stride=1, padding=1)),
+            >>>     ('c1', nn.Conv2d(1, 1, kernel_size=3, stride=2, dilation=2)),
+            >>>
('c2', nn.Conv2d(1, 1, kernel_size=3, stride=2, dilation=2)), + >>> ('c3', nn.Conv2d(1, 1, kernel_size=3, stride=2, dilation=2)), + >>> ('c3T', nn.ConvTranspose2d(1, 1, kernel_size=3, stride=2, dilation=2)), + >>> ('c2T', nn.ConvTranspose2d(1, 1, kernel_size=3, stride=2, dilation=2)), + >>> ('c1T', nn.ConvTranspose2d(1, 1, kernel_size=3, stride=2, dilation=2)), + >>> ])) + >>> print(ub.repr2(ReceptiveFieldFor(module)())) + + >>> # This network is pathological + >>> module = nn.Sequential(ub.odict([ + >>> #('a', nn.Conv2d(1, 1, kernel_size=3, stride=1, padding=1)), + >>> ('c1', nn.Conv2d(1, 1, kernel_size=3, stride=7, dilation=2)), + >>> ('c2', nn.Conv2d(1, 1, kernel_size=5, stride=6, padding=1)), + >>> ('c3', nn.Conv2d(1, 1, kernel_size=7, stride=5)), + >>> ('c3T', nn.ConvTranspose2d(1, 1, kernel_size=7, stride=6)), + >>> ('c2T', nn.ConvTranspose2d(1, 1, kernel_size=5, stride=7, padding=1)), + >>> ('c1T', nn.ConvTranspose2d(1, 1, kernel_size=3, stride=8, dilation=2)), + >>> ])) + >>> print(ub.repr2(ReceptiveFieldFor(module)())) + >>> ReceptiveFieldFor(module)() + >>> OutputShapeFor(module)([1, 1, 900, 900]) + >>> OutputShapeFor(module)([1, 1, 900, 900]).hidden + >>> OutputShapeFor(module)._check_consistency([1, 1, 900, 900]) + + >>> module = nn.Sequential( + >>> nn.Conv2d(1, 1, kernel_size=3, stride=2), + >>> nn.Conv2d(1, 1, kernel_size=3, stride=2), + >>> nn.Conv2d(1, 1, kernel_size=3, stride=2), + >>> nn.ConvTranspose2d(1, 1, kernel_size=3, stride=2), + >>> nn.ConvTranspose2d(1, 1, kernel_size=3, stride=2), + >>> nn.ConvTranspose2d(1, 1, kernel_size=3, stride=2), + >>> ) + >>> ReceptiveFieldFor(module)() + + >>> module = nn.Conv2d(1, 1, kernel_size=3, stride=2, padding=1) + >>> ReceptiveFieldFor(module)() + + >>> OutputShapeFor(nn.ConvTranspose2d(1, 1, kernel_size=3, stride=2, padding=0, output_padding=(1, 1)))._check_consistency([1, 1, 1, 1]) + + >>> # Figure 4.4 + >>> OutputShapeFor(nn.Conv2d(1, 1, kernel_size=3, stride=1, padding=2))([1, 1, 5, 5]) + >>> OutputShapeFor(nn.ConvTranspose2d(1, 1, kernel_size=3, stride=1, padding=2))._check_consistency([1, 1, 5, 5]) + >>> OutputShapeFor(nn.Conv2d(1, 1, kernel_size=3, stride=1, padding=0))([1, 1, 7, 7]) + + >>> # Figure 4.5 + >>> OutputShapeFor(nn.ConvTranspose2d(1, 1, kernel_size=3, stride=2, padding=0))._check_consistency([1, 1, 5, 5]) + >>> OutputShapeFor(nn.Conv2d(1, 1, kernel_size=3, stride=1, padding=0))([1, 1, 7, 7]) + + >>> ReceptiveFieldFor(module)() + """ + # impl = ReceptiveFieldFor.impl + if input_field is None: + input_field = ReceptiveFieldFor.input() + + # Hack to get the number of space-time dimensions + ndim = None + try: + if module.__name__.endswith('1d'): + ndim = 1 + elif module.__name__.endswith('2d'): + ndim = 2 + elif module.__name__.endswith('3d'): + ndim = 3 + except AttributeError: + pass + + if ndim is None: + if hasattr(module, '_dim'): + ndim = module._dim + + # A non-trivial transpose convolution should: + # * decrease the stride (because the stride is fractional) + # the padding has to be equal to the shape of the kernel minus one + """ + From [4]: + + A convolution described by k, s and p has an associated transposed convolution described by: + * k' = k, + * s' = 1, + * p' = k - p - 1, + * i' = the shape of the stretched input obtained by adding s − 1 zeros + between each input unit, + * a = (i + 2p − k) % s, represents the number of zeros added to the + bottom and right edges of the input, + + And has output shape: + o' = s(i' - 1) + a + k - 2p + + For convT it is always the case that s'=1, howver, note 
that s' is not
+        what we use to compute the new stride of the output, because that is
+        actually a fractional stride.
+        """
+
+        # Definitions:
+        # In the following comments we discuss 3 distinct layers
+        # (1) The original convolution (conv)
+        # (2) The transpose convolution that inverts the original (convT)
+        # (3) The regular convolution that is equivalent to the transpose
+        #     convolution given a specially transformed input tensor (convE)
+
+        # The parameters of a convT are actually the parameters of conv, the
+        # convolution we are trying to "undo", but we will refer to them as
+        # parameters of convT (because they are that as well).
+        k_ = ensure_array_nd(module.kernel_size, ndim)
+        s_ = ensure_array_nd(module.stride, ndim)
+        p_ = ensure_array_nd(module.padding, ndim)
+        d_ = ensure_array_nd(getattr(module, 'dilation', 1), ndim)
+
+        # TODO: incorporate output padding and right-side padding / cropping
+        # Note: output padding does not impact the receptive field, however it
+        # does cause some "right-side" cropping, which we are not computing
+        # here yet.
+
+        out_pad = ensure_array_nd(module.output_padding, ndim)  # NOQA
+        # if not np.all(out_pad == 0):
+        #     raise NotImplementedError('cannot handle nonzero output_padding yet')
+
+        # However, there is an equivalent way of formulating a convT as convE:
+        # a regular conv applied on a specially padded input tensor.
+        # The parameters that define convE are:
+        k = k_
+        d = d_
+        s = 1  # stride is always 1 because of the special input transform
+        # p = k_ - p_ - 1  # NOTE: original formula likely assumed dilation=1
+        p = (k_ - 1) * d_ - p_
+
+        # In order for convE to be equivalent to convT, we need to apply convE
+        # to a specially transformed (padded) input tensor.
+        # The padding applied to the input tensor puts extra zeros between each
+        # row/col. The number of extra zeros is the stride of the convT - 1.
+        # The left and right sides of the input tensor are also padded but that
+        # won't factor into the RF calculation.
+        extra_zeros = s_ - 1
+        # This means that the effective support added to the RF shape by convE
+        # will be less than it normally would because we don't count the extra
+        # zeros in our transformed input as real pixels.
+        effective_support = (k - 1 - extra_zeros) * d
+        # NOTE: if the stride is larger than the kernel, some output pixels
+        # will actually just be zeros and have no receptive field.
+        effective_support = np.maximum(0, effective_support)
+
+        # This special input transform also has the effect of decreasing the
+        # RF stride. Transposed convolutions are sometimes called
+        # fractional-stride convolutions. This is because they have an
+        # effective stride of 1 / s_
+        effective_stride = 1 / s_
+
+        # We calculate the support of convE as if it were applied to a normal
+        # input tensor in order to calculate how the start (top-left) pixel
+        # position is modified.
+        support = (k - 1) * d
+
+        # After transformation the effective stride of the input is
+        effective_input_stride = input_field['stride'] * effective_stride
+
+        # how many pixels does this layer crop off the sides of the input
+        crop = ((support / 2) - p)
+
+        # print('effective_support = {!r}'.format(effective_support))
+
+        field = ReceptiveField.coerce({
+            # The new stride only depends on the layer stride and the previous
+            # stride.
+            'stride': effective_input_stride * s,
+
+            # The stride of the current layer does not impact the receptive
+            # field, however the stride of the previous layer does.
This is + # because each pixel in the incoming layer really corresponds + # `input_field['stride']` pixels in the original input. + 'shape': input_field['shape'] + effective_support * input_field['stride'], + + # Padding does not influence the RF shape, but it does influence + # where the start pixel is (i.e. without the right amount of + # padding the the edge of the previous layer is cropped). + 'crop': input_field['crop'] + crop * effective_input_stride, + }) + + return field + # raise NotImplementedError('todo') + + @compute_type(nn.modules.conv._ConvTransposeMixin) + def convT(module, input_field=None): + return ReceptiveFieldFor._kernelized_tranpose(module, input_field) + + @compute_type(nn.modules.conv.Conv1d, nn.modules.conv.Conv2d, nn.modules.conv.Conv3d) + def convnd(module, input_field=None): + return ReceptiveFieldFor._kernelized(module, input_field) + + @staticmethod + @compute_type(nn.modules.pooling._MaxPoolNd) + def maxpoolnd(module, input_field=None): + return ReceptiveFieldFor._kernelized(module, input_field) + + @staticmethod + @compute_type(nn.modules.pooling._AvgPoolNd) + def avepoolnd(module, input_field=None): + return ReceptiveFieldFor._kernelized(module, input_field) + + @staticmethod + @compute_type(nn.modules.pooling._AdaptiveMaxPoolNd, nn.modules.pooling._AdaptiveAvgPoolNd) + def adaptive_avepoolnd(module, input_field=None): + """ + it is not possible to analytically compute an adaptive receptive field. + + References: + https://forums.fast.ai/t/ideas-behind-adaptive-max-pooling/12634/3 + https://arxiv.org/abs/1406.4729 + """ + raise Exception('not possible to compute adaptive RF without knowning the input_shape ahead of time') + # return ReceptiveFieldFor._kernelized(module, input_field) + + @staticmethod + @compute_type(nn.ReLU) + def relu(module, input_field=None): + return ReceptiveFieldFor._unchanged(module, input_field) + + @staticmethod + @compute_type(nn.ReLU6, nn.PReLU, nn.LeakyReLU, nn.ELU, nn.CELU, nn.SELU) + def _unchanged_activation(module, input_field=None): + return ReceptiveFieldFor._unchanged(module, input_field) + + @staticmethod + @compute_type(nn.functional.relu, nn.functional.relu6) + def _unchanged_activation_func(input_field=None): + # return ReceptiveFieldFor._unchanged(module, input_field) + return ReceptiveFieldFor._unchanged(None, input_field) + + @staticmethod + @compute_type(nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, + nn.modules.normalization.GroupNorm, + nn.modules.normalization.LocalResponseNorm, + nn.modules.normalization.LayerNorm, nn.CrossMapLRN2d, + nn.InstanceNorm1d, nn.InstanceNorm2d, nn.InstanceNorm3d) + def normalization(module, input_field=None): + return ReceptiveFieldFor._unchanged(module, input_field) + + @staticmethod + @compute_type(nn.modules.dropout._DropoutNd) + def dropout(module, input_field=None): + return ReceptiveFieldFor._unchanged(module, input_field) + + @staticmethod + @compute_type(nn.Sequential) + def sequential(module, input_field=None): + """ + Example: + >>> import netharn as nh + >>> self = nn.Sequential( + >>> nn.Conv2d(2, 3, kernel_size=3), + >>> nn.Conv2d(3, 5, kernel_size=3), + >>> nn.Conv2d(5, 7, kernel_size=3), + >>> ) + >>> rfield = nh.ReceptiveFieldFor(self)() + >>> print('rfield = {}'.format(ub.repr2(rfield, nl=1, with_dtype=False))) + rfield = { + 'crop': np.array([3., 3.]), + 'shape': np.array([7., 7.]), + 'stride': np.array([1., 1.]), + } + """ + if input_field is None: + input_field = ReceptiveFieldFor.input() + rfield = input_field + hidden = HiddenFields() + iter_ = 
iter(module._modules.items()) + for key, child in module._modules.items(): + key, child = next(iter_) + if hasattr(child, 'receptive_field_for'): + rfield = hidden[key] = child.receptive_field_for(rfield) + else: + rfield = hidden[key] = ReceptiveFieldFor(child)(rfield) + rfield = ReceptiveField.coerce(rfield) + rfield.hidden = hidden + return rfield + + @staticmethod + @compute_type(torch.nn.DataParallel) + def data_parallel(module, *args, **kw): + return ReceptiveFieldFor(module.module)(*args, **kw) + + +class _TorchvisionMixin(object): + """ + Compute receptive fields for components of torchvision models + """ + + @staticmethod + @compute_type(torchvision.models.resnet.BasicBlock) + def resent_basic_block(module, input_field=None): + """ + Example: + >>> # xdoctest: +REQUIRES(--network) + >>> import torchvision # NOQA + >>> module = torchvision.models.resnet18().layer1[0] + >>> field = ReceptiveFieldFor(module)() + >>> print(ub.repr2(field.hidden, nl=1, with_dtype=False)) + { + 'conv1': {'crop': np.array([0., 0.]), 'shape': np.array([3., 3.]), 'stride': np.array([1., 1.])}, + 'bn1': {'crop': np.array([0., 0.]), 'shape': np.array([3., 3.]), 'stride': np.array([1., 1.])}, + 'relu1': {'crop': np.array([0., 0.]), 'shape': np.array([3., 3.]), 'stride': np.array([1., 1.])}, + 'conv2': {'crop': np.array([0., 0.]), 'shape': np.array([5., 5.]), 'stride': np.array([1., 1.])}, + 'bn2': {'crop': np.array([0., 0.]), 'shape': np.array([5., 5.]), 'stride': np.array([1., 1.])}, + 'relu2': {'crop': np.array([0., 0.]), 'shape': np.array([5., 5.]), 'stride': np.array([1., 1.])}, + } + """ + if input_field is None: + input_field = ReceptiveFieldFor.input() + hidden = HiddenFields() + + rfield = input_field + + rfield = hidden['conv1'] = ReceptiveFieldFor(module.conv1)(rfield) + rfield = hidden['bn1'] = ReceptiveFieldFor(module.bn1)(rfield) + rfield = hidden['relu1'] = ReceptiveFieldFor(module.relu)(rfield) + + rfield = hidden['conv2'] = ReceptiveFieldFor(module.conv2)(rfield) + rfield = hidden['bn2'] = ReceptiveFieldFor(module.bn2)(rfield) + rfield = hidden['relu2'] = ReceptiveFieldFor(module.relu)(rfield) + + if module.downsample is not None: + hidden['downsample'] = ReceptiveFieldFor(module.downsample)(input_field) + + rfield = ReceptiveFieldFor(module.relu)(rfield) + rfield.hidden = hidden + return rfield + + @staticmethod + @compute_type(torchvision.models.resnet.Bottleneck) + def resent_bottleneck(module, input_field=None): + """ + CommandLine: + xdoctest -m netharn.analytic.receptive_field_for _TorchvisionMixin.resent_bottleneck --network + + Example: + >>> # xdoctest: +REQUIRES(--network) + >>> import torchvision # NOQA + >>> module = torchvision.models.resnet50().layer1[0] + >>> field = ReceptiveFieldFor(module)() + >>> print(ub.repr2(field.hidden.shallow(1), nl=1, with_dtype=False)) + { + 'conv1': {'crop': ...([0., 0.]), 'shape': ...([1., 1.]), 'stride': ...([1., 1.])}, + 'bn1': {'crop': ...([0., 0.]), 'shape': ...([1., 1.]), 'stride': ...([1., 1.])}, + 'relu1': {'crop': ...([0., 0.]), 'shape': ...([1., 1.]), 'stride': ...([1., 1.])}, + 'conv2': {'crop': ...([0., 0.]), 'shape': ...([3., 3.]), 'stride': ...([1., 1.])}, + 'bn2': {'crop': ...([0., 0.]), 'shape': ...([3., 3.]), 'stride': ...([1., 1.])}, + 'relu2': {'crop': ...([0., 0.]), 'shape': ...([3., 3.]), 'stride': ...([1., 1.])}, + 'conv3': {'crop': ...([0., 0.]), 'shape': ...([3., 3.]), 'stride': ...([1., 1.])}, + 'bn3': {'crop': ...([0., 0.]), 'shape': ...([3., 3.]), 'stride': ...([1., 1.])}, + 'downsample': {'crop': ...([0., 0.]), 'shape': 
...([1., 1.]), 'stride': ...([1., 1.])}, + } + """ + if input_field is None: + input_field = ReceptiveFieldFor.input() + rfield = input_field + hidden = HiddenFields() + + rfield = hidden['conv1'] = ReceptiveFieldFor(module.conv1)(rfield) + rfield = hidden['bn1'] = ReceptiveFieldFor(module.bn1)(rfield) + rfield = hidden['relu1'] = ReceptiveFieldFor(module.relu)(rfield) + + rfield = hidden['conv2'] = ReceptiveFieldFor(module.conv2)(rfield) + rfield = hidden['bn2'] = ReceptiveFieldFor(module.bn2)(rfield) + rfield = hidden['relu2'] = ReceptiveFieldFor(module.relu)(rfield) + + rfield = hidden['conv3'] = ReceptiveFieldFor(module.conv3)(rfield) + rfield = hidden['bn3'] = ReceptiveFieldFor(module.bn3)(rfield) + + if module.downsample is not None: + hidden['downsample'] = ReceptiveFieldFor(module.downsample)(input_field) + + rfield = ReceptiveFieldFor(module.relu)(rfield) + rfield.hidden = hidden + return rfield + + @staticmethod + @compute_type(torchvision.models.resnet.ResNet) + def resnet_model(module, input_field=None, input_shape=None): + """ + CommandLine: + xdoctest -m netharn.analytic.receptive_field_for _TorchvisionMixin.resnet_model --network + + Example: + >>> # DISABLE_DOCTEST + >>> # Note: newest torchvision breaks this + >>> # xdoctest: +REQUIRES(--network) + >>> from netharn.analytic.receptive_field_for import * + >>> module = torchvision.models.resnet50() + >>> input_shape = (1, 3, 224, 224) + >>> field = ReceptiveFieldFor(module)(input_shape=input_shape) + >>> print(ub.repr2(field.hidden.shallow(1), nl=1, with_dtype=False)) + { + 'conv1': {'crop': ...([0., 0.]), 'shape': ...([7., 7.]), 'stride': ...([2., 2.])}, + 'bn1': {'crop': ...([0., 0.]), 'shape': ...([7., 7.]), 'stride': ...([2., 2.])}, + 'relu1': {'crop': ...([0., 0.]), 'shape': ...([7., 7.]), 'stride': ...([2., 2.])}, + 'maxpool': {'crop': ...([0., 0.]), 'shape': ...([11., 11.]), 'stride': ...([4., 4.])}, + 'layer1': {'crop': ...([0., 0.]), 'shape': ...([35., 35.]), 'stride': ...([4., 4.])}, + 'layer2': {'crop': ...([0., 0.]), 'shape': ...([91., 91.]), 'stride': ...([8., 8.])}, + 'layer3': {'crop': ...([0., 0.]), 'shape': ...([267., 267.]), 'stride': ...([16., 16.])}, + 'layer4': {'crop': ...([0., 0.]), 'shape': ...([427., 427.]), 'stride': ...([32., 32.])}, + 'avgpool': {'crop': ...([96., 96.]), 'shape': ...([619., 619.]), 'stride': ...([32., 32.])}, + 'flatten': {'crop': ...([96., 96.]), 'shape': ...([811., 811.]), 'stride': ...([32., 32.])}, + 'fc': {'crop': ...([96., 96.]), 'shape': ...([811., 811.]), 'stride': ...([32., 32.])}, + } + + """ + if input_field is None: + input_field = ReceptiveFieldFor.input() + rfield = input_field + hidden = HiddenFields() + rfield = hidden['conv1'] = ReceptiveFieldFor(module.conv1)(rfield) + rfield = hidden['bn1'] = ReceptiveFieldFor(module.bn1)(rfield) + rfield = hidden['relu1'] = ReceptiveFieldFor(module.relu)(rfield) + rfield = hidden['maxpool'] = ReceptiveFieldFor(module.maxpool)(rfield) + + rfield = hidden['layer1'] = ReceptiveFieldFor(module.layer1)(rfield) + rfield = hidden['layer2'] = ReceptiveFieldFor(module.layer2)(rfield) + rfield = hidden['layer3'] = ReceptiveFieldFor(module.layer3)(rfield) + rfield = hidden['layer4'] = ReceptiveFieldFor(module.layer4)(rfield) + + rfield = hidden['avgpool'] = ReceptiveFieldFor(module.avgpool)(rfield) + + if input_shape is None: + raise ValueError('input shape is required') + + output_shape = OutputShapeFor(module)(input_shape) + avgpool_shape = output_shape.hidden.shallow(1)['layer4'] + spatial_shape = np.array(avgpool_shape[2:]) + + # Keep 
everything the same except increase the RF shape + # based on how many output pixels there are. + rfield_flatten = ReceptiveField.coerce(dict(**rfield)) + # not sure if this is 100% correct + rfield_flatten['shape'] = rfield['shape'] + (spatial_shape - 1) * rfield['stride'] + rfield = hidden['flatten'] = rfield_flatten + + # The reshape operation will blend the receptive fields of the inputs + # but it will depend on the output shape of the layer. + # rfield = (rfield[0], prod(rfield[1:])) + + rfield = hidden['fc'] = ReceptiveFieldFor(module.fc)(rfield) + rfield.hidden = hidden + return rfield + + +class ReceptiveFieldFor(analytic_for.OutputFor, _TorchMixin, _TorchvisionMixin): + """ + Knows how to compute the receptive fields for many pytorch primatives and + some torchvision components. + + References: + https://medium.com/mlreview/a-guide-to-receptive-field-arithmetic-for-convolutional-neural-networks-e0f514068807 + + Returns: + Tuple[object, Dict]: + fields: object: The hidden layer recepvive fields (can be complex due to nesting) + field: Dict: a dictionary containing receptive field information. + + Notes: + A 1-D Pixel + +-----+ + ^ ^ ^ + left | L right + | + center + + Example: + >>> # Case where we have a registered func + >>> from netharn.analytic.receptive_field_for import * + >>> self = nn.Sequential( + >>> nn.Conv2d(2, 3, kernel_size=3), + >>> nn.Conv2d(3, 5, kernel_size=3), + >>> ) + >>> rfield = ReceptiveFieldFor(self)() + >>> print('rfield.hidden = {}'.format(ub.repr2(rfield.hidden, nl=3, with_dtype=False))) + >>> print('rfield = {}'.format(ub.repr2(rfield, nl=1, with_dtype=False))) + rfield.hidden = { + '0': { + 'crop': np.array([1., 1.]), + 'shape': np.array([3., 3.]), + 'stride': np.array([1., 1.]), + }, + '1': { + 'crop': np.array([2., 2.]), + 'shape': np.array([5., 5.]), + 'stride': np.array([1., 1.]), + }, + } + rfield = { + 'crop': np.array([2., 2.]), + 'shape': np.array([5., 5.]), + 'stride': np.array([1., 1.]), + } + + Example: + >>> # Case where we haven't registered a func + >>> self = nn.Conv2d(2, 3, kernel_size=3) + >>> rfield = ReceptiveFieldFor(self)() + >>> print('rfield = {}'.format(ub.repr2(rfield, nl=1, with_dtype=False))) + rfield = { + 'crop': np.array([1., 1.]), + 'shape': np.array([3., 3.]), + 'stride': np.array([1., 1.]), + } + + Example: + >>> # xdoctest: +REQUIRES(--network) + >>> import torchvision # NOQA + >>> module = torchvision.models.alexnet().features + >>> field = ReceptiveFieldFor(module)() + >>> print(ub.repr2(field, nl=1, with_dtype=False)) + { + 'crop': np.array([31., 31.]), + 'shape': np.array([195., 195.]), + 'stride': np.array([32., 32.]), + } + """ + # impl = math # for hacking in sympy + + def __init__(self, module): + self.module = module + self._func = getattr(module, 'receptive_field_for', None) + if self._func is None: + # Lookup rfield func if we can't find it + found = [] + for type, _func in REGISTERED_TYPES: + try: + if module is type or isinstance(module, type): + found.append(_func) + except TypeError: + pass + if len(found) == 1: + self._func = found[0] + elif len(found) == 0: + raise ReceptiveFieldTypeError('Unknown (rf) module type {}'.format(module)) + else: + raise AssertionError('Ambiguous (rf) module {}. 
Found {}'.format(module, found)) + + def __call__(self, *args, **kwargs): + if isinstance(self.module, nn.Module): + # bound methods dont need module + is_bound = hasattr(self._func, '__func__') and getattr(self._func, '__func__', None) is not None + is_bound |= hasattr(self._func, 'im_func') and getattr(self._func, 'im_func', None) is not None + if is_bound: + rfield = self._func(*args, **kwargs) + else: + # nn.Module with state + rfield = self._func(self.module, *args, **kwargs) + else: + # a simple pytorch func + rfield = self._func(*args, **kwargs) + + rfield = ReceptiveField.coerce(rfield) + return rfield + + # @staticmethod + # def view(arr, *args): + # """ + # Wraps view calls + + # Example: + # >>> arr = (2, 32, 9, 9) + # >>> result = OutputShapeFor.view(arr, -1) + # >>> assert result == (5184,) + # """ + # from netharn import layers + # reshape = layers.Reshape(*args) + # return reshape.output_shape_for(arr) + + # @staticmethod + def shape(arr): + """ + Wraps shape calls + """ + raise ReceptiveFieldTypeError('RF is currently unable to inspect output shape') + + @staticmethod + def _elementwise(field1, field2): + # Combines two receptive fields in an elementwise fashion + field = ReceptiveField({ + 'shape': np.maximum(field1['shape'], field2['shape']), + 'crop': np.maximum(field1['crop'], field2['crop']), + 'stride': np.maximum(field1['stride'], field2['stride']), + }) + return field + + @staticmethod + def add(field1, field2): + return ReceptiveFieldFor._elementwise(field1, field2) + + @staticmethod + def mul(field1, field2): + return ReceptiveFieldFor._elementwise(field1, field2) + + @staticmethod + def sub(field1, field2): + return ReceptiveFieldFor._elementwise(field1, field2) + + @staticmethod + def div(field1, field2): + return ReceptiveFieldFor._elementwise(field1, field2) + + +def effective_receptive_feild(module, inputs, output_key=None, sigma=0, + thresh=1.00, ignore_norms=True, + ignore_extra=None): + """ + Empirically measures the effective receptive feild of a network + + Method from [0], implementation loosely based on [1]. + + Args: + module (torch.nn.Module) : the network + + inputs (torch.nn.Tensor) : the input to the network. Must share the + same device as `module`. + + output_key (None | str | Callable): If the network outputs a non-tensor + then this should be a function that does postprocessing and returns + a relevant Tensor that can be used to compute gradients. If the + output is a dictionary then this can also be a string-based key + used to lookup the appropriate output. + + sigma (float, default=0): smoothness factor (via gaussian blur) + + thresh (float, default=1.00): only consider this fraction of the + data as meaningful (i.e. find the effective RF shape that explains + 95% of the data). A threshold of 1.0 or greater does nothing. + + ignore_norms (bool, default=True): if True ignores normalization layers + like batch and group norm which adds negligable, but non-zero + impact everywhere and causes the ERF shape estimation to be + dramatically greater than it should be (although the impact still + makes sense). + + ignore_extra (List[type], optioanl): if specified, any layer that is a + subclass of one of these types is also ignored. 
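+
+    Note:
+        For dictionary outputs, ``output_key`` selects which tensor to
+        backpropagate through. A minimal sketch (the ``Wrapper`` module and
+        its ``'out'`` key here are invented for illustration):
+
+        >>> class Wrapper(nn.Module):
+        >>>     def __init__(self, m):
+        >>>         super(Wrapper, self).__init__()
+        >>>         self.m = m
+        >>>     def forward(self, x):
+        >>>         return {'out': self.m(x)}
+        >>> module = Wrapper(nn.Conv2d(1, 1, 3))
+        >>> inputs = torch.rand(1, 1, 32, 32)
+        >>> field = effective_receptive_feild(module, inputs, output_key='out')
+        >>> list(field['shape'])
+        [3, 3]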
+ + Returns: + dict: containing keys + 'shape' containing the effective RF shape and + 'impact' which contains the thresholded distribution + + References: + [0] https://arxiv.org/pdf/1701.04128.pdf + [1] https://github.com/rogertrullo/Receptive-Field-in-Pytorch/blob/master/compute_RF.py + + Example: + >>> from netharn.analytic.receptive_field_for import * + >>> import torchvision # NOQA + >>> module = nn.Sequential(*[nn.Conv2d(1, 1, 3) for i in range(10)]) + >>> inputs = torch.rand(1, 1, 200, 200) + >>> emperical_field = effective_receptive_feild(module, inputs) + >>> theoretic_field = ReceptiveFieldFor(module)() + >>> # The emperical results should never be bigger than the theoretical + >>> assert np.all(emperical_field['shape'] <= theoretic_field['shape']) + + >>> # xdoctest: +REQUIRES(--slow) + >>> module = torchvision.models.alexnet().features + >>> inputs = torch.rand(1, 3, 224, 224) + >>> emperical_field = effective_receptive_feild(module, inputs) + >>> theoretic_field = ReceptiveFieldFor(module)() + >>> # The emperical results should never be bigger than the theoretical + >>> assert np.all(emperical_field['shape'] <= theoretic_field['shape']) + + >>> # xdoctest: +REQUIRES(--slow) + >>> import netharn as nh + >>> xpu = nh.XPU.coerce('auto') + >>> module = xpu.move(torchvision.models.vgg11_bn().features) + >>> inputs = xpu.move(torch.rand(1, 3, 224, 224)) + >>> emperical_field = effective_receptive_feild(module, inputs) + >>> theoretic_field = ReceptiveFieldFor(module)() + >>> # The emperical results should never be bigger than the theoretical + >>> assert np.all(emperical_field['shape'] <= theoretic_field['shape']) + + >>> # xdoctest: +REQUIRES(--show) + >>> import kwplot + >>> kwplot.autompl() + >>> kwplot.imshow(emperical_field['impact'], doclf=True) + + Ignore: + >>> xpu = nh.XPU.coerce('auto') + >>> module = xpu.move(torchvision.models.resnet50()) + >>> inputs = xpu.move(torch.rand(8, 3, 224, 224)) + >>> emperical_field = effective_receptive_feild(module, inputs) + >>> import kwplot + >>> kwplot.autompl() + >>> kwplot.imshow(emperical_field['impact'], doclf=True) + """ + import netharn as nh + + # zero gradients + for p in module.parameters(): + if p.grad is not None: + p.grad.detach_() + p.grad.zero_() + + if inputs.grad is not None: + inputs.grad.detach_() + inputs.grad.zero_() + + inputs.requires_grad = True + # if inputs.grad is not None: + # raise ValueError('inputs alread has accumulated gradients') + + # Completely ignore BatchNorm layers as they will give the entire input + # some negligable but non-zero effect on the receptive feild. + ignored = [] + if ignore_norms: + ignored += [ + nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d, + nn.modules.normalization.GroupNorm, + nn.modules.normalization.LocalResponseNorm, + nn.modules.normalization.LayerNorm, nn.CrossMapLRN2d, + nn.InstanceNorm1d, nn.InstanceNorm2d, nn.InstanceNorm3d, + nh.layers.L2Norm, + ] + if ignore_extra: + ignored += ignore_extra + with nh.util.IgnoreLayerContext(module, tuple(ignored)): + outputs = module(inputs) + + # Note: grab a single (likely FCN) output channel + if callable(output_key): + output_y = output_key(outputs) + elif output_key is None: + output_y = outputs + else: + output_y = outputs[output_key] + # elif isinstance(output_key, (six.string_types, int)): + # else: + # raise TypeError('output_key={} is not understood'.format(output_key)) + + if not isinstance(output_y, torch.Tensor): + raise TypeError( + 'The output is a {}, not a tensor. 
Please specify '
+            'output_key and ensure it returns a Tensor.'.format(type(outputs)))
+
+    # Note: this still does the right thing if there is no spatial component,
+    # because all outputs are center outputs.
+    center_dims = (np.array(output_y.shape[2:]) // 2).tolist()
+    center_slice = [slice(None), slice(None)] + center_dims
+
+    # We don't need to compute a loss because we can explicitly set gradients.
+    # Yay torch!
+    # Explicitly set ∂l/∂y[:] = 0
+    # Explicitly set ∂l/∂y[center] = 1
+    grad_loss_wrt_y = torch.zeros_like(output_y)
+    grad_loss_wrt_y[...] = 0
+    grad_loss_wrt_y[center_slice] = 1
+
+    # Backpropagate as if the grad of the loss wrt y[center] was 1.
+    # Note: this can take a long time on the CPU (sometimes?)
+    output_y.backward(gradient=grad_loss_wrt_y)
+
+    # The input gradient is now a measure of how much it can impact the output.
+    impact = inputs.grad.abs()
+
+    # Average the impact over all batches and all channels
+    average_impact = impact.mean(dim=0).mean(dim=0)
+
+    if isinstance(average_impact, torch.Tensor):
+        average_impact = average_impact.data.cpu().numpy()
+
+    idx_nonzeros = np.where(average_impact != 0)
+    rf_bounds = [(0, 0) if len(idx) == 0 else (idx.min(), idx.max()) for idx in idx_nonzeros]
+    rf_shape = [(mx - mn + 1) for mn, mx in rf_bounds]
+    rf_slice = tuple([slice(mn, mx + 1) for mn, mx in rf_bounds])
+
+    # Crop out the average impact zone for visualization
+    # Normalize to have a maximum value of 1.0
+    rf_impact = average_impact[rf_slice]
+    rf_impact /= rf_impact.max()
+
+    rf_impact = torch.FloatTensor(rf_impact)
+    if sigma > 0:
+        # Smooth things out
+        _blur = nh.layers.GaussianBlurNd(dim=1, num_features=1, sigma=sigma)
+        _blur.to(rf_impact.device)
+        rf_impact = _blur(rf_impact[None, None])[0, 0]
+
+    if thresh < 1:
+        density = rf_impact.contiguous().view(-1).cpu().numpy().copy()
+        density.sort()
+        density = density[::-1]
+        # Find the value threshold that explains thresh (e.g.
95%) of the data + idx = np.where(density.cumsum() > thresh * density.sum())[0] + lowval = float(density[idx[0]]) + + effective_impact = rf_impact * (rf_impact > lowval).float() + effective_idx_nonzeros = np.where(effective_impact != 0) + effective_rf_bounds = [(idx.min(), idx.max()) for idx in effective_idx_nonzeros] + effective_shape = [(mx - mn + 1) for mn, mx in effective_rf_bounds] + else: + effective_impact = rf_impact + effective_rf_bounds = rf_shape + effective_shape = rf_shape + + emperical_field = { + 'shape': effective_shape, + 'impact': effective_impact, + 'thresh': thresh, + } + return emperical_field + + +if __name__ == '__main__': + """ + CommandLine: + xdoctest -m netharn.analytic.receptive_field_for all --network + """ + import xdoctest + xdoctest.doctest_module(__file__) diff --git a/netharn/analytic_for.py b/netharn/analytic_for.py index 9c413eb360cde22e31b575e4475047a8cdbe46d8..0d777721e065bd4ee3116be634fe4e2f3dfc4717 100644 --- a/netharn/analytic_for.py +++ b/netharn/analytic_for.py @@ -1,140 +1,3 @@ -""" -Code for commonalities between "X for" objects that compute analytic properties -of networks like OutputShapeFor and ReceptiveFieldFor -""" -import ubelt as ub -from collections import OrderedDict - - -class Hidden(OrderedDict, ub.NiceRepr): - """ Object for storing hidden states of analystic computation """ - - def __nice__(self): - return ub.repr2(self, nl=0) - - def __str__(self): - return ub.NiceRepr.__str__(self) - - def __repr__(self): - return ub.NiceRepr.__repr__(self) - - def __setitem__(self, key, value): - if getattr(value, 'hidden', None) is not None: - # When setting a value to an OutputShape object, if that object has - # a hidden shape, then use that instead. - value = value.hidden - return OrderedDict.__setitem__(self, key, value) - - def shallow(self, n=1): - """ - Grabs only the shallowest n layers of hidden shapes - """ - if n == 0: - last = self - while hasattr(last, 'shallow'): - values = list(last.values()) - if len(values): - last = values[-1] - else: - break - return last - else: - output = OrderedDict() - for key, value in self.items(): - # if isinstance(value, HiddenShapes): - if hasattr(value, 'shallow'): - value = value.shallow(n - 1) - output[key] = value - return output - - -class OutputFor(object): - """ - Analytic base / identity class - """ - def __init__(self, func): - self.func = func - - def __call__(self, *args, **kw): - return self.func(*args, **kw) - - -class Output(object): - """ - Analytic base / identity class - """ - @classmethod - def coerce(cls, data=None, hidden=None): - return data - - -class ForwardFor(OutputFor): - """ - Analytic version of forward functions - """ - def __init__(self, func): - self.func = func - - def __call__(self, *args, **kw): - return self.func(*args, **kw) - - @staticmethod - def getitem(arr): - """ - Wraps getitem calls - - Example: - >>> import torch - >>> arr = torch.rand(2, 16, 2, 2) - >>> result = ForwardFor.getitem(arr)[:, 0:4] - >>> assert result.shape == (2, 4, 2, 2) - """ - return _ForwardGetItem(arr) - - @staticmethod - def view(arr, *args): - """ - Wraps view calls - - Example: - >>> import torch - >>> arr = torch.rand(2, 16, 2, 2) - >>> result = ForwardFor.view(arr, -1) - """ - return arr.view(*args) - - @staticmethod - def shape(arr): - """ - Wraps shape calls - - Example: - >>> import torch - >>> arr = torch.rand(2, 16, 2, 2) - >>> result = ForwardFor.shape(arr) - """ - return arr.shape - - @staticmethod - def add(arr1, arr2): - return arr1 + arr2 - - @staticmethod - def mul(arr1, 
arr2): - return arr1 * arr2 - - @staticmethod - def sub(arr1, arr2): - return arr1 - arr2 - - @staticmethod - def div(arr1, arr2): - return arr1 - arr2 - - -class _ForwardGetItem(object): - def __init__(self, inp): - self.inp = inp - - def __getitem__(self, slices): - return self.inp.__getitem__(slices) +import warnings +warnings.warn('Deprecated file. Use netharn.analytic.analytic_for instead', UserWarning) +from netharn.analytic.analytic_for import * # NOQA diff --git a/netharn/api.py b/netharn/api.py index 6e749cb363c8c1e66914521c971dc13abdb6da15..ff161c26b45a59c694e4ab58152e890762f035e5 100644 --- a/netharn/api.py +++ b/netharn/api.py @@ -167,9 +167,37 @@ class Optimizer(object): def coerce(config={}, **kw): """ Accepts keywords: - optimizer / optim - learning_rate / lr - weight_decay / decay + optimizer / optim : + can be sgd, adam, adamw, rmsprop + + learning_rate / lr : + a float + + weight_decay / decay : + a float + + momentum: + a float, only used if the optimizer accepts it + + Notes: + pip install torch-optimizer + + References: + https://datascience.stackexchange.com/questions/26792/difference-between-rmsprop-with-momentum-and-adam-optimizers + https://github.com/jettify/pytorch-optimizer + + Example: + >>> config = {'optimizer': 'sgd'} + >>> optim_ = Optimizer.coerce(config) + + >>> # xdoctest: +REQUIRES(module:torch_optimizer) + >>> from netharn.api import * # NOQA + >>> config = {'optimizer': 'DiffGrad'} + >>> optim_ = Optimizer.coerce(config) + >>> print('optim_ = {!r}'.format(optim_)) + >>> config = {'optimizer': 'Yogi'} + >>> optim_ = Optimizer.coerce(config) + >>> print('optim_ = {!r}'.format(optim_)) """ import netharn as nh _update_defaults(config, kw) @@ -189,12 +217,18 @@ class Optimizer(object): optim_ = (torch.optim.Adam, { 'lr': lr, 'weight_decay': decay, + # 'betas': (0.9, 0.999), + # 'eps': 1e-8, + # 'amsgrad': False }) elif key == 'adamw': if _TORCH_IS_GE_1_2_0: from torch.optim import AdamW optim_ = (AdamW, { 'lr': lr, + # 'betas': (0.9, 0.999), + # 'eps': 1e-8, + # 'amsgrad': False }) else: optim_ = (nh.optimizers.AdamW, { @@ -208,7 +242,36 @@ class Optimizer(object): 'alpha': 0.9, }) else: - raise KeyError(key) + try: + import torch_optimizer + except Exception: + torch_optimizer = None + raise KeyError(key) + else: + + known = ['AccSGD', 'AdaBound', 'AdaMod', 'DiffGrad', 'Lamb', + 'Lookahead', 'NovoGrad', 'RAdam', 'SGDW', 'Yogi'] + + from netharn.util import util_inspect + if 0: + for key in known: + cls = getattr(torch_optimizer, key, None) + print('cls = {!r}'.format(cls)) + defaultkw = util_inspect.default_kwargs(cls) + print('defaultkw = {!r}'.format(defaultkw)) + + _lut = {k.lower(): k for k in known} + key = _lut[key] + + cls = getattr(torch_optimizer, key, None) + if cls is not None: + defaultkw = util_inspect.default_kwargs(cls) + kw = defaultkw.copy() + kw.update() + optim_ = (cls, kw) + else: + raise KeyError(key) + return optim_ @@ -284,6 +347,22 @@ class Scheduler(object): for scheduler == exponential: gamma stepsize + + scheduler accepts several special strings which involves a keyword + followed by a special coded string that can be used to modify + parameters. 
Some examples: + + step-10-30-50-100 - multiply LR by 0.1 at every point + + onecycle90 - a cyclic scheduler peaking at the epoch 90 // 2 + + onecycle90-p0.2 - a cyclic scheduler peaking at the int(90 * 0.2) + + ReduceLROnPlateau-p2-c2 - a ReduceLROnPlateau scheduler with + a patience of 2 and a cooldown of 2 + + Exponential-g0.98-s1 - exponential decay of 0.98 every 1-th + epoch """ import netharn as nh import parse @@ -291,21 +370,43 @@ class Scheduler(object): key = config.get('scheduler', config.get('schedule', 'step90')) lr = config.get('learning_rate', config.get('lr', 3e-3)) - result = parse.parse('onecycle{:d}', key) - if result: + if key.startswith('onecycle'): + result = parse.parse('onecycle{:d}-{}', key) size = result.fixed[0] + suffix = result.fixed[1] + + parts = suffix.split('-') + kw = { + 'peak': size // 2, + } + try: + for part in parts: + if not part: + continue + if part.startswith('p'): + valstr = part[1:] + if valstr.startswith('0.'): + kw['peak'] = int(size * float(valstr)) + else: + kw['peak'] = int(valstr) + else: + raise ValueError('unknown {} part'.format(suffix)) + except Exception: + raise ValueError('Unable to parse {} specs: {}'.format( + result, suffix)) + scheduler_ = (nh.schedulers.ListedScheduler, { 'points': { 'lr': { size * 0 : lr * 0.1, - size // 2 : lr * 1.0, + kw['peak'] : lr * 1.0, size * 1 : lr * 0.01, - size + 1 : lr * 0.0001, + size + 1 : lr * 0.001, }, 'momentum': { size * 0 : 0.95, - size // 2 : 0.85, - size * 1 : 0.98, + kw['peak'] : 0.90, + size * 1 : 0.95, size + 1 : 0.999, }, }, @@ -316,11 +417,17 @@ class Scheduler(object): if key.lower().startswith(prefix): # Allow step to specify `-` separated step points suffix = key[len(prefix):] - points = [int(p) for p in suffix.split('-') if p] + param_parts = suffix.split('-') + if param_parts and param_parts[-1].startswith('f'): + factor = float(param_parts[-1][1:]) + param_parts = param_parts[:-1] + else: + factor = 10 + points = [int(p) for p in param_parts if p] assert sorted(points) == points, 'points must be in order' lr_pts = {0: lr} for i, epoch in enumerate(points, start=1): - lr_pts[epoch] = lr / (10 ** i) + lr_pts[epoch] = lr / (factor ** i) scheduler_ = (nh.schedulers.ListedScheduler, { 'points': { diff --git a/netharn/criterions/focal.py b/netharn/criterions/focal.py index 8df138f9304ffb7db2b213285541053e34304145..37bb0b60684707c3dc28475ecd03c457057e84ca 100644 --- a/netharn/criterions/focal.py +++ b/netharn/criterions/focal.py @@ -46,7 +46,7 @@ def _backwards_compat_reduction_kw(size_average, reduce, reduction): def focal_loss(input, target, focus, dim=1, weight=None, ignore_index=None, - reduction=ELEMENTWISE_MEAN): + reduction='mean'): """ Functional version of `FocalLoss` """ @@ -57,6 +57,65 @@ def focal_loss(input, target, focus, dim=1, weight=None, ignore_index=None, return output +def _nll_focal_loss2(): + pass + + +def _kuangliu_focal_loss(x, y): + '''Focal loss. + Args: + x: (tensor) sized [N,D]. + y: (tensor) sized [N,]. + Return: + (tensor) focal loss. 
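+
+    Note:
+        A sketch of the modulation this loss applies (the ``alpha`` and
+        ``gamma`` values below mirror the constants hardcoded in this
+        helper):
+
+        >>> alpha, gamma = 0.25, 2
+        >>> p = torch.tensor([0.9])  # an easy, well-classified example
+        >>> focal = alpha * (1 - p) ** gamma * -p.log()
+        >>> plain = -p.log()
+        >>> bool(focal < plain)  # easy examples are down-weighted
+        True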
+ + + Ignore: + >>> C = 3 + >>> dim = 1 + >>> pred = x = logits = torch.rand(10, C) + >>> target = y = targets = (torch.rand(10) * C).long() + >>> l1 = _kuangliu_focal_loss(logits, targets) + >>> l2 = _kuangliu_focal_loss_alt(logits, targets) + >>> print('l1 = {!r}'.format(l1)) + >>> print('l2 = {!r}'.format(l2)) + ''' + alpha = 0.25 + gamma = 2 + + num_classes = x.shape[1] + t = kwarray.one_hot_embedding(y, num_classes) # [N,21] + # t = t[:, 1:] # exclude background + # t = t.cuda() # [N,20] + + p = x.sigmoid() + pt = p * t + (1 - p) * (1 - t) # pt = p if t > 0 else 1-p + w = alpha * t + (1 - alpha) * (1 - t) # w = alpha if t > 0 else 1-alpha + w = w * (1 - pt).pow(gamma) + return F.binary_cross_entropy_with_logits(x, t, w, reduction='mean') + + +def _kuangliu_focal_loss_alt(x, y): + '''Focal loss alternative. + Args: + x: (tensor) sized [N,D]. + y: (tensor) sized [N,]. + Return: + (tensor) focal loss. + ''' + alpha = 0.25 + + num_classes = x.shape[1] + t = kwarray.one_hot_embedding(y, num_classes) + + xt = x * (2 * t - 1) # xt = x if t > 0 else -x + pt = (2 * xt + 1).sigmoid() + + w = alpha * t + (1 - alpha) * (1 - t) + loss = -w * pt.log() / 2 + return loss.mean() + + def nll_focal_loss(log_probs, targets, focus, dim=1, weight=None, ignore_index=None, reduction='none'): r""" @@ -92,67 +151,6 @@ def nll_focal_loss(log_probs, targets, focus, dim=1, weight=None, >>> dim = 1 >>> ignore_index = 0 >>> output = nll_focal_loss(log_probs, targets, focus, dim, weight, ignore_index) - - Benchmark: - >>> from netharn.criterions.focal import * - >>> import ubelt as ub - >>> import torch.nn.functional as F - >>> import netharn as nh - >>> B, C = 16, 37 - >>> DIMS = (128, 128) - >>> dim = 1 - >>> inputs = torch.rand(B, C, *DIMS) - >>> inputs.requires_grad = True - >>> log_probs = F.log_softmax(inputs, dim=dim) - >>> targets = (torch.rand(B, *DIMS) * C).long() - >>> # - >>> ti = ub.Timerit(20, bestof=3, verbose=1, unit='us') - >>> # - >>> devices = [ - >>> nh.XPU.coerce('cuda0'), - >>> nh.XPU.coerce('cpu'), - >>> ] - >>> # - >>> # Forward - >>> for xpu in devices: - >>> log_probs = xpu.move(log_probs) - >>> targets = xpu.move(targets) - >>> print(' --- FORWARD ---') - >>> print('\n\n--- xpu = {!r} ---\n'.format(xpu)) - >>> for timer in ti.reset('F.nll_loss'): - >>> with timer: - >>> loss1 = F.nll_loss(log_probs, targets, reduction='none') - >>> torch.cuda.synchronize() - >>> for timer in ti.reset('nll_focal_loss(focus=0)'): - >>> with timer: - >>> loss2 = nll_focal_loss(log_probs, targets, focus=0, dim=dim) - >>> torch.cuda.synchronize() - >>> for timer in ti.reset('nll_focal_loss(focus=2)'): - >>> with timer: - >>> loss3 = nll_focal_loss(log_probs, targets, focus=2, dim=dim) - >>> torch.cuda.synchronize() - >>> # - >>> # Backward - >>> ti = ub.Timerit(5, bestof=1, verbose=1, unit='ms') - >>> log_probs = F.log_softmax(inputs, dim=dim) - >>> for xpu in devices: - >>> print(' --- BACKWARD ---') - >>> print('\n\n--- xpu = {!r} ---\n'.format(xpu)) - >>> for timer in ti.reset('F.nll_loss'): - >>> with timer: - >>> loss1 = F.nll_loss(log_probs, targets, reduction='none') - >>> loss1.mean().backward(retain_graph=True) - >>> torch.cuda.synchronize() - >>> for timer in ti.reset('nll_focal_loss(focus=0)'): - >>> with timer: - >>> loss2 = nll_focal_loss(log_probs, targets, focus=0.0, dim=dim) - >>> loss2.mean().backward(retain_graph=True) - >>> torch.cuda.synchronize() - >>> for timer in ti.reset('nll_focal_loss(focus=2)'): - >>> with timer: - >>> loss3 = nll_focal_loss(log_probs, targets, focus=2.0, dim=dim) - 
>>> loss3.mean().backward(retain_graph=True) - >>> torch.cuda.synchronize() """ if focus == 0 and dim == 1: # In this case nll_focal_loss is nll_loss, but nll_loss is faster @@ -199,6 +197,67 @@ def nll_focal_loss(log_probs, targets, focus, dim=1, weight=None, return output +def _benchmark_focal_loss(): + import ubelt as ub + import torch.nn.functional as F + import netharn as nh + B, C = 16, 37 + DIMS = (128, 128) + dim = 1 + inputs = torch.rand(B, C, *DIMS) + inputs.requires_grad = True + log_probs = F.log_softmax(inputs, dim=dim) + targets = (torch.rand(B, *DIMS) * C).long() + # + ti = ub.Timerit(20, bestof=3, verbose=1, unit='us') + # + devices = [ + nh.XPU.coerce('cuda0'), + nh.XPU.coerce('cpu'), + ] + # + # Forward + for xpu in devices: + log_probs = xpu.move(log_probs) + targets = xpu.move(targets) + print(' --- FORWARD ---') + print('\n\n--- xpu = {!r} ---\n'.format(xpu)) + for timer in ti.reset('F.nll_loss'): + with timer: + loss1 = F.nll_loss(log_probs, targets, reduction='none') + torch.cuda.synchronize() + for timer in ti.reset('nll_focal_loss(focus=0)'): + with timer: + loss2 = nll_focal_loss(log_probs, targets, focus=0, dim=dim) + torch.cuda.synchronize() + for timer in ti.reset('nll_focal_loss(focus=2)'): + with timer: + loss3 = nll_focal_loss(log_probs, targets, focus=2, dim=dim) + torch.cuda.synchronize() + # + # Backward + ti = ub.Timerit(5, bestof=1, verbose=1, unit='ms') + log_probs = F.log_softmax(inputs, dim=dim) + for xpu in devices: + print(' --- BACKWARD ---') + print('\n\n--- xpu = {!r} ---\n'.format(xpu)) + for timer in ti.reset('F.nll_loss'): + with timer: + loss1 = F.nll_loss(log_probs, targets, reduction='none') + loss1.mean().backward(retain_graph=True) + torch.cuda.synchronize() + for timer in ti.reset('nll_focal_loss(focus=0)'): + with timer: + loss2 = nll_focal_loss(log_probs, targets, focus=0.0, dim=dim) + loss2.mean().backward(retain_graph=True) + torch.cuda.synchronize() + for timer in ti.reset('nll_focal_loss(focus=2)'): + with timer: + loss3 = nll_focal_loss(log_probs, targets, focus=2.0, dim=dim) + loss3.mean().backward(retain_graph=True) + torch.cuda.synchronize() + + class FocalLoss(torch.nn.modules.loss._WeightedLoss): r""" Generalization of ``CrossEntropyLoss`` with a "focus" modulation term. diff --git a/netharn/data/batch_samplers.py b/netharn/data/batch_samplers.py index 6ce7ebc9715f0cb0440107145a4a9fa2826da625..88758ad974019bfea7f26cd1adbf61e47f572db2 100644 --- a/netharn/data/batch_samplers.py +++ b/netharn/data/batch_samplers.py @@ -1,6 +1,8 @@ import netharn as nh import ubelt as ub import torch.utils +import torch +import numpy as np class MatchingSamplerPK(ub.NiceRepr, torch.utils.data.sampler.BatchSampler): @@ -132,3 +134,343 @@ class MatchingSamplerPK(ub.NiceRepr, torch.utils.data.sampler.BatchSampler): def __len__(self): return self.num_batches + + +class BalancedBatchSampler( + ub.NiceRepr, torch.utils.data.sampler.BatchSampler): + """ + A sampler for balancing classes amongst batches + + Args: + index_to_label (List[int]): the label for each index in a dataset + batch_size (int): number of dataset indexes for each batch + num_batches (int | str, default='auto'): number of batches to generate + quantile (float): interpolates between under and oversamling when + num_batches='auto'. A value of 0 is pure undersampling, and a value + of 1 is pure oversampling. 
+ shuffle (bool, default=False): if True randomize batch ordering + drop_last (bool): unused, exists for compatibility + rng (RandomState, default=None): random seed + + Example: + >>> from netharn.data.batch_samplers import * # NOQA + >>> from netharn.data.batch_samplers import RingSampler # NOQA + >>> import kwarray + >>> rng = kwarray.ensure_rng(0) + >>> classes = ['class_{}'.format(i) for i in range(5)] + >>> # Create a random class label for each item + >>> index_to_label = rng.randint(0, len(classes), 100) + >>> if 1: + >>> # Create a rare class + >>> index_to_label[0:3] = 42 + >>> quantile = 0.0 + >>> self = BalancedBatchSampler(index_to_label, batch_size=4, quantile=quantile, rng=0) + >>> print('self.label_to_freq = {!r}'.format(self.label_to_freq)) + >>> indices = list(self) + >>> print('indices = {!r}'.format(indices)) + >>> # Print the epoch / item label frequency per epoch + >>> label_sequence = [] + >>> index_sequence = [] + >>> for item_indices in self: + >>> item_indices = np.array(item_indices) + >>> item_labels = index_to_label[item_indices] + >>> index_sequence.extend(item_indices) + >>> label_sequence.extend(item_labels) + >>> label_hist = ub.dict_hist(label_sequence) + >>> index_hist = ub.dict_hist(index_sequence) + >>> label_hist = ub.sorted_vals(label_hist, reverse=True) + >>> index_hist = ub.sorted_vals(index_hist, reverse=True) + >>> index_hist = ub.dict_subset(index_hist, list(index_hist.keys())[0:5]) + >>> print('label_hist = {}'.format(ub.repr2(label_hist, nl=1))) + >>> print('index_hist = {}'.format(ub.repr2(index_hist, nl=1))) + """ + + def __init__(self, index_to_label, batch_size=1, num_batches='auto', + quantile=0.5, shuffle=False, rng=None): + import kwarray + + rng = kwarray.ensure_rng(rng, api='python') + label_to_indices = kwarray.group_items( + np.arange(len(index_to_label)), index_to_label) + + label_to_freq = ub.map_vals(len, label_to_indices) + + label_to_subsampler = { + label: RingSampler(indices, shuffle=shuffle, rng=rng) + for label, indices in label_to_indices.items() + } + + self.label_to_freq = label_to_freq + self.index_to_label = index_to_label + self.batch_size = batch_size + self.shuffle = shuffle + self.rng = rng + self.label_to_indices = label_to_indices + self.label_to_subsampler = label_to_subsampler + + if num_batches == 'auto': + self.num_batches = self._auto_num_batches(quantile) + else: + self.num_batches = num_batches + + self.labels = list(self.label_to_indices.keys()) + + def __nice__(self): + return ub.repr2({ + 'num_batches': self.num_batches, + 'batch_size': self.batch_size, + }, nl=0) + + def _auto_num_batches(self, quantile): + # Over / under sample each class depending on the balance factor + label_freq = sorted(self.label_to_freq.values()) + # if 'idf': + # TODO: idf balancing + # N = len(self.index_to_label) + # label_to_idf = ub.map_vals(lambda x: N / x, self.label_to_freq) + # denom = sum(label_to_idf.values()) + # label_to_prob = ub.map_vals(lambda x: x / denom, label_to_idf) + # How many times will we sample each category? 
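+        # For example, with hypothetical frequencies label_freq == [3, 40, 57]:
+        #   quantile=0.0 -> samples_per_label = 3   (pure undersampling)
+        #   quantile=0.5 -> samples_per_label = 40  (the median)
+        #   quantile=1.0 -> samples_per_label = 57  (pure oversampling)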
+ samples_per_label = np.quantile(label_freq, quantile) + # Compute #items as seen per epoch, and #batches from that + epoch_items = samples_per_label * len(label_freq) + num_batches = max(1, int(round(epoch_items / self.batch_size))) + return num_batches + + def __len__(self): + return self.num_batches + + def __iter__(self): + for index in range(self.num_batches): + yield self[index] + + def __getitem__(self, index): + # Choose a label for each item in the batch + if not hasattr(self.rng, 'choices'): + # python 3.5 support + chosen_labels = [self.rng.choice(self.labels) + for _ in range(self.batch_size)] + else: + chosen_labels = self.rng.choices(self.labels, k=self.batch_size) + # Count the number of items we need for each label + label_freq = ub.dict_hist(chosen_labels) + + # Sample those indices + batch_idxs = list(ub.flatten([ + self.label_to_subsampler[label].sample(num) + for label, num in label_freq.items() + ])) + return batch_idxs + + +class GroupedBalancedBatchSampler(ub.NiceRepr, torch.utils.data.sampler.BatchSampler): + """ + Show items containing less frequent categories more often + + Args: + index_to_labels (List[Listint]]): the labels for each index in a dataset + batch_size (int): number of dataset indexes for each batch + num_batches (int | str, default='auto'): number of batches to generate + shuffle (bool, default=False): if True randomize batch ordering + drop_last (bool): unused, exists for compatibility + rng (RandomState, default=None): random seed + + References: + https://arxiv.org/pdf/1908.09492.pdf + + Example: + >>> from netharn.data.batch_samplers import * # NOQA + >>> import kwarray + >>> rng = kwarray.ensure_rng(0) + >>> classes = ['class_{}'.format(i) for i in range(10)] + >>> # Create a set of random classes for each item + >>> index_to_labels = [rng.randint(0, len(classes), rng.randint(10)) + >>> for _ in range(1000)] + >>> # Create a rare class + >>> index_to_labels[0][0] = 42 + >>> self = GroupedBalancedBatchSampler(index_to_labels, batch_size=4) + >>> print('self.label_to_freq = {!r}'.format(self.label_to_freq)) + >>> indices = list(self) + >>> print('indices = {!r}'.format(indices)) + >>> # Print the epoch / item label frequency per epoch + >>> label_sequence = [] + >>> index_sequence = [] + >>> for item_indices in self: + >>> item_indices = np.array(item_indices) + >>> item_labels = list(ub.flatten(ub.take(index_to_labels, item_indices))) + >>> index_sequence.extend(item_indices) + >>> label_sequence.extend(item_labels) + >>> label_hist = ub.dict_hist(label_sequence) + >>> index_hist = ub.dict_hist(index_sequence) + >>> label_hist = ub.sorted_vals(label_hist, reverse=True) + >>> index_hist = ub.sorted_vals(index_hist, reverse=True) + >>> index_hist = ub.dict_subset(index_hist, list(index_hist.keys())[0:5]) + >>> print('label_hist = {}'.format(ub.repr2(label_hist, nl=1))) + >>> print('index_hist = {}'.format(ub.repr2(index_hist, nl=1))) + """ + + def __init__(self, index_to_labels, batch_size=1, num_batches='auto', + shuffle=False, rng=None): + import kwarray + + rng = kwarray.ensure_rng(rng, api='python') + label_to_indices = ub.ddict(set) + + flat_groups = [] + for index, item_labels in enumerate(index_to_labels): + flat_groups.extend([index] * len(item_labels)) + for label in item_labels: + label_to_indices[label].add(index) + flat_labels = np.hstack(index_to_labels) + self.label_to_freq = ub.dict_hist(flat_labels) + + # Use tf-idf based scheme to compute sample probabilities + label_to_tfidf = {} + labels = sorted(set(flat_labels)) + for label 
in labels: + index_to_tf = np.zeros(len(index_to_labels)) + for index, item_labels in enumerate(index_to_labels): + index_to_tf[index] = (label == item_labels).sum() + idf = len(index_to_tf) / (index_to_tf > 0).sum() + label_to_tfidf[label] = np.maximum(index_to_tf * idf, 1) + index_to_weight = sum(label_to_tfidf.values()) + index_to_prob = index_to_weight / index_to_weight.sum() + + self.index_to_prob = index_to_prob + self.indices = np.arange(len(index_to_prob)) + + if num_batches == 'auto': + self.num_batches = self._auto_num_batches() + else: + self.num_batches = num_batches + + self.index_to_labels = index_to_labels + self.batch_size = batch_size + self.shuffle = shuffle + self.rng = kwarray.ensure_rng(rng, api='numpy') + + def __nice__(self): + return ub.repr2({ + 'num_batches': self.num_batches, + 'batch_size': self.batch_size, + 'label_to_freq': self.label_to_freq, + }, nl=0) + + def _auto_num_batches(self): + # The right way to calculate num samples would be using a generalized + # solutions to the coupon collector problem, but in practice that + # expected number of samples will be too large for imbalanced datasets. + # Therefore we punt and simply use heuristics. + num_batches = len(self.index_to_prob) + # else: + # raise NotImplementedError(balance) + # def nth_harmonic(n): + # """ + # Example: + # >>> n = 10 + # >>> want = float(sympy.harmonic(n)) + # >>> got = nth_harmonic(n) + # >>> np.isclose(want, got) + # """ + # return np.sum(1 / np.arange(1, n + 1)) + + # def uniform_coupon_ev(n): + # ev = n * nth_harmonic(n) + # return ev + + # def uniform_coupon_ev_to_collect_k(n, k): + # i = np.arange(n) + # prob_new = (n - i + 1) / n + # ev_new = 1 / prob_new + # ev = np.sum(ev_new[0:k]) + # return ev + + # n = 100 + # uniform_coupon_ev_to_collect_k(n, int(0.6 * n)) + # n / np.arange(1, n + 1)[::-1] + # ev_uniform = uniform_coupon_ev(len(self.index_to_prob)) + return num_batches + + def __getitem__(self, index): + # Hack, within each batch we are going to prevent replacement + batch_idxs = self.rng.choice( + self.indices, p=self.index_to_prob, replace=False, + size=self.batch_size) + return batch_idxs + + def __iter__(self): + for index in range(self.num_batches): + yield self[index] + + def __len__(self): + return self.num_batches + + +class RingSampler(object): + """ + Stateful sampling without replacement until all item are exhausted + + Example: + >>> from netharn.data.batch_samplers import RingSampler # NOQA + >>> self = RingSampler(list(range(1, 4))) + >>> sampled_items = self.sample(7) + >>> print('sampled_items = {!r}'.format(sampled_items)) + sampled_items = array([1, 2, 3, 1, 2, 3, 1]) + + >>> self = RingSampler(list(range(1, 4)), rng=0, shuffle=True) + >>> sampled_items = self.sample(7) + >>> print('sampled_items = {!r}'.format(sampled_items)) + sampled_items = array([3, 2, 1, 1, 3, 2, 1]) + """ + def __init__(self, items, shuffle=False, rng=None): + import kwarray + if len(items) == 0: + raise Exception('no items to sample') + self.rng = kwarray.ensure_rng(rng) + self.items = np.array(items) + self.shuffle = shuffle + self.indices = np.arange(len(items)) + self._pos = None + self.refresh() + + def refresh(self): + import kwarray + self._pos = 0 + if self.shuffle: + self.indices = kwarray.shuffle(self.indices, rng=self.rng) + + def sample_indices(self, size=None): + """ + Sample indexes into the items array + """ + n_need = size + if size is None: + n_need = 1 + n_total = len(self.indices) + idx_accum = [] + while n_need > 0: + # Take as many as we need or as many as we 
have + n_avail = (n_total - self._pos) + n_got = min(n_need, n_avail) + n_need -= n_got + + idxs = self.indices[self._pos:self._pos + n_got] + idx_accum.append(idxs.copy()) + + # Update state, if we have exhausted all items, then refresh + self._pos += n_got + if self._pos == n_total: + self.refresh() + + sampled_idxs = np.hstack(idx_accum) + if size is None: + sampled_idxs = sampled_idxs[0] + return sampled_idxs + + def sample(self, size=None): + """ + Sample items from the items array + """ + sampled_idxs = self.sample_indices(size) + sampled_items = self.items[sampled_idxs] + return sampled_items diff --git a/netharn/data/grab_voc.py b/netharn/data/grab_voc.py index 74ba06c5e83b3171af6515def3a0b4b0e70885c4..f88742c469609f30a81be7969cb4e32a88166663 100644 --- a/netharn/data/grab_voc.py +++ b/netharn/data/grab_voc.py @@ -49,22 +49,20 @@ def convert_voc_to_coco(): t = ndsampler.CocoDataset.union(t1, t2) t.tag = 'voc-train' - reroot_imgs(t, root) t.fpath = join(root, t.tag + '.mscoco.json') - print('t.fpath = {!r}'.format(t.fpath)) - t.dump(t.fpath, newlines=True) v = ndsampler.CocoDataset.union(v1, v2) v.tag = 'voc-val' - reroot_imgs(v, root) v.fpath = join(root, v.tag + '.mscoco.json') - print('v.fpath = {!r}'.format(v.fpath)) - v.dump(v.fpath, newlines=True) tv = ndsampler.CocoDataset.union(t1, t2, v1, v2) tv.tag = 'voc-trainval' - reroot_imgs(tv, root) tv.fpath = join(root, tv.tag + '.mscoco.json') + + print('t.fpath = {!r}'.format(t.fpath)) + t.dump(t.fpath, newlines=True) + print('v.fpath = {!r}'.format(v.fpath)) + v.dump(v.fpath, newlines=True) print('tv.fpath = {!r}'.format(tv.fpath)) tv.dump(tv.fpath, newlines=True) if 0: @@ -102,7 +100,7 @@ def _convert_voc_split(devkit_dpath, classes, split, year, root): tree = ET.parse(apath) troot = tree.getroot() - top_level = troot.getchildren() + top_level = list(troot) unknown = {e.tag for e in top_level} - KNOWN assert not unknown @@ -115,7 +113,7 @@ def _convert_voc_split(devkit_dpath, classes, split, year, root): 'segmented': int(tree.find('segmented').text), 'source': { elem.tag: elem.text - for elem in tree.find('source').getchildren() + for elem in list(tree.find('source')) }, } @@ -125,7 +123,7 @@ def _convert_voc_split(devkit_dpath, classes, split, year, root): if owner is not None: img['owner'] = { elem.tag: elem.text - for elem in owner.getchildren() + for elem in list(owner) } gid = dset.add_image(**img) diff --git a/netharn/examples/__main__.py b/netharn/examples/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..d1932ac79b586f3f33dce8911ab668b67b563c60 --- /dev/null +++ b/netharn/examples/__main__.py @@ -0,0 +1,36 @@ + + +def main(): + import xdoctest + import ubelt as ub + import sys + + modpath = ub.modname_to_modpath('netharn.examples') + name_to_path = {} + submods = list(xdoctest.static_analysis.package_modpaths(modpath)) + for submod in submods: + modname = ub.augpath(submod, dpath='', ext='') + if not modname.startswith('_'): + name_to_path[modname] = submod + + print('name_to_path = {}'.format(ub.repr2(name_to_path, nl=1))) + + chosen = None + for arg in sys.argv[1:2]: + print('arg = {!r}'.format(arg)) + if arg in name_to_path: + chosen = name_to_path[arg] + break + print('chosen = {!r}'.format(chosen)) + + assert chosen is not None + module = ub.import_module_from_path(chosen) + print('module = {!r}'.format(module)) + module.main() + +if __name__ == '__main__': + """ + CommandLine: + python -m netharn.examples + """ + main() diff --git a/netharn/examples/cifar.py 
b/netharn/examples/cifar.py
index 1f3e160f7f258937ea7b7dff2c55d03743f93974..c05af2e03c6354608f37edc5820c3fbc26add7fc 100644
--- a/netharn/examples/cifar.py
+++ b/netharn/examples/cifar.py
@@ -35,11 +35,15 @@ validation, and test sets.
 In short, netharn handles the necessary parts and let the developer focus on
 the important parts.

+References:
+    https://github.com/kuangliu/pytorch-cifar
+
+
 CommandLine:
-    python -m netharn.examples.cifar.py --gpu=0 --arch=resnet50
-    python -m netharn.examples.cifar.py --gpu=0 --arch=wrn_22 --lr=0.003 --schedule=onecycle --optim=adamw
-    python -m netharn.examples.cifar.py --gpu=1,2,3 --arch=wrn_22 --lr=0.003 --schedule=onecycle --optim=adamw --batch_size=1800
-    python -m netharn.examples.cifar.py --gpu=1,2 --arch=resnet50 --lr=0.003 --schedule=onecycle --optim=adamw
+    python -m netharn.examples.cifar --xpu=0 --arch=resnet50
+    python -m netharn.examples.cifar --xpu=0 --arch=wrn_22 --lr=0.003 --schedule=onecycle --optim=adamw
+    python -m netharn.examples.cifar --xpu=1,2,3 --arch=wrn_22 --lr=0.003 --schedule=onecycle --optim=adamw --batch_size=1800
+    python -m netharn.examples.cifar --xpu=1,2 --arch=resnet50 --lr=0.003 --schedule=onecycle --optim=adamw
 """
 import sys
@@ -51,6 +55,7 @@ import os
 import pickle
 import netharn as nh
 import scriptconfig as scfg
+# from netharn.util import layer_rotation


 class CIFARConfig(scfg.Config):
@@ -68,11 +73,12 @@ class CIFARConfig(scfg.Config):

         'workdir': scfg.Path('~/work/cifar', help='Dump all results in your workdir'),
         'workers': scfg.Value(2, help='number of parallel dataloading jobs'),
-        'xpu': scfg.Value('argv', help='See netharn.XPU for details. can be cpu/gpu/cuda0/0,1,2,3)'),
+        'xpu': scfg.Value('auto', help='See netharn.XPU for details. can be auto/cpu/xpu/cuda0/0,1,2,3)'),

         'dataset': scfg.Value('cifar10', choices=['cifar10', 'cifar100'], help='which cifar network to use'),
         'num_vali': scfg.Value(0, help='number of validation examples'),
+        'augment': scfg.Value('baseline', help='a comma separated list of augmentations or a named augmentation code'),

         'arch': scfg.Value('resnet50', help='Network architecture code'),
         'optim': scfg.Value('sgd', help='Weight optimizer. Can be SGD, ADAM, ADAMW, etc..'),
@@ -87,7 +93,10 @@ class CIFARConfig(scfg.Config):

         'lr': scfg.Value(1e-1, help='Base learning rate'),
         'decay': scfg.Value(5e-4, help='Base weight decay'),

-        'schedule': scfg.Value('simplestep', help=('Special coercable netharn code. Eg: onecycle50, step50, gamma')),
+        'schedule': scfg.Value('step-150-250', help=('Special coercible netharn code. Eg: onecycle50, step50, gamma')),
+
+        'grad_norm_max': scfg.Value(None, help='clip gradients exceeding this value'),
+        'warmup_iters': scfg.Value(0, help='number of iterations to warm up the learning rate'),

         'init': scfg.Value('noop', help='How to initialized weights. (can be a path to a pretrained model)'),
         'pretrained': scfg.Path(help=('alternative way to specify a path to a pretrained model')),
@@ -235,7 +244,7 @@ class CIFAR_FitHarn(nh.FitHarn):
             'auc', 'ap', 'mcc', 'brier'
         ])

-        # percent error really isn't a great metric, but its standard.
+        # percent error really isn't a great metric, but it's easy and standard.
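+        # A hypothetical example: y_true=[0, 1, 1, 0] and y_pred=[0, 1, 0, 0]
+        # give errors.mean() == 0.25, acc == 0.75, and percent_error == 25.0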
errors = (y_true != y_pred) acc = 1.0 - errors.mean() percent_error = (1.0 - acc) * 100 @@ -248,7 +257,7 @@ class CIFAR_FitHarn(nh.FitHarn): metrics_dict['percent_error'] = percent_error metrics_dict['acc'] = acc - harn.info('ACC FOR {!r}: {!r}'.format(harn.current_tag, acc)) + harn.info(ub.color_text('ACC FOR {!r}: {!r}'.format(harn.current_tag, acc), 'yellow')) # Clear confusion vectors accumulator for the next epoch harn._accum_confusion_vectors = { @@ -290,7 +299,8 @@ class CIFAR_FitHarn(nh.FitHarn): min_, max_ = im_.min(), im_.max() im_ = ((im_ - min_) / (max_ - min_) * 255).astype(np.uint8) im_ = np.ascontiguousarray(im_) - im_ = kwimage.imresize(im_, dsize=(200, 200)) + im_ = kwimage.imresize(im_, dsize=(200, 200), + interpolation='nearest') # Draw classification information on the image im_ = kwimage.draw_clf_on_image(im_, classes=classes, tcx=tcx, @@ -302,41 +312,126 @@ class CIFAR_FitHarn(nh.FitHarn): chunksize=8) return stacked + def before_epochs(harn): + if harn.epoch == 0: + harn._draw_conv_layers(suffix='_init') -def setup_harn(): - """ - Replicates parameters from https://github.com/kuangliu/pytorch-cifar + def after_epochs(harn): + """ + Callback after all train/vali/test epochs are complete. + """ + harn._draw_conv_layers() - The following is a table of kuangliu's reported accuracy and our measured - accuracy for each architecture. + def _draw_conv_layers(harn, suffix=''): + """ + We use this to visualize the first convolutional layer + """ + import kwplot + # Visualize the first convolutional layer + dpath = ub.ensuredir((harn.train_dpath, 'monitor', 'layers')) + # fig = kwplot.figure(fnum=1) + for key, layer in nh.util.trainable_layers(harn.model, names=True): + # Typically the first convolutional layer returned here is the + # first convolutional layer in the network + if isinstance(layer, torch.nn.Conv2d): + if max(layer.kernel_size) > 2: + fig = kwplot.plot_convolutional_features( + layer, fnum=1, normaxis=0) + kwplot.set_figtitle(key, subtitle=str(layer), fig=fig) + layer_dpath = ub.ensuredir((dpath, key)) + fname = 'layer-{}-epoch_{}{}.jpg'.format( + key, harn.epoch, suffix) + fpath = join(layer_dpath, fname) + fig.savefig(fpath) + break + + if isinstance(layer, torch.nn.Linear): + # TODO: visualize the FC layer + pass + + +def build_train_augmentors(augment, input_mean): + from torchvision import transforms - The first column is kuangliu's reported accuracy, the second column is me - running kuangliu's code, and the final column is using my own training - harness (handles logging and whatnot) called netharn. 
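+    # Illustrative parses of the branches below (inputs are hypothetical):
+    #   augment='baseline'       -> augmentors = ['crop', 'flip']
+    #   augment='simple'         -> augmentors = ['crop', 'flip', 'gray', 'cutout']
+    #   augment='crop,flip,gray' -> augmentors = ['crop', 'flip', 'gray']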
+ # Define preprocessing + augmentation strategy + if isinstance(augment, list): + augmentors = augment + elif ',' in augment: + augmentors = augment.split(',') + elif augment == 'baseline': + augmentors = ['crop', 'flip'] + elif augment == 'simple': + augmentors = ['crop', 'flip', 'gray', 'cutout'] + else: + raise KeyError(augment) + + pil_augmentors = [] + tensor_augmentors = [] + + if 'crop' in augmentors: + pil_augmentors += [ + transforms.RandomCrop(32, padding=4), + ] + if 'flip' in augmentors: + pil_augmentors += [ + transforms.RandomHorizontalFlip(), + ] + if 'gray' in augmentors: + pil_augmentors += [ + transforms.RandomGrayscale(p=0.1), + ] + if 'jitter' in augmentors: + raise NotImplementedError + # pil_augmentors += [transforms.RandomChoice([ + # transforms.ColorJitter(brightness=(0, .01), contrast=(0, .01), + # saturation=(0, .01), hue=(-0.01, 0.01),), + # ub.identity, + # ])] + + if 'cutout' in augmentors: + def cutout(tensor): + """ + Ignore: + tensor = torch.rand(3, 32, 32) + """ + # This cutout is closer to the definition in the paper + import kwarray + rng = kwarray.ensure_rng(None) + img_h, img_w = tensor.shape[1:] + p = 0.9 + value = 0 + scale = 0.5 + if rng.rand() < p: + cx = rng.randint(0, img_w) + cy = rng.randint(0, img_h) + + w2 = int((img_w * scale) // 2) + h2 = int((img_h * scale) // 2) + x1 = max(cx - w2, 0) + y1 = max(cy - h2, 0) + x2 = min(cx + w2, img_w) + y2 = min(cy + h2, img_h) + + sl = (slice(None), slice(y1, y2), slice(x1, x2)) + tensor[sl] = value + return tensor + tensor_augmentors += [cutout] + + # tensor_augmentors += [ # Cutout + # transforms.RandomErasing( + # p=0.5, scale=(0.4, 0.4), ratio=(1.0, 1.0), + # value=0, inplace=True), + # ] + print('pil_augmentors = {!r}'.format(pil_augmentors)) + print('tensor_augmentors = {!r}'.format(tensor_augmentors)) + return pil_augmentors, tensor_augmentors - arch | kuangliu | rerun-kuangliu | netharn | - ------------------------------------------------------- - ResNet50 | 93.62% | 95.370% | 95.72% | - DenseNet121 | 95.04% | 95.420% | 94.47% | - DPN92 | 95.16% | 95.410% | 94.92% | - CommandLine: - python -m netharn.examples.cifar --gpu=0 --nice=resnet --arch=resnet50 --optim=sgd --schedule=simplestep --lr=0.1 - python -m netharn.examples.cifar --gpu=0 --nice=wrn --arch=wrn_22 --optim=sgd --schedule=simplestep --lr=0.1 - python -m netharn.examples.cifar --gpu=0 --nice=densenet --arch=densenet121 --optim=sgd --schedule=simplestep --lr=0.1 - python -m netharn.examples.cifar --gpu=0 --nice=efficientnet_scratch --arch=efficientnet-b0 --optim=sgd --schedule=simplestep --lr=0.01 --init=noop --decay=1e-5 - - python -m netharn.examples.cifar --gpu=0 --nice=efficientnet \ - --arch=efficientnet-b0 --optim=rmsprop --lr=0.064 \ - --batch_size=512 --max_epoch=120 --schedule=Exponential-g0.97-s2 - - python -m netharn.examples.cifar --gpu=0 --nice=efficientnet-scratch3 \ - --arch=efficientnet-b0 --optim=adamw --lr=0.016 --init=noop \ - --batch_size=1024 --max_epoch=450 --schedule=Exponential-g0.96-s3 --decay=1e-5 - - python -m netharn.examples.cifar --gpu=0 --nice=efficientnet-pretrained2 \ - --arch=efficientnet-b0 --optim=adamw --lr=0.0064 --init=cls \ - --batch_size=512 --max_epoch=350 --schedule=Exponential-g0.97-s2 --decay=1e-5 +def setup_harn(): + """ + This function creates an instance of the custom FitHarness, which involves + parsing script configuration parameters, creating a custom torch dataset, + and connecting those data and hyperparameters to the FitHarness. 
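+
+    A minimal illustrative invocation (skipped in tests because it downloads
+    CIFAR and constructs the model):
+
+    Example:
+        >>> # xdoctest: +SKIP
+        >>> harn = setup_harn()
+        >>> harn.initialize()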
""" import random import torchvision @@ -360,24 +455,38 @@ def setup_harn(): # TODO: ensure the CPU mode is also deterministic torch.backends.cudnn.deterministic = config['deterministic'] - # Define preprocessing + augmentation strategy - transform_train = transforms.Compose([ - transforms.RandomCrop(32, padding=4), - transforms.RandomHorizontalFlip(), - transforms.Resize(config['input_dims']), - transforms.ToTensor(), - transforms.Normalize((0.4914, 0.4822, 0.4465), - (0.2023, 0.1994, 0.2010)), - transforms.RandomErasing(p=0.5, scale=(0.5, 0.5), # Cutout - value=0), - ]) - - transform_test = transforms.Compose([ - transforms.Resize(config['input_dims']), - transforms.ToTensor(), - transforms.Normalize((0.4914, 0.4822, 0.4465), - (0.2023, 0.1994, 0.2010)), - ]) + # A more general system could infer (and cache) this from the data + input_mean = (0.4914, 0.4822, 0.4465) + input_std = (0.2023, 0.1994, 0.2010) + + def common_transform(pil_img): + import kwimage + hwc255 = np.array(pil_img) + hwc01 = hwc255.astype(np.float32) + hwc01 /= 255.0 + if hwc01.shape[0:2] != tuple(config['input_dims']): + dsize = config['input_dims'][::-1] + hwc01 = kwimage.imresize(hwc01, dsize=dsize, + interpolation='linear') + chw01 = torch.from_numpy(hwc01.transpose(2, 0, 1)).contiguous() + return chw01 + + common_transforms = [ + common_transform, + # transforms.Resize(config['input_dims'], interpolation), + # transforms.ToTensor(), + transforms.Normalize(input_mean, input_std, inplace=True), + ] + + augment = config['augment'] + pil_augmentors, tensor_augmentors = build_train_augmentors( + augment, input_mean) + + transform_train = transforms.Compose( + pil_augmentors + common_transforms + tensor_augmentors + ) + + transform_test = transforms.Compose(common_transforms) if config['dataset'] == 'cifar10': DATASET = torchvision.datasets.CIFAR10 @@ -451,7 +560,7 @@ def setup_harn(): datasets[key] = dset loaders = { - key: torch.utils.data.DataLoader(dset, shuffle=key == 'train', + key: torch.utils.data.DataLoader(dset, shuffle=(key == 'train'), num_workers=config['workers'], batch_size=config['batch_size'], pin_memory=True) @@ -463,6 +572,26 @@ def setup_harn(): import cv2 cv2.setNumThreads(0) + if config['optim'] == 'sgd': + optimizer_ = (torch.optim.SGD, { + 'lr': config['lr'], + 'weight_decay': config['decay'], + 'momentum': 0.9, + 'nesterov': True, + }) + elif config['optim'] == 'adamw': + optimizer_ = (nh.optimizers.AdamW, { + 'lr': config['lr'], + 'betas': (0.9, 0.999), + 'weight_decay': config['decay'], + 'amsgrad': False, + }) + else: + # The netharn API can construct an optimizer from standard keys in a + # configuration dictionary. There is a bit of magic involved. Read docs + # for coerce for more details. + optimizer_ = nh.api.Optimizer.coerce(config) + # Choose which network architecture to train available_architectures = { 'densenet121': (nh.models.densenet.DenseNet, { @@ -504,24 +633,90 @@ def setup_harn(): ) ) + if config['arch'].startswith('se_resnet18'): + from netharn.models import se_resnet + model = se_resnet.se_resnet18( + num_classes=len(classes), + ) + + if config['arch'].startswith('se_resnet50'): + from netharn.models import se_resnet + model = se_resnet.se_resnet50( + num_classes=len(classes), + pretrained=config['init'] == 'cls', + ) + if config['arch'].startswith('efficientnet'): # Directly create the model instance... 
# (as long as it has an `_initkw` attribute) from netharn.models import efficientnet + zero_gamma = False if config['init'] == 'cls': model_ = efficientnet.EfficientNet.from_pretrained( config['arch'], override_params={ 'classes': classes, - } - ) + 'noli': 'mish' + }, advprop=True) print('pretrained cls init') else: model_ = efficientnet.EfficientNet.from_name( config['arch'], override_params={ 'classes': classes, + 'noli': 'mish' } ) + + # For efficient nets we need to dramatically reduce the weight decay on + # the depthwise part of the depthwise separable convolution. To do + # this we need to manually construct the param groups for the + # optimizer. + model = model_ + + params = dict(model.named_parameters()) + key_groups = ub.ddict(list) + + seen_ = set() + def append_once(group, key): + if key not in seen_: + key_groups[group].append(key) + seen_.add(key) + + if zero_gamma: + for key, layer in model.trainable_layers(names=True): + if getattr(layer, '_residual_bn', False): + # zero bn after residual layers. + layer.weight.data.fill_(0) + # dont decay batch norm + # append_once('nodecay', key + '.weight') + + for key in params.keys(): + if key.endswith('.bias'): + append_once('nodecay', key) + elif 'depthwise_conv' in key: + append_once('nodecay', key) + else: + append_once('default', key) + + named_param_groups = {} + for group_name, keys in key_groups.items(): + if keys: + # very important that groups are alway in the same order + keys = sorted(keys) + param_group = { + 'params': list(ub.take(params, keys)), + } + named_param_groups[group_name] = param_group + + # Override the default weight decay of chosen groups + named_param_groups['nodecay']['weight_decay'] = 0 + + param_groups = [v for k, v in sorted(named_param_groups.items())] + + optim_cls, optim_kw = optimizer_ + optim = optim_cls(param_groups, **optim_kw) + optim._initkw = optim_kw + optimizer_ = optim else: model_ = available_architectures[config['arch']] @@ -534,41 +729,9 @@ def setup_harn(): # pretrained initializer. initializer_ = nh.api.Initializer.coerce(config) - if config['schedule'] == 'simplestep': - scheduler_ = (nh.schedulers.ListedLR, { - 'points': { - 0: config['lr'], - 150: config['lr'] * 0.1, - 250: config['lr'] * 0.01, - }, - 'interpolate': False - }) - elif config['schedule'] == 'onecycle': + if config['schedule'] == 'onecycle': # TODO: Fast AI params # TODO: https://github.com/fastai/fastai/blob/c7df6a5948bdaa474f095bf8a36d75dbc1ee8e6a/fastai/callbacks/one_cycle.py - # config['lr'] = 3e-3 - # cyc_len=35 - # max_lr = 3e-3 - # moms = (0.95,0.85) - # div_factor = 25 - # pct_start=0.3, - # wd=0.4 - # pct = np.linspace(0, 1.0, 35) - # cos_up = (np.cos(np.pi * (1 - pct)) + 1) / 2 - # cos_down = cos_up[::-1] - - # pt1 = config['lr'] / 25.0 - # pt2 = config['lr'] - # pt3 = config['lr'] / (1000 * 25.0) - - # phase1 = (pt2 - pt1) * cos_up + pt1 - # phase2 = (pt2 - pt3) * cos_down + pt3 - # points = dict(enumerate(ub.flatten([phase1, phase2]))) - - # scheduler_ = (nh.schedulers.ListedLR, { - # 'points': points, - # 'interpolate': False - # }) scheduler_ = (nh.schedulers.ListedScheduler, { 'points': { 'lr': { @@ -592,26 +755,6 @@ def setup_harn(): # for coerce for more details. 
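        # For example, a config with {'schedule': 'step-150-250', 'lr': 0.1}
        # coerces to a ListedScheduler that divides the LR by 10 at epochs
        # 150 and 250.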
scheduler_ = nh.api.Scheduler.coerce(config) - if config['optim'] == 'sgd': - optimizer_ = (torch.optim.SGD, { - 'lr': config['lr'], - 'weight_decay': config['decay'], - 'momentum': 0.9, - 'nesterov': True, - }) - elif config['optim'] == 'adamw': - optimizer_ = (nh.optimizers.AdamW, { - 'lr': config['lr'], - 'betas': (0.9, 0.999), - 'weight_decay': config['decay'], - 'amsgrad': False, - }) - else: - # The netharn API can construct an optimizer from standard keys in a - # configuration dictionary. There is a bit of magic involved. Read docs - # for coerce for more details. - optimizer_ = nh.api.Optimizer.coerce(config) - # Notice that arguments to hyperparameters are typically specified as a # tuple of (type, Dict), where the dictionary are the keyword arguments # that can be used to instantiate an instance of that class. While @@ -638,6 +781,7 @@ def setup_harn(): 'minimize': ['loss'], 'patience': config['patience'], 'max_epoch': config['max_epoch'], + 'smoothing': 0.0, }), initializer=initializer_, criterion=(torch.nn.CrossEntropyLoss, {}), @@ -648,6 +792,7 @@ def setup_harn(): other={ # Specify anything else that is special about your hyperparams here # Especially if you make a custom_batch_runner + 'augment': config['augment'], }, # These extra arguments are recorded in the train_info.json but do # not contribute to the hyperparameter hash. @@ -663,6 +808,7 @@ def setup_harn(): harn.preferences['keyboard_debug'] = True harn.preferences['eager_dump_tensorboard'] = True harn.preferences['tensorboard_groups'] = ['loss'] + # harn.preferences['tensorboard_groups'] harn.intervals.update({ 'vali': 1, @@ -674,6 +820,7 @@ def setup_harn(): def main(): + # Run your code that sets up your custom FitHarn object. harn = setup_harn() # Initializing a FitHarn object can take a little time, but not too much. @@ -682,6 +829,44 @@ def main(): # pre-existing checkpoint that we can restart from. harn.initialize() + if ub.argflag('--lrtest'): + """ + python -m netharn.examples.cifar --xpu=0 --arch=efficientnet-b0 \ + --nice=test_cifar9 --optim=adamw --schedule=Exponential-g0.98 \ + --lr=0.1 --init=kaiming_normal \ + --batch_size=2048 --lrtest --show + + python -m netharn.examples.cifar --xpu=0 --arch=efficientnet-b7 \ + --nice=test_cifar9 --optim=adamw --schedule=Exponential-g0.98 \ + --lr=0.1 --init=kaiming_normal \ + --batch_size=256 --lrtest --show + + python -m netharn.examples.cifar --xpu=0 --arch=efficientnet-b7 \ + --nice=test_cifar9 --optim=adamw --schedule=Exponential-g0.98 \ + --lr=4e-2 --init=kaiming_normal \ + --batch_size=256 + """ + # Undocumented hidden feature, + # Perform an LR-test, then resetup the harness. Optionally draw the + # results using matplotlib. + from netharn.prefit.lr_tests import lr_range_test + + result = lr_range_test( + harn, init_value=1e-4, final_value=0.5, beta=0.3, + explode_factor=10, num_iters=200) + + if ub.argflag('--show'): + import kwplot + plt = kwplot.autoplt() + result.draw() + plt.show() + + # Recreate a new version of the harness with the recommended LR. + config = harn.script_config.asdict() + config['lr'] = (result.recommended_lr * 10) + harn = setup_harn(**config) + harn.initialize() + # This starts the main loop which will run until the monitor's terminator # criterion is satisfied. If the initialize step loaded a checkpointed that # already met the termination criterion, then this will simply return. 
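
For orientation, the control flow that ``main`` implements (omitting the
hidden ``--lrtest`` branch) boils down to the following sketch; the names are
the ones defined above, but the condensed function itself is hypothetical:

    def main_sketch():
        harn = setup_harn()   # parse config, build datasets and hyperparams
        harn.initialize()     # export topology, restore any checkpoint
        harn.run()            # train until the monitor or timeout stops it
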
@@ -695,24 +880,123 @@ def main(): if __name__ == '__main__': r""" + The baseline script replicates parameters from + https://github.com/kuangliu/pytorch-cifar + + The following is a table of kuangliu's reported accuracy and our measured + accuracy for each architecture. + + The first column is kuangliu's reported accuracy, the second column is me + running kuangliu's code, and the final column is using my own training + harness (handles logging and whatnot) called netharn. + + The first three experiments are with simple augmentation. The rest have + more complex augmentation. + + arch | kuangliu | rerun-kuangliu | netharn | train rate | num params + --------------------------------------------------------------------------------------- + ResNet50 | 93.62% | 95.370% | 95.72% | | + DenseNet121 | 95.04% | 95.420% | 94.47% | | + DPN92 | 95.16% | 95.410% | 94.92% | | + -------------------- + ResNet50_newaug* | -- | -- | 96.13% | 498.90 Hz | 23,520,842 + EfficientNet-7* | -- | -- | 85.36% | 214.18 Hz | 63,812,570 + EfficientNet-3* | -- | -- | 86.87% | 568.30 Hz | 10,711,602 + EfficientNet-0* | -- | -- | 87.13% | 964.21 Hz | 4,020,358 + + EfficientNet-0-b64-224 | -- | -- | 25ish% | 148.15 Hz | 4,020,358 + efficientnet0_transfer_b64_sz224_v2 || | 98.04% | + + + 600025177002, + + + CommandLine: + python -m netharn.examples.cifar --xpu=0 --nice=resnet50_baseline --arch=resnet50 --optim=sgd --schedule=step-150-250 --lr=0.1 + python -m netharn.examples.cifar --xpu=0 --nice=wrn --arch=wrn_22 --optim=sgd --schedule=step-150-250 --lr=0.1 + python -m netharn.examples.cifar --xpu=0 --nice=densenet --arch=densenet121 --optim=sgd --schedule=step-150-250 --lr=0.1 + + python -m netharn.examples.cifar --xpu=0 --nice=se_resnet18 --arch=se_resnet18 --optim=sgd --schedule=step-150-250 --lr=0.01 --init=noop --decay=1e-5 --augment=simple + + python -m netharn.examples.cifar --xpu=0 --nice=resnet50_newaug_b128 --batch_size=128 --arch=resnet50 --optim=sgd --schedule=step-150-250 --lr=0.1 --init=kaiming_normal --augment=simple + + python -m netharn.examples.cifar --xpu=0 --nice=efficientnet7_newaug_b128 --batch_size=128 --arch=efficientnet-b7 --optim=sgd --schedule=step-150-250 --lr=0.1 --init=kaiming_normal --augment=simple + + python -m netharn.examples.cifar --xpu=0 --nice=efficientnet3_newaug_b128 --batch_size=128 --arch=efficientnet-b3 --optim=sgd --schedule=step-150-250 --lr=0.1 --init=kaiming_normal --augment=simple + + python -m netharn.examples.cifar --xpu=0 --nice=efficientnet0_newaug_b128 --batch_size=128 --arch=efficientnet-b0 --optim=sgd --schedule=step-150-250 --lr=0.1 --init=kaiming_normal --augment=simple + + + python -m netharn.examples.cifar --xpu=0 --nice=efficientnet0_transfer_b128_sz32 --batch_size=128 --arch=efficientnet-b0 --optim=sgd --schedule=step-150-250 --lr=0.01 --decay=5e-4 --init=cls --augment="crop,flip,gray,cutout" --input_dims=32,32 + + python -m netharn.examples.cifar --xpu=0 --nice=efficientnet0_transfer_b64_sz224 --batch_size=64 --arch=efficientnet-b0 --optim=sgd --schedule=step-150-250 --lr=0.01 --decay=5e-4 --init=cls --augment="crop,flip,gray,cutout" --input_dims=224,224 + + python -m netharn.examples.cifar --xpu=0 --nice=efficientnet0_newaug_b64_sz224 --batch_size=64 --arch=efficientnet-b0 --optim=sgd --schedule=step-150-250 --lr=0.1 --init=kaiming_normal --augment=simple --input_dims=224,224 + + python -m netharn.examples.cifar --xpu=0 --nice=efficientnet0_transfer_b128_sz32_v2 --batch_size=128 --arch=efficientnet-b0 --optim=sgd --schedule=step-20-45-70-90-f5 --max_epoch=100 
--lr=0.01 --decay=5e-4 --init=cls --augment="crop,flip,gray,cutout" --input_dims=32,32 # 88% + + python -m netharn.examples.cifar --xpu=0 --nice=efficientnet0_transfer_b128_sz32_v3 --batch_size=128 --arch=efficientnet-b0 --optim=sgd --schedule=step-13-20-45-70-90-f5 --max_epoch=100 --lr=0.01 --decay=5e-4 --init=cls --augment="crop,flip,gray,cutout" --input_dims=32,32 + + python -m netharn.examples.cifar --xpu=0 --nice=efficientnet0_transfer_b128_sz32_v4 --batch_size=128 --arch=efficientnet-b0 --optim=sgd --schedule=step-10-20-45-70-90-f5 --max_epoch=100 --lr=0.03 --decay=5e-4 --init=cls --augment="crop,flip,gray,cutout" --input_dims=32,32 + + python -m netharn.examples.cifar --xpu=0 --nice=efficientnet0_transfer_b64_sz224_v2 --batch_size=64 --arch=efficientnet-b0 --optim=sgd --schedule=step-10-20 --max_epoch=100 --lr=0.01 --decay=5e-4 --init=cls --augment="crop,flip,gray,cutout" --input_dims=224,224 + + + python -m netharn.examples.cifar --xpu=0 --nice=efficientnet0_newaug_yogi_b1024 \ + --batch_size=1028 --arch=efficientnet-b0 --optim=Yogi \ + --schedule=step-60-120-160-250-350-f5 --decay=5e-4 --lr=0.01549 \ + --init=kaiming_normal --augment=simple --grad_norm_max=35 \ + --warmup_iters=100 + + python -m netharn.examples.cifar --xpu=0 --nice=efficientnet1_newaug_diffgrad_b1024 \ + --batch_size=1028 --arch=efficientnet-b1 --optim=DiffGrad \ + --schedule=step-60-120-160-250-350-f5 --decay=5e-4 --lr=0.01 \ + --init=kaiming_normal --augment=simple --grad_norm_max=35 \ + --warmup_iters=100 + + + # Params from Cutout paper: https://arxiv.org/pdf/1708.04552.pdf + python -m netharn.examples.cifar --xpu=0 --nice=repro_cutout \ + --batch_size=128 \ + --arch=efficientnet-b0 \ + --optim=sgd --lr=0.01 --decay=5e-4 \ + --schedule=step-60-120-160-f5 --max_epoch=200 \ + --init=kaiming_normal --augment=simple \ + --grad_norm_max=35 --warmup_iters=100 + + python -m netharn.examples.cifar --xpu=0 --nice=repro_cutoutDiffGrad \ + --batch_size=128 \ + --arch=efficientnet-b1 \ + --optim=DiffGrad --lr=0.01 --decay=5e-4 \ + --schedule=step-60-120-160-f5 --max_epoch=200 \ + --init=kaiming_normal --augment=simple \ + --grad_norm_max=35 --warmup_iters=100 + + 0.015219216761025578 + + + python -m netharn.examples.cifar --xpu=0 --nice=efficientnet7_scratch \ + --arch=efficientnet-b7 --optim=sgd --schedule=step-150-250-350 \ + --batch_size=512 --lr=0.01 --init=noop --decay=1e-5 + CommandLine: - python -m netharn.examples.cifar --gpu=0 --arch=resnet50 --num_vali=0 - python -m netharn.examples.cifar --gpu=0 --arch=efficientnet-b0 --num_vali=0 + python -m netharn.examples.cifar --xpu=0 --arch=resnet50 --num_vali=0 + python -m netharn.examples.cifar --xpu=0 --arch=efficientnet-b0 --num_vali=0 - python -m netharn.examples.cifar --gpu=0 --arch=efficientnet-b0 + python -m netharn.examples.cifar --xpu=0 --arch=efficientnet-b0 # This next command requires a bit more compute - python -m netharn.examples.cifar --gpu=0 --arch=efficientnet-b0 --nice=test_cifar2 --schedule=step-3-6-50 --lr=0.1 --init=cls --batch_size=2718 + python -m netharn.examples.cifar --xpu=0 --arch=efficientnet-b0 --nice=test_cifar2 --schedule=step-3-6-50 --lr=0.1 --init=cls --batch_size=2718 - python -m netharn.examples.cifar --gpu=0 --arch=efficientnet-b0 --nice=test_cifar3 --schedule=step-3-6-12-16 --lr=0.256 --init=cls --batch_size=3000 --workers=2 --num_vali=0 --optim=rmsprop + python -m netharn.examples.cifar --xpu=0 --arch=efficientnet-b0 --nice=test_cifar3 --schedule=step-3-6-12-16 --lr=0.256 --init=cls --batch_size=3000 --workers=2 --num_vali=0 
--optim=rmsprop - python -m netharn.examples.cifar --gpu=0 --arch=efficientnet-b0 --nice=test_cifar3 --schedule=onecycle70 --lr=0.01 --init=cls --batch_size=3000 --workers=2 --num_vali=0 --optim=sgd --datasets=cifar100 + python -m netharn.examples.cifar --xpu=0 --arch=efficientnet-b0 --nice=test_cifar3 --schedule=onecycle70 --lr=0.01 --init=cls --batch_size=3000 --workers=2 --num_vali=0 --optim=sgd --datasets=cifar100 - python -m netharn.examples.cifar --gpu=0 --arch=efficientnet-b0 --nice=test_cifar2 --schedule=ReduceLROnPlateau-p1-c1-f0.9 --lr=0.1 --init=cls --batch_size=2719 --workers=4 --optim=sgd --datasets=cifar100 + python -m netharn.examples.cifar --xpu=0 --arch=efficientnet-b0 --nice=test_cifar2 --schedule=ReduceLROnPlateau-p1-c1-f0.9 --lr=0.1 --init=cls --batch_size=2719 --workers=4 --optim=sgd --datasets=cifar100 - python -m netharn.examples.cifar.py --gpu=0 --arch=densenet121 + python -m netharn.examples.cifar.py --xpu=0 --arch=densenet121 # Train on two GPUs with a larger batch size - python -m netharn.examples.cifar.py --arch=dpn92 --batch_size=256 --gpu=0,1 + python -m netharn.examples.cifar.py --arch=dpn92 --batch_size=256 --xpu=0,1 """ import seaborn seaborn.set() diff --git a/netharn/examples/mnist.py b/netharn/examples/mnist.py index 391fe9180187a1d5686655e7ad98955ec1524ad4..0821076d3d9c7c32baab302c51c0f10b6bfcde6c 100644 --- a/netharn/examples/mnist.py +++ b/netharn/examples/mnist.py @@ -311,7 +311,7 @@ def main(): # pre-existing checkpoint that we can restart from. harn.initialize(reset=reset) - if ub.argval(('--vd', '--view-directory')): + if ub.argflag(('--vd', '--view-directory')): ub.startfile(harn.train_dpath) # This starts the main loop which will run until a the monitor's terminator @@ -327,7 +327,7 @@ def main(): if __name__ == '__main__': r""" CommandLine: - python examples/mnist.py + python -m netharn.examples.mnist tensorboard --logdir ~/data/work/mnist/fit/nice """ diff --git a/netharn/examples/object_detection.py b/netharn/examples/object_detection.py index f056fe8720e863e924bc225c270ee510c02cd236..b8d78b2f8bafa9e21320714ff3591e09a64065e0 100644 --- a/netharn/examples/object_detection.py +++ b/netharn/examples/object_detection.py @@ -881,6 +881,12 @@ if __name__ == '__main__': python -m netharn.examples.object_detection --datasets=special:voc + python -m netharn.examples.object_detection \ + --datasets=special:shapes1024 \ + --arch=yolo2 --optim=sgd \ + --input_dims=512,512 --lr=1e-3 \ + --workers=4 --xpu=auto --batch_size=4 --bstep=4 + python -m netharn.examples.object_detection \ --nice=voc-detection-demo \ --train_dataset=~/data/VOC/voc-trainval.mscoco.json \ diff --git a/netharn/examples/segmentation.py b/netharn/examples/segmentation.py index 603560858570d2012531ade897c8e32cc6c795c1..0f297871ccf19389b4e87fef2bd44307c7486053 100644 --- a/netharn/examples/segmentation.py +++ b/netharn/examples/segmentation.py @@ -323,6 +323,7 @@ class SegmentationHarn(nh.FitHarn): How to compute a forward pass through the network and compute loss Example: + >>> # xdoctest: +REQUIRES(--slow) >>> kw = {'workers': 0, 'xpu': 'cpu', 'batch_size': 2} >>> harn = setup_harn(cmdline=False, **kw).initialize() >>> batch = harn._demo_batch(tag='train') @@ -385,11 +386,13 @@ class SegmentationHarn(nh.FitHarn): def _draw_batch_preds(harn, batch, outputs, lim=16): """ Example: + >>> # xdoctest: +REQUIRES(--slow) >>> kw = {'workers': 0, 'xpu': 'cpu', 'batch_size': 8} >>> harn = setup_harn(cmdline=False, **kw).initialize() >>> batch = harn._demo_batch(tag='train') >>> outputs, 
loss_parts = harn.run_batch(batch) >>> toshow = harn._draw_batch_preds(batch, outputs) + >>> # xdoctest: +REQUIRES(--show) >>> import kwplot >>> kwplot.autompl() >>> kwplot.imshow(toshow) @@ -546,6 +549,7 @@ def _cached_class_frequency(dset, workers=0): def _precompute_class_weights(dset, workers=0, mode='median-idf'): """ Example: + >>> # xdoctest: +REQUIRES(--slow) >>> harn = setup_harn(0, workers=0, xpu='cpu').initialize() >>> dset = harn.datasets['train'] """ @@ -604,6 +608,7 @@ def setup_harn(cmdline=True, **kw): xdoctest -m netharn.examples.segmentation setup_harn Example: + >>> # xdoctest: +REQUIRES(--slow) >>> kw = {'workers': 0, 'xpu': 'cpu', 'batch_size': 2} >>> cmdline = False >>> # Just sets up the harness, does not do any heavy lifting @@ -612,7 +617,7 @@ def setup_harn(cmdline=True, **kw): >>> harn.initialize() >>> # >>> batch = harn._demo_batch(tag='train') - >>> epoch_metrics = harn._demo_epoch(tag='vali', max_iter=4) + >>> epoch_metrics = harn._demo_epoch(tag='vali', max_iter=2) """ import sys import ndsampler diff --git a/netharn/export/deployer.py b/netharn/export/deployer.py index abb634ea262dade81f8bd067079147ef0c53f9d7..65d20a8d9e0e0a9a5f3c52218c207ed5fbfeeab2 100644 --- a/netharn/export/deployer.py +++ b/netharn/export/deployer.py @@ -46,6 +46,7 @@ Example: >>> }) >>> harn = nh.FitHarn(hyper) >>> harn.preferences['use_tensorboard'] = False + >>> harn.preferences['timeout'] = 1 >>> harn.intervals['test'] = 1 >>> harn.initialize(reset='delete') >>> harn.run() diff --git a/netharn/fit_harn.py b/netharn/fit_harn.py index 817578030710640b2a3b3b25a16cfac02e5e6cb6..dfaa6c920dd38b1f85af6827c73d355d51b4fbfa 100644 --- a/netharn/fit_harn.py +++ b/netharn/fit_harn.py @@ -47,10 +47,10 @@ Example: >>> # ================ >>> # Environment Components >>> 'workdir' : ub.ensure_app_cache_dir('netharn/tests/demo'), - >>> 'nice' : 'demo', + >>> 'name' : 'demo', >>> 'xpu' : nh.XPU.coerce('argv'), >>> # workdir is a directory where intermediate results can be saved - >>> # nice symlinks /fit/nice/ -> ../runs/ + >>> # nice symlinks /fit/nice/ -> ../runs/ >>> # XPU auto select a gpu if idle and VRAM>6GB else a cpu >>> # ================ >>> # Data Components @@ -87,6 +87,7 @@ Example: >>> harn = nh.FitHarn(hyper) >>> # non-algorithmic behavior configs (do not change learned models) >>> harn.preferences['use_tensorboard'] = False + >>> harn.preferences['timeout'] = 0.5 >>> # start training. >>> harn.initialize(reset='delete') >>> harn.run() # note: run calls initialize it hasn't already been called. 
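
The ``timeout`` preference exercised above is wall-clock based (seconds); a
minimal sketch of typical usage, with an illustrative one-hour budget (netharn
checks the timer between epochs and between batches and stops via
StopTraining):

    harn = nh.FitHarn(hyper)             # hyper constructed as in the example
    harn.preferences['timeout'] = 3600   # give training about an hour
    harn.run()
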
@@ -147,23 +148,22 @@ import torch import numpy as np import ubelt as ub -from netharn import hyperparams -from netharn.exceptions import (StopTraining, CannotResume, TrainingDiverged, - SkipBatch) +import scriptconfig as scfg +from netharn import hyperparams from netharn import util +from netharn import export from netharn.util import profiler from netharn.util import strip_ansi - -from netharn import export - +from netharn.exceptions import (CannotResume, SkipBatch, StopTraining, + TrainingDiverged) try: import tensorboard_logger except ImportError: tensorboard_logger = None -__all__ = ['FitHarn'] +__all__ = ['FitHarn', 'FitHarnPreferences'] # Debugging flag to run your harness in "demo mode" which only runs DEMO=5 @@ -1310,10 +1310,10 @@ class CoreMixin(object): if harn.preferences['prog_backend'] == 'progiter': harn.info(ub.color_text('=== {} training {!r} / {!r} : {} ==='.format( action, harn.epoch + 1, harn.monitor.max_epoch, - harn.hyper.nice), 'white')) + harn.hyper.name), 'white')) else: harn.info(ub.color_text('=== {} training : {} ==='.format( - action, harn.hyper.nice), 'white')) + action, harn.hyper.name), 'white')) harn.main_prog = harn._make_prog(desc='epoch', total=harn.monitor.max_epoch, @@ -1354,10 +1354,19 @@ class CoreMixin(object): ### THIS IS THE MAIN LOOP ### ############################# - for harn.epoch in it.count(harn.epoch): - harn._run_tagged_epochs(train_loader, vali_loader, test_loader) - if DEMO and harn.epoch > DEMO: - break + with ub.Timer() as _timer: + harn._timer = _timer + for harn.epoch in it.count(harn.epoch): + harn._run_tagged_epochs( + train_loader, + vali_loader, + test_loader + ) + if DEMO and harn.epoch > DEMO: + raise StopTraining + elif _timer.toc() > harn.preferences['timeout']: + harn.info('timeout') + raise StopTraining ############################## ### THAT WAS THE MAIN LOOP ### @@ -1369,6 +1378,14 @@ class CoreMixin(object): if not harn.preferences['keyboard_debug']: harn.warn('\n\n\n') harn.info('harn.train_dpath = {!r}'.format(harn.train_dpath)) + + if harn.preferences['snapshot_after_error']: + harn.info('Attempting to checkpoint before crashing') + harn.save_snapshot(explicit=True) + + if harn.preferences['deploy_after_error']: + harn.info('Attempting to deploy before crashing') + harn._deploy() raise from six.moves import input harn.warn('\n\n\n') @@ -1402,6 +1419,11 @@ class CoreMixin(object): elif ans == 'c': harn.save_snapshot(explicit=True) elif ans == 'r': + # This might have issues because the referenes in this + # function are still held. Likely the better way to + # implement this is by handling the error gracefully and + # looping within this function. Might require a + # restructure. 
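+                # (one hypothetical restructure: catch a dedicated
+                # RestartTraining exception at the top of run() and loop
+                # there, instead of recursing)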
return harn.run() elif ans == 'e': import xdev @@ -1411,11 +1433,17 @@ class CoreMixin(object): raise except Exception as ex: harn.error('\n\n\n') + harn.info('general exception') + print('harn.preferences = {!r}'.format(harn.preferences)) if harn.preferences['snapshot_after_error']: harn.info('Attempting to checkpoint before crashing') harn.save_snapshot(explicit=True) + if harn.preferences['deploy_after_error']: + harn.info('Attempting to deploy before crashing') + harn._deploy() + harn.info('harn.train_dpath = {!r}'.format(harn.train_dpath)) harn.error('an {} error occurred in the train loop: {}'.format( type(ex), repr(ex))) @@ -1454,10 +1482,9 @@ class CoreMixin(object): model_class = harn.hyper.model_cls model_params = harn.hyper.model_params export_modules = harn.preferences['export_modules'] - static_modpath = export.export_model_code(harn.train_dpath, - model_class, - initkw=model_params, - export_modules=export_modules) + static_modpath = export.export_model_code( + harn.train_dpath, model_class, initkw=model_params, + export_modules=export_modules) harn.info('Exported model topology to {}'.format(static_modpath)) except Exception as ex: harn.warn('Failed to export model topology: {}'.format(repr(ex))) @@ -1482,7 +1509,8 @@ class CoreMixin(object): try: deploy_fpath = export.DeployedModel(harn.train_dpath).package() - harn.info('wrote single-file deployment to: {!r}'.format(deploy_fpath)) + harn.info('wrote single-file deployment to: {!r}'.format( + deploy_fpath)) if True: # symlink the deployed model to a static filename to make it @@ -1610,7 +1638,7 @@ class CoreMixin(object): if harn.preferences['prog_backend'] == 'progiter': harn.info(ub.color_text( '=== finish epoch {!r} / {!r} : {} ==='.format( - harn.epoch + 1, harn.monitor.max_epoch, harn.hyper.nice), + harn.epoch + 1, harn.monitor.max_epoch, harn.hyper.name), 'white')) harn._update_main_prog_desc() @@ -1687,6 +1715,8 @@ class CoreMixin(object): display_interval = harn.intervals['display_' + tag] is_profiling = profiler.IS_PROFILING use_tqdm = harn.preferences['prog_backend'] == 'tqdm' + timeout = harn.preferences['timeout'] + _timer = harn._timer if isinstance(prog, ub.ProgIter): prog.begin() @@ -1717,6 +1747,9 @@ class CoreMixin(object): for bx in range(n_batches): if DEMO and bx > DEMO_BX: break + if _timer is not None and _timer.toc() > timeout: + harn.info('timeout') + raise StopTraining try: raw_batch = next(batch_iter) @@ -1747,16 +1780,18 @@ class CoreMixin(object): if np.isfinite(float(v)): loss_parts_[k] = v else: - harn.warn('Ignoring infinite loss component. Setting to large value') + harn.warn( + 'Ignoring infinite loss component. ' + 'Setting to large value') if not loss_parts_: - raise SkipBatch('all loss components were infinite') + raise SkipBatch( + 'all loss components were infinite') loss = sum(loss_parts_.values()) else: loss = sum(loss_parts.values()) - # Backpropogate to accumulate gradients and step the optimizer if learn: harn.backpropogate(bx, batch, loss) @@ -1791,7 +1826,7 @@ class CoreMixin(object): from netharn.mixins import _dump_monitor_tensorboard _dump_monitor_tensorboard( harn, 'iter', - harn.preferences['tensorboard_groups']) + special_groupers=harn.preferences['tensorboard_groups']) prog.update(display_interval) if use_tqdm: @@ -1862,7 +1897,7 @@ class CoreMixin(object): loss information added in this function. 
""" loss_value = float(loss.data.cpu().item()) - loss_value = harn._check_loss(loss_value) + harn._check_loss(loss_value) metrics_dict = ub.odict() metrics_dict['loss'] = loss_value @@ -1922,7 +1957,6 @@ class ChecksMixin(object): if loss_value > harn.preferences['large_loss']: # if the loss is getting large, check if the weights are ok harn._check_divergence() - return loss_value @profiler.profile def _check_divergence(harn): @@ -1956,6 +1990,20 @@ class ChecksMixin(object): raise TrainingDiverged( 'NON-FINITE WEIGHTS weights.sum() = {!r}'.format(weight_sum)) + def _check_layer_rotation(harn): + """ + References: + "Layer rotation: a surprisingly powerful indicator of generalization in deep networks?" - + https://arxiv.org/pdf/1806.01603.pdf + + TODO: + - [ ] Requires storing network initialization state in memory. + - [ ] Per layer rotation - cosine distance + - [ ] Technique to combine into single number? Average? Rotation of flattened network? + """ + + pass + @register_mixin class CoreCallbacks(object): @@ -2115,6 +2163,9 @@ class CoreCallbacks(object): def backpropogate(harn, bx, batch, loss): """Custom callback which can overwrite the default backward pass + Backpropogate accumulates gradients, optionally checks and logs the + gradients, steps the optimizer, and zeros the gradients. + Overload is generally not necessary for this function. TODO: @@ -2135,17 +2186,39 @@ class CoreCallbacks(object): bstep = harn.dynamics['batch_step'] if (bx + 1) % bstep == 0: + tag = harn.current_tag + iter_idx = harn.iter_index + if harn.dynamics['grad_norm_max']: total_norm = torch.nn.utils.clip_grad_norm_( harn.model.parameters(), max_norm=harn.dynamics['grad_norm_max'], norm_type=harn.dynamics['grad_norm_type'], ) + if harn.preferences['log_gradients']: + if harn.check_interval('log_iter_' + tag, iter_idx, first=True): + harn.log_value(tag + ' iter clipped total norm', total_norm, iter_idx) + if total_norm > harn.dynamics['grad_norm_max'] * 100: harn.warn('grad norm is too high: ' 'total_norm = {!r}'.format(total_norm)) - # if False: - # harn._check_gradients(batch, loss) + elif harn.preferences['log_gradients']: + if harn.check_interval('log_iter_' + tag, iter_idx, first=True): + total_norm = torch.nn.utils.clip_grad_norm_( + harn.model.parameters(), + max_norm=float('inf'), + norm_type=harn.dynamics['grad_norm_type'], + ) + harn.log_value(tag + ' iter total norm', total_norm, iter_idx) + + if harn.preferences['log_gradients']: + all_grads = harn._check_gradients() + + if True: + layer_mag = {k: v.norm().data.cpu().numpy().tolist() for k, v in all_grads.items()} + mag_arr = np.array(list(layer_mag.values())) + harn.log_histogram(tag + ' iter layer norm', mag_arr, iter_idx) + # harn.debug("STEP") harn.optimizer.step() harn.optimizer.zero_grad() @@ -2217,14 +2290,14 @@ class PropertyMixin(object): @property def batch_index(harn): """ The index of the current batch in the current epoch """ - return harn.bxs[harn.current_tag] + return harn.bxs.get(harn.current_tag, 0) @property def iter_index(harn): """ Returns the current iteration index of the current tag """ iter_idx = ( - harn._prev_iter_idxs[harn.current_tag] + - harn.bxs[harn.current_tag] + harn._prev_iter_idxs.get(harn.current_tag, 0) + + harn.bxs.get(harn.current_tag, 0) ) return iter_idx @@ -2307,19 +2380,21 @@ class FitHarn(ExtraMixins, InitializeMixin, ProgMixin, LogMixin, SnapshotMixin, starting from scratch or Pretrained if doing transfer learning) optimizer (torch.optim.optimizer.Optimizer) : - Optimization algorithm like SGD or ADAM. 
SeeAlso: `netharn.optimizers` + Optimization algorithm like SGD or ADAM. SeeAlso: + `netharn.optimizers` scheduler (torch.optim.lr_scheduler._LRScheduler) : - Learning rate scheduler. SeeAlso: `netharn.schedulers` for a schedulers - that are not currently implemented in torch. Note that the - newstyle-netharn schedulers can control momentum as well as lr. + Learning rate scheduler. SeeAlso: `netharn.schedulers` for a + schedulers that are not currently implemented in torch. Note that + the newstyle-netharn schedulers can control momentum as well as lr. criterion (torch.nn.modules.loss._Loss | None) : Objective function / loss criterion. SeeAlso: `netharn.criterions`. This is not strictly necessary if the loss is defined inline. monitor (netharn.Monitor) : - monitors performance of the validation set. SeeAlso `netharn.monitor`. + monitors performance of the validation set. SeeAlso + `netharn.monitor`. Note: @@ -2336,8 +2411,8 @@ class FitHarn(ExtraMixins, InitializeMixin, ProgMixin, LogMixin, SnapshotMixin, if DEMO: # Hack to prefix the nice name in DEMO mode - if harn.hyper.nice is not None: - harn.hyper.nice = 'DEMO_' + harn.hyper.nice + if harn.hyper.name is not None: + harn.hyper.name = 'DEMO_' + harn.hyper.name else: raise AssertionError('should have a nice name in demo mode') @@ -2416,47 +2491,8 @@ class FitHarn(ExtraMixins, InitializeMixin, ProgMixin, LogMixin, SnapshotMixin, 'cleanup': 10, } - # TODO: it might be interesting for preferences to have two defaults, a - # minimal default and a recommended default. The safe default is - # statically defined to the minimum requirements, and recommended could - # be manually or hueristically constructed. - - harn.preferences = { - 'keyboard_debug': False, - - 'snapshot_after_error': True, # Try to checkpoint before crashing - - 'show_prog': True, - 'use_tqdm': None, - 'prog_backend': 'progiter', # can be 'progiter' or 'tqdm' or 'auto' - - # If your loss criterion returns a dictionary of parts, ignore any - # infinite values before summing the total loss. - 'ignore_inf_loss_parts': False, - - 'use_tensorboard': True, - - # If True, logs tensorboard within inner iteration (experimental) - 'eager_dump_tensorboard': False, - 'tensorboard_groups': ['loss'], # patterns to be grouped in tensorboard - - # Set this to a list of modules that the final standalone deployed - # zipfile should not depend on. The exporter will expand any code - # from these modules that are referenced by the model class. - 'export_modules': [], - - # Export the model topology by default when you initialize a harness - 'export_on_init': True, - - # A loss that would be considered large - # (This tells netharn when to check for divergence) - 'large_loss': 1000, - - # number of recent / best snapshots to keep - 'num_keep': 2, - # Ensure we always keep a snapshot every `freq` epochs - 'keep_freq': 20, - } + # This is only used as a dictionary. 
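        # (Editor's note, illustrative) FitHarnPreferences is a
        # scriptconfig.Config, so the dict-style access used throughout
        # FitHarn keeps working while each key now carries its own help
        # text, e.g.:
        #     harn.preferences['timeout'] = 8 * 60 * 60   # dict-style set
        #     backend = harn.preferences['prog_backend']  # dict-style get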
+ harn.preferences = FitHarnPreferences(cmdline=False) # This variable should be used to store your custom script # configuration @@ -2469,6 +2505,8 @@ class FitHarn(ExtraMixins, InitializeMixin, ProgMixin, LogMixin, SnapshotMixin, harn._log = None harn._tlog = None + harn._timer = None + @property def config(harn): import warnings @@ -2490,7 +2528,7 @@ class FitHarn(ExtraMixins, InitializeMixin, ProgMixin, LogMixin, SnapshotMixin, bool: if it is time to do something or not """ n = harn.intervals[tag] - if n is None: + if n is None or n == 0: return False elif isinstance(n, int): # Intervals can be numbers corresponding to strides @@ -2508,6 +2546,95 @@ class FitHarn(ExtraMixins, InitializeMixin, ProgMixin, LogMixin, SnapshotMixin, return (idx + start + 1) % step == 0 +class FitHarnPreferences(scfg.Config): + """ + Using scriptconfig to declare defaults for netharn's preferences and + options. This makes it easy to extend via the commandline. + + Example: + >>> from netharn.fit_harn import * # NOQA + >>> config = FitHarnPreferences() + >>> config.argparse().print_help() + """ + # TODO: it might be interesting for preferences to have two defaults, a + # minimal default and a recommended default. The safe default is + # statically defined to the minimum requirements, and recommended could + # be manually or hueristically constructed. + default = { + 'keyboard_debug': scfg.Value(True, help=( + 'Catch keyboard interupt with a somewhat-interactive prompt') + ), + + 'snapshot_after_error': scfg.Value(True, help=( + 'Try to checkpoint before crashing') + ), + + 'deploy_after_error': scfg.Value(True, help=( + 'Try to deploy before crashing') + ), + + 'show_prog': scfg.Value(True, help=( + 'displays progress') + ), + 'prog_backend': scfg.Value( + 'progiter', choices=['progiter', 'tqdm', 'auto'], help=( + 'which progress library to use') + ), + + 'ignore_inf_loss_parts': scfg.Value(False, help=( + 'If your loss criterion returns a dictionary of parts,' + ' ignore any infinite values before summing the total loss.') + ), + + 'log_gradients': scfg.Value(False, help=( + 'compute and log stats about gradients') + ), + + 'use_tensorboard': scfg.Value(True, help=( + 'enable logging to tensorboard if available') + ), + + 'eager_dump_tensorboard': scfg.Value(True, help=( + 'If True, logs tensorboard within inner iteration' + ' (experimental)') + ), + + 'tensorboard_groups': scfg.Value(['loss'], help=( + 'patterns to be grouped in tensorboard') + ), + + 'export_modules': scfg.Value([], help=( + 'Set this to a list of modules that the final standalone deployed' + ' zipfile should not depend on. 
The exporter will expand any code' + ' from these modules that are referenced by the model class.') + ), + + 'export_on_init': scfg.Value(True, help=( + 'Export the model topology by default' + ' when you initialize a harness') + ), + + 'large_loss': scfg.Value(1000, help=( + 'A loss that would be considered large ' + '(This tells netharn when to check for divergence)') + ), + + 'num_keep': scfg.Value(2, help=( + 'number of recent / best snapshots to keep') + ), + 'keep_freq': scfg.Value(20, help=( + 'Ensure we always keep a snapshot every `freq` epochs') + ), + + 'timeout': scfg.Value(float('inf'), help=( + 'limits the amount of time training can take') + ), + + # Deprecated + 'use_tqdm': scfg.Value(None, help='deprecated'), + } + + if __name__ == '__main__': """ CommandLine: diff --git a/netharn/hyperparams.py b/netharn/hyperparams.py index c6533848b2b5db4498fd35b6bc97bdc169d3a176..b723b724a778e6501c4b5610b2a3f8ef4a9cb565 100644 --- a/netharn/hyperparams.py +++ b/netharn/hyperparams.py @@ -62,6 +62,12 @@ from collections import OrderedDict # from netharn import criterions from torch.optim.optimizer import required import torch.utils.data as torch_data +from netharn.util import util_json +from netharn.util import util_inspect + + +# backwards compatibility +_ensure_json_serializable = util_json.ensure_json_serializable # NOQA try: @@ -75,115 +81,11 @@ def _hash_data(data): return ub.hash_data(data, hasher='sha512', base='abc', types=True) -def _ensure_json_serializable(dict_, normalize_containers=False, verbose=0): +def _rectify_class(arg, kw, lookup=None): """ - Convert numpy and tuples into lists - - Args: - normalize_containers (bool, default=False): - if True, normalizes dict containers to be standard python - structures. + Helps normalize and serialize hyperparameter inputs. 
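    (Editor's note, illustrative) The recovered initkw must be JSON
    serializable because it feeds the hyperparameter hash that names the
    training directory. A sketch of the coercion now delegated to
    `netharn.util.util_json`:

    >>> import numpy as np
    >>> from netharn.util import util_json
    >>> initkw = {'lr': np.float32(0.1), 'sizes': np.array([3, 5])}
    >>> clean = util_json.ensure_json_serializable(initkw)
    >>> # ndarrays become lists and numpy scalars become python numbers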
- Example: - >>> from netharn.hyperparams import * # NOQA - >>> from netharn.hyperparams import _hash_data, _ensure_json_serializable - >>> data = ub.ddict(lambda: int) - >>> data['foo'] = ub.ddict(lambda: int) - >>> data['bar'] = np.array([1, 2, 3]) - >>> data['foo']['a'] = 1 - >>> data['foo']['b'] = torch.FloatTensor([1, 2, 3]) - >>> result = _ensure_json_serializable(data, normalize_containers=True) - >>> assert type(result) is dict - """ - import copy - dict_ = copy.deepcopy(dict_) - - def _norm_container(c): - if isinstance(c, dict): - # Cast to a normal dictionary - if isinstance(c, OrderedDict): - if type(c) is not OrderedDict: - c = OrderedDict(c) - else: - if type(c) is not dict: - c = dict(c) - return c - - # inplace convert any ndarrays to lists - def _walk_json(data, prefix=[]): - items = None - if isinstance(data, list): - items = enumerate(data) - elif isinstance(data, tuple): - items = enumerate(data) - elif isinstance(data, dict): - items = data.items() - else: - raise TypeError(type(data)) - - root = prefix - level = {} - for key, value in items: - level[key] = value - - # yield a dict so the user can choose to not walk down a path - yield root, level - - for key, value in level.items(): - if isinstance(value, (dict, list, tuple)): - path = prefix + [key] - for _ in _walk_json(value, prefix=path): - yield _ - - def _convert(dict_, root, key, new_value): - d = dict_ - for k in root: - d = d[k] - d[key] = new_value - - to_convert = [] - for root, level in ub.ProgIter(_walk_json(dict_), desc='walk json', - verbose=verbose): - for key, value in level.items(): - if isinstance(value, tuple): - # Convert tuples on the fly so they become mutable - new_value = list(value) - _convert(dict_, root, key, new_value) - elif isinstance(value, np.ndarray): - new_value = value.tolist() - to_convert.append((root, key, new_value)) - elif isinstance(value, torch.Tensor): - new_value = value.data.cpu().numpy().tolist() - to_convert.append((root, key, new_value)) - elif isinstance(value, (np.float32, np.float64)): - new_value = float(value) - to_convert.append((root, key, new_value)) - elif isinstance(value, (np.int32, np.int64)): - new_value = float(value) - to_convert.append((root, key, new_value)) - elif hasattr(value, '__json__'): - new_value = value.__json__() - to_convert.append((root, key, new_value)) - elif normalize_containers: - if isinstance(value, dict): - new_value = _norm_container(value) - to_convert.append((root, key, new_value)) - - for root, key, new_value in to_convert: - _convert(dict_, root, key, new_value) - - if normalize_containers: - # normalize the outer layer - dict_ = _norm_container(dict_) - return dict_ - - -def _rectify_class(lookup, arg, kw): - """ Args: - lookup (func | None): - transforms arg or arg[0] into the class type - arg (Tuple[type, dict] | type | object): Either a (cls, initkw) tuple, a class, or an instance. It is recommended that you don't pass an instance. @@ -191,10 +93,32 @@ def _rectify_class(lookup, arg, kw): kw (Dict[str, object]): augments initkw if arg is in tuple form otherwise becomes initkw + lookup (func | None): + transforms arg or arg[0] into the class type + Returns: - Tuple[type, Dict]: - The class type that we want to construct and the keyword args - used to do the construction. 
+ Dict: containing + 'cls' (type): the type of the object + 'cls_kw' (Dict): the initialization keyword args + 'instance': (object): None or the actual instanciated object + + We will use this cls and cls_kw to construct an instance unless one + is already specified. + + Example: + >>> # The ideal case is that we have a cls, initkw tuple + >>> import netharn as nh + >>> kw = {'lr': 0.1} + >>> cls = torch.optim.SGD + >>> rectified1 = _rectify_class(cls, kw.copy()) + >>> print('rectified1 = {!r}'.format(rectified1)) + >>> # But we can also take an instance of the object, however, you must + >>> # now make sure to specify the _initkw attribute. + >>> model = nh.models.ToyNet2d() + >>> self = cls(model.parameters(), **kw) + >>> self._initkw = kw + >>> rectified2 = _rectify_class(self, {}) + >>> print('rectified2 = {!r}'.format(rectified2)) """ if lookup is None: lookup = ub.identity @@ -225,7 +149,7 @@ def _rectify_class(lookup, arg, kw): # We were passed an actual instance of the class. (for shame) instance = cls_key - cls_kw = _class_default_params(cls).copy() + cls_kw = util_inspect.default_kwargs(cls).copy() if instance is not None: # Try and introspect the initkw, which is needed for model @@ -239,15 +163,15 @@ def _rectify_class(lookup, arg, kw): cls_kw.update(instance._initkw) else: import warnings - warnings.warn(ub.paragraph( + warnings.warn(ub.paragraph( # _initkw warning ''' - Netharn expects hyperparameter objects to be specified as + netharn.HyperParams objects are expected to be specified as (type, kw) tuples, but we received a preconstructed instance. This is only ok if you know what you are doing. To disable this warning set the _initkw instance attribute to the correct keyword arguments needed to reconstruct this - class. - ''')) + class. Offending data is arg={!r}, kw={!r} + ''').format(arg, kw)) # Update with explicitly specified information cls_kw.update(kw2) @@ -255,55 +179,19 @@ def _rectify_class(lookup, arg, kw): if key in kw: cls_kw[key] = kw.pop(key) - cls_kw = _ensure_json_serializable(cls_kw) + cls_kw = util_json.ensure_json_serializable(cls_kw) rectified = { 'cls': cls, 'cls_kw': cls_kw, 'instance': instance, } return rectified - # return cls, cls_kw - - -def _class_default_params(cls): - """ - Grab initkw defaults from the constructor - - CommandLine: - xdoctest -m netharn.hyperparams _class_default_params - - Doctest: - >>> cls = torch.optim.Adam - >>> _class_default_params(cls) - >>> cls = initializers.KaimingNormal - >>> print(ub.repr2(_class_default_params(cls), nl=0)) - {'mode': 'fan_in', 'param': 0} - >>> cls = initializers.NoOp - >>> _class_default_params(cls) - {} - """ - if six.PY2: - if cls.__init__ is object.__init__: - # hack for python2 classes without __init__ - return {} - else: - import funcsigs - sig = funcsigs.signature(cls) - else: - import inspect - sig = inspect.signature(cls) - default_params = { - k: p.default - for k, p in sig.parameters.items() - if p.default is not p.empty - } - return default_params def _rectify_criterion(arg, kw): if arg is None: # arg = 'CrossEntropyLoss' - return _rectify_class(None, None, kw) + return _rectify_class(None, kw) def _lookup(arg): if isinstance(arg, six.string_types): @@ -316,11 +204,37 @@ def _rectify_criterion(arg, kw): cls = arg return cls - rectified = _rectify_class(_lookup, arg, kw) + rectified = _rectify_class(arg, kw, _lookup) return rectified def _rectify_optimizer(arg, kw): + """ + Create a rectified tuple + + Example: + >>> # Test using a (cls, kw) tuple and an instance object. 
+ >>> import netharn as nh + >>> optim_ = nh.api.Optimizer.coerce({ + >>> 'optim': 'adam', 'lr': 0.1, 'weight_decay': 1e-4}) + >>> cls, kw = optim_ + >>> # + >>> model = nh.models.ToyNet2d() + >>> params = dict(model.named_parameters()) + >>> grouped_keys = {} + >>> grouped_keys['bias'] = [k for k in params.keys() if 'bias' in k] + >>> grouped_keys['weight'] = [k for k in params.keys() if 'weight' in k] + >>> named_param_groups = { + >>> k: {'params': list(ub.take(params, sorted(v)))} + >>> for k, v in grouped_keys.items() + >>> } + >>> named_param_groups['bias']['weight_decay'] = 0 + >>> param_groups = list(ub.sorted_keys(named_param_groups).values()) + >>> # + >>> optim = cls(param_groups, **kw) + >>> rectified1 = _rectify_optimizer(cls, kw) + >>> rectified2 = _rectify_optimizer(optim, {}) + """ if arg is None: arg = 'SGD' if kw is None: @@ -340,7 +254,7 @@ def _rectify_optimizer(arg, kw): cls = arg return cls - rectified = _rectify_class(_lookup, arg, kw) + rectified = _rectify_class(arg, kw, _lookup) kw2 = rectified['cls_kw'] for k, v in kw2.items(): @@ -352,7 +266,7 @@ def _rectify_optimizer(arg, kw): def _rectify_lr_scheduler(arg, kw): if arg is None: - return _rectify_class(None, None, kw) + return _rectify_class(None, kw) def _lookup(arg): if isinstance(arg, six.string_types): @@ -368,7 +282,7 @@ def _rectify_lr_scheduler(arg, kw): cls = arg return cls - rectified = _rectify_class(_lookup, arg, kw) + rectified = _rectify_class(arg, kw, _lookup) return rectified @@ -389,7 +303,7 @@ def _rectify_initializer(arg, kw): cls = arg return cls - rectified = _rectify_class(_lookup, arg, kw) + rectified = _rectify_class(arg, kw, _lookup) return rectified @@ -401,7 +315,7 @@ def _rectify_monitor(arg, kw): else: cls = arg return cls - rectified = _rectify_class(_lookup, arg, kw) + rectified = _rectify_class(arg, kw, _lookup) return rectified @@ -432,7 +346,7 @@ def _rectify_dynamics(arg, kw): def _rectify_model(arg, kw): if arg is None: - return _rectify_class(None, None, kw) + return _rectify_class(None, kw) def _lookup_model(arg): import torchvision @@ -450,7 +364,7 @@ def _rectify_model(arg, kw): if isinstance(arg, device.MountedModel): arg = arg.module - rectified = _rectify_class(_lookup_model, arg, kw) + rectified = _rectify_class(arg, kw, _lookup_model) return rectified @@ -523,7 +437,7 @@ class HyperParams(object): def __init__(hyper, # ---- datasets=None, - nice=None, + name=None, workdir=None, xpu=None, loaders=None, @@ -539,11 +453,16 @@ class HyperParams(object): augment=None, other=None, # incorporated into the hash extra=None, # ignored when computing the hash + nice=None, # alias of name ): kwargs = {} hyper.datasets = datasets - hyper.nice = nice + if name is None: + import warnings + warnings.warn('Specify "name" instead of "nice"') + name = nice + hyper.name = name hyper.workdir = workdir hyper.xpu = xpu @@ -583,6 +502,11 @@ class HyperParams(object): hyper.other = other hyper.extra = extra + @property + def nice(hyper): + """ alias of name for backwards compatibility """ + return hyper.name + def make_model(hyper): """ Instanciate the model defined by the hyperparams """ if hyper._model_info['instance'] is not None: @@ -947,21 +871,21 @@ class HyperParams(object): """ train_hashid = _hash_data(train_id)[0:8] - nice = hyper.nice + name = hyper.name nice_dpath = None if not given_explicit_train_dpath: # setup a cannonical and a linked symlink dir train_dpath = normpath( - join(hyper.workdir, 'fit', 'runs', nice, train_hashid)) + join(hyper.workdir, 'fit', 'runs', name, 
train_hashid)) # also setup a "nice" custom name, which may conflict, but oh well - if nice: + if name: try: nice_dpath = normpath( - join(hyper.workdir, 'fit', 'nice', nice)) + join(hyper.workdir, 'fit', 'nice', name)) except Exception: print('hyper.workdir = {!r}'.format(hyper.workdir)) - print('hyper.nice = {!r}'.format(hyper.nice)) + print('hyper.name = {!r}'.format(hyper.name)) raise # make temporary initializer so we can infer the history @@ -989,7 +913,7 @@ class HyperParams(object): ('init_history', init_history), ('init_history_hashid', _hash_data(util.make_idstr(init_history))), - ('nice', hyper.nice), + ('nice', hyper.name), ('old_train_dpath', normpath( join(hyper.workdir, 'fit', 'runs', train_hashid))), @@ -1023,7 +947,7 @@ class HyperParams(object): # ================ # Environment Components 'workdir' : ub.ensure_app_cache_dir('netharn/tests/demo'), - 'nice' : 'demo', + 'name' : 'demo', 'xpu' : nh.XPU.coerce('argv'), # workdir is a directory where intermediate results can be saved # nice symlinks /fit/nice/ -> ../runs/ diff --git a/netharn/layers/attention.py b/netharn/layers/attention.py new file mode 100644 index 0000000000000000000000000000000000000000..a5943b6b725ed8989562f6946f76ddb85d197408 --- /dev/null +++ b/netharn/layers/attention.py @@ -0,0 +1,189 @@ +""" +References: + https://arxiv.org/pdf/1809.02983.pdf - Dual Attention Network for Scene Segmentation + + https://raw.githubusercontent.com/heykeetae/Self-Attention-GAN/master/sagan_models.py +""" +import torch +from torch import nn + + +class SelfAttention(nn.Module): + """ + Self Attention Layer + + References: + """ + + def __init__(self, in_channels): + super(SelfAttention, self).__init__() + self.chanel_in = in_channels + + self.query_conv = nn.Conv2d(in_channels=in_channels, + out_channels=in_channels // 8, + kernel_size=1) + + self.key_conv = nn.Conv2d(in_channels=in_channels, + out_channels=in_channels // 8, kernel_size=1) + + self.value_conv = nn.Conv2d(in_channels=in_channels, + out_channels=in_channels, kernel_size=1) + + self.gamma = nn.Parameter(torch.zeros(1)) + + self.softmax = nn.Softmax(dim=-1) + + def forward(self, x): + """ + Args : + x (Tensor): input feature maps (B x C x W x H) + + Returns : + out : self attention value + input feature + attention: B x N x N (N is Width*Height) + """ + B, C, W, H = x.shape + N = W * H + + proj_query = self.query_conv(x).view(B, -1, N).permute(0, 2, 1) # B x C x(N) + + proj_key = self.key_conv(x).view(B, -1, N) # B x C x (*W*H) + + energy = torch.bmm(proj_query, proj_key) # transpose check + + attention = self.softmax(energy) # B x (N) x (N) + + proj_value = self.value_conv(x).view(B, -1, N) # B x C x N + + out = torch.bmm(proj_value, attention.permute(0, 2, 1)) + out = out.view(B, C, W, H) + + out = self.gamma * out + x + return out, attention + + +class ChannelAttention(nn.Module): + """ + Channel attention module + + The channel attention module selectively emphasizes interdependent channel + maps by integrating associated features among all channel map. + + Uses the uncentered scatter matrix (i.e. M @ M.T) to compute a unnormalized + correlation-like matrix between channels. 
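    (Editor's sketch) With the features flattened to M of shape (B, C, N),
    where N = H * W, the "energy" computed in forward() below is the
    batched M @ M.T:

    >>> import torch
    >>> B, C, H, W = 1, 3, 5, 7
    >>> x = torch.rand(B, C, H, W)
    >>> M = x.view(B, C, -1)                       # (B, C, N)
    >>> energy = torch.bmm(M, M.permute(0, 2, 1))  # (B, C, C)
    >>> assert energy.shape == (1, 3, 3)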
+
+    M @ M.T is sometimes called the "uncentered scatter matrix" and is
+    closely related to the Gram matrix; see:
+
+    https://stats.stackexchange.com/questions/164997/relationship-between-gram-and-covariance-matrices
+
+    References:
+        https://arxiv.org/pdf/1809.02983.pdf - Dual Attention Network for Scene Segmentation
+        https://github.com/junfu1115/DANet/blob/master/encoding/nn/attention.py
+
+    Notes:
+        Different from the position attention module, we directly calculate
+        the channel attention map from the original features.
+
+        Note that we do not employ convolution layers to embed features
+        before computing relationships between two channels, since this
+        maintains the relationship between different channel maps. In
+        addition, different from recent works [Zhang CVPR 2018 Context
+        encoding for semantic segmentation] which explore channel
+        relationships by a global pooling or encoding layer, we exploit
+        spatial information at all corresponding positions to model channel
+        correlations.
+
+    Ignore:
+
+        >>> # Simple example to demonstrate why a multiplicative parameter
+        >>> # at zero might or might not deviate to decrease the loss
+        >>> x = torch.randn(10)
+        >>> x[0] = -1000
+        >>> p = nn.Parameter(torch.zeros(1) + 1e-1)
+        >>> optim = torch.optim.SGD([p], lr=1e-1)
+        >>> for i in range(10):
+        >>>     loss = (x * (p ** 2)).sum()
+        >>>     loss.backward()
+        >>>     print('loss = {!r}'.format(loss))
+        >>>     print('p.data = {!r}'.format(p.data))
+        >>>     print('p.grad = {!r}'.format(p.grad))
+        >>>     optim.step()
+        >>>     optim.zero_grad()
+
+        >>> # A parameter initialized exactly at zero might or might not
+        >>> # deviate from zero to decrease the loss
+        >>> x = torch.randn(2)
+        >>> x[0] = -1000
+        >>> p = nn.Parameter(torch.zeros(1))
+        >>> optim = torch.optim.SGD([p], lr=1e-1)
+        >>> for i in range(10):
+        >>>     loss = (x * p.clamp(0, None)).sum()
+        >>>     loss.backward()
+        >>>     print('loss = {!r}'.format(loss))
+        >>>     print('p.data = {!r}'.format(p.data))
+        >>>     print('p.grad = {!r}'.format(p.grad))
+        >>>     optim.step()
+        >>>     optim.zero_grad()
+
+    Ignore:
+        >>> B, C, H, W = 1, 3, 5, 7
+        >>> inputs = torch.rand(B, C, H, W)
+        >>> inputs = torch.arange(B * C * H * W).view(B, C, H, W).float()
+        >>> self = ChannelAttention(C)
+        >>> optim = torch.optim.SGD(self.parameters(), lr=1e-8)
+        >>> for i in range(10):
+        >>>     out = self(inputs)
+        >>>     loss = (out.sum() ** 2)
+        >>>     print('self.gamma = {!r}'.format(self.gamma))
+        >>>     print('loss = {!r}'.format(loss))
+        >>>     loss.backward()
+        >>>     optim.step()
+        >>>     optim.zero_grad()
+    """
+    def __init__(self, in_channels, attend_elsewhere=True):
+        super(ChannelAttention, self).__init__()
+        self.in_channels = in_channels
+
+        # hack to rectify the definition in the paper with the implementation
+        self.attend_elsewhere = attend_elsewhere
+
+        # scale parameter (beta from paper)
+        self.gamma = nn.Parameter(torch.zeros(1))
+
+    def forward(self, inputs):
+        """
+        Args:
+            inputs (Tensor): input feature maps (B, C, H, W)
+
+        Returns:
+            out (Tensor): attention value + input feature
+            attention: (B, C, C)
+
+        Example:
+            >>> B, C, H, W = 1, 3, 5, 7
+            >>> inputs = torch.rand(B, C, H, W)
+            >>> self = ChannelAttention(C)
+            >>> out = self(inputs)
+        """
+        B, C, H, W = inputs.shape
+
+        # Flatten spatial dims
+        proj_query = inputs.view(B, C, -1)  # A
+        proj_key = inputs.view(B, C, -1).permute(0, 2, 1)  # A.T
+        proj_value = inputs.view(B, C, -1)  # A
+
+        energy = torch.bmm(proj_query, proj_key)  # A @ A.T
+
+        if self.attend_elsewhere:
+            # Why the subtraction here?
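            # (Editor's note) One common reading of this trick from the
            # DANet reference code: subtracting each entry from its row max
            # reverses the ordering of the logits, so the softmax places the
            # least weight on a channel's most similar channels (including
            # itself), which is what the `attend_elsewhere` flag refers to.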
+ diag = torch.max(energy, dim=1, keepdim=True)[0].expand_as(energy) + energy_new = diag - energy + attention = energy_new.softmax(dim=1) + else: + attention = energy.softmax(dim=1) + + out = torch.bmm(attention, proj_value) + out = out.view(B, C, H, W) + + residual = self.gamma * out + out = residual + inputs + return out diff --git a/netharn/layers/common.py b/netharn/layers/common.py index f07e21b2d972a90c0825b4d2ec3f2953a3c4dd33..0c3473c0dbb14be1a3a7951aa94da27e68cad2b5 100644 --- a/netharn/layers/common.py +++ b/netharn/layers/common.py @@ -43,11 +43,11 @@ class Sequential(nn.Sequential, util.ModuleMixin): } """ def output_shape_for(self, input_shape): - from netharn import output_shape_for + from netharn.analytic import output_shape_for return output_shape_for.OutputShapeFor.sequential(self, input_shape) def receptive_field_for(self, input_field=None): - from netharn import receptive_field_for + from netharn.analytic import receptive_field_for return receptive_field_for.ReceptiveFieldFor.sequential(self, input_field) @@ -70,7 +70,7 @@ class Identity(Sequential): def receptive_field_for(self, input_field=None): if input_field is None: - from netharn import receptive_field_for + from netharn.analytic import receptive_field_for input_field = receptive_field_for.ReceptiveFieldFor.input() return input_field @@ -95,7 +95,7 @@ class AnalyticModule(Module): @classmethod def _analytic_shape_kw(self): - from netharn import output_shape_for + from netharn.analytic import output_shape_for return { '_OutputFor': output_shape_for.OutputShapeFor, '_Output': output_shape_for.OutputShape, @@ -104,7 +104,7 @@ class AnalyticModule(Module): @classmethod def _analytic_field_kw(self): - from netharn import receptive_field_for + from netharn.analytic import receptive_field_for # import netharn as nh return { '_OutputFor': receptive_field_for.ReceptiveFieldFor, @@ -115,36 +115,45 @@ class AnalyticModule(Module): @classmethod def _analytic_forward_kw(self): # import netharn as nh - from netharn import analytic_for + from netharn.analytic import analytic_for return { '_OutputFor': analytic_for.ForwardFor, '_Output': analytic_for.Output, '_Hidden': analytic_for.Hidden, } - def output_shape_for(self, input_shape): + def output_shape_for(self, input_shape, **kwargs): """ Uses custom _analytic_forward to compute output shape """ kw = self._analytic_shape_kw() + if kwargs: + kw = kw.copy() + kw.update(kwargs) return self._analytic_forward(input_shape, **kw) - def receptive_field_for(self, input_field=None): + def receptive_field_for(self, input_field=None, **kwargs): """ Uses custom _analytic_forward to compute receptive field """ # import netharn as nh - from netharn import receptive_field_for + from netharn.analytic import receptive_field_for if input_field is None: input_field = receptive_field_for.ReceptiveFieldFor.input() kw = self._analytic_field_kw() + if kwargs: + kw = kw.copy() + kw.update(kwargs) return self._analytic_forward(input_field, **kw) - def forward(self, inputs, **kw): + def forward(self, inputs, **kwargs): """ Uses custom _analytic_forward to compute receptive field """ kw = self._analytic_forward_kw() + if kwargs: + kw = kw.copy() + kw.update(kwargs) return self._analytic_forward(inputs, **kw) diff --git a/netharn/layers/conv_norm.py b/netharn/layers/conv_norm.py index b588f59729762f4536ef9ea3ab1023fd5f70f8f5..f4385be8334b9171fa03232953b1adb5a5873f3f 100644 --- a/netharn/layers/conv_norm.py +++ b/netharn/layers/conv_norm.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- from __future__ import 
absolute_import, division, print_function, unicode_literals -from netharn.output_shape_for import OutputShapeFor +from netharn.analytic.output_shape_for import OutputShapeFor from netharn.layers import rectify from netharn.layers import common import ubelt as ub # NOQA diff --git a/netharn/layers/mish.py b/netharn/layers/mish.py new file mode 100644 index 0000000000000000000000000000000000000000..71c1e24d715de307e0785e462985f34b0f2ecaf9 --- /dev/null +++ b/netharn/layers/mish.py @@ -0,0 +1,110 @@ +from torch import nn +import torch +import torch.nn.functional as F + + +@torch.jit.script +def mish(input): + """ + Applies the mish function element-wise: + mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(x))) + See additional documentation for mish class. + """ + return input * torch.tanh(F.softplus(input)) + + +def beta_mish(input, beta=1.5): + """ + Applies the β mish function element-wise: + .. math:: + \\beta mish(x) = x * tanh(ln((1 + e^{x})^{\\beta})) + See additional documentation for :mod:`echoAI.Activation.Torch.beta_mish`. + + References: + https://github.com/digantamisra98/Echo/blob/master/echoAI/Activation/Torch/functional.py + """ + return input * torch.tanh(torch.log(torch.pow((1 + torch.exp(input)), beta))) + + +class Mish_Function(torch.autograd.Function): + + """ + Applies the mish function element-wise: + .. math:: + mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + e^{x})) + Plot: + .. figure:: _static/mish.png + :align: center + Shape: + - Input: (N, *) where * means, any number of additional + dimensions + - Output: (N, *), same shape as the input + + References: + https://github.com/digantamisra98/Echo/blob/master/echoAI/Activation/Torch/mish.py + + Examples: + >>> m = Mish() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + @staticmethod + def forward(ctx, x): + ctx.save_for_backward(x) + y = x * torch.tanh(F.softplus(x)) # x * tanh(ln(1 + exp(x))) + return y + + @staticmethod + def backward(ctx, grad_output): + x = ctx.saved_variables[0] + sigmoid = torch.sigmoid(x) + tanh_sp = torch.tanh(F.softplus(x)) + return grad_output * (tanh_sp + x * sigmoid * (1 - tanh_sp * tanh_sp)) + + # else: + # @torch.jit.script + # def mish(input): + # delta = torch.exp(-input) + # alpha = 1 + 2 * delta + # return input * alpha / (alpha + 2 * delta * delta) + + +class Mish(nn.Module): + """ + Applies the mish function element-wise: + mish(x) = x * tanh(softplus(x)) = x * tanh(ln(1 + exp(x))) + + Shape: + - Input: (N, *) where * means, any number of additional + dimensions + - Output: (N, *), same shape as the input + + References: + https://github.com/digantamisra98/Mish/blob/master/Mish/Torch/mish.py + https://github.com/thomasbrandon/mish-cuda + https://arxiv.org/pdf/1908.08681v2.pdf + + Examples: + >>> m = Mish() + >>> input = torch.randn(2) + >>> output = m(input) + + Example: + >>> x = torch.linspace(-20, 20, 100, requires_grad=True) + >>> self = Mish() + >>> y = self(x) + >>> y.sum().backward() + >>> # xdoctest: +REQUIRES(--show) + >>> import kwplot + >>> kwplot.autompl() + >>> kwplot.multi_plot(xydata={'beta=1': (x.data, y.data)}, fnum=1, pnum=(1, 2, 1)) + >>> kwplot.multi_plot(xydata={'beta=1': (x.data, x.grad)}, fnum=1, pnum=(1, 2, 2)) + >>> kwplot.show_if_requested() + """ + def __init__(self): + super().__init__() + + def forward(self, input): + return Mish_Function.apply(input) + # return mish(input) diff --git a/netharn/layers/norm.py b/netharn/layers/norm.py index 855f9ee2d7e930ae6ca321f49ecbeb36c747e79a..8aaa267bc1b2c09a9c78ef02392751b40f821408 100644 --- 
a/netharn/layers/norm.py +++ b/netharn/layers/norm.py @@ -3,8 +3,8 @@ import torch from torch import nn import ubelt as ub from netharn.layers import common -from netharn import output_shape_for -from netharn import receptive_field_for +from netharn.analytic import output_shape_for +from netharn.analytic import receptive_field_for class L2Norm(common.Module): @@ -35,7 +35,7 @@ class L2Norm(common.Module): [20., 20.]]], dtype=np.float32) Example: - >>> from netharn.output_shape_for import OutputShapeFor + >>> from netharn.analytic.output_shape_for import OutputShapeFor >>> self = L2Norm(in_features=7, scale=20) >>> OutputShapeFor(self)._check_consistency((1, 7, 2, 2)) (1, 7, 2, 2) diff --git a/netharn/layers/rectify.py b/netharn/layers/rectify.py index ba86ae0687242611eb9b686bb996956519011a76..6307c3f2f3d71a9a46b1f6920ac9eb040de8be93 100644 --- a/netharn/layers/rectify.py +++ b/netharn/layers/rectify.py @@ -53,6 +53,10 @@ def rectify_nonlinearity(key=ub.NoParam, dim=2): from netharn.layers.swish import Swish kw.pop('inplace', None) cls = Swish + elif noli_type == 'mish': + from netharn.layers.mish import Mish + kw.pop('inplace', None) + cls = Mish else: raise KeyError('unknown type: {}'.format(kw)) return cls(**kw) diff --git a/netharn/layers/reshape.py b/netharn/layers/reshape.py index 6a8b45bc88364b029774fdbb6f8e7552e5874160..26b2e3b92e154f18cf96a58e3c7a2a69e006c2f5 100644 --- a/netharn/layers/reshape.py +++ b/netharn/layers/reshape.py @@ -1,6 +1,6 @@ import torch from netharn import util -from netharn import output_shape_for +from netharn.analytic import output_shape_for class Reshape(torch.nn.Module, util.ModuleMixin): diff --git a/netharn/layers/swish.py b/netharn/layers/swish.py index bc04b1278e6fd507c4d33922b3692a34f4b1dcf2..33905d244014e40bebcc5601204b1f00f14e82f4 100644 --- a/netharn/layers/swish.py +++ b/netharn/layers/swish.py @@ -41,7 +41,7 @@ class Swish(nn.Module): >>> kwplot.autompl() >>> kwplot.multi_plot(xydata={'beta=1': (x.data, y.data)}, fnum=1, pnum=(1, 2, 1)) >>> kwplot.multi_plot(xydata={'beta=1': (x.data, x.grad)}, fnum=1, pnum=(1, 2, 2)) - >>> kwplot.show_if_requestd() + >>> kwplot.show_if_requested() """ def __init__(self, beta=1.0): diff --git a/netharn/metrics/assignment.py b/netharn/metrics/assignment.py index e0511bc0107d31c2f4875415a244d95c77abdcef..7add77e1f8a325f2d0e8b3c36d713fc1f67e849a 100644 --- a/netharn/metrics/assignment.py +++ b/netharn/metrics/assignment.py @@ -526,6 +526,8 @@ def _filter_ignore_regions(true_dets, pred_dets, ovthresh=0.5, detections should be ignored. 
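        (Editor's sketch) The core test below computes, for each predicted
        box, the fraction of its area covered by ignore regions; assuming
        kwimage.Boxes semantics:

        >>> import kwimage
        >>> pred = kwimage.Boxes([[0, 0, 10, 10]], 'tlbr')
        >>> ignore = kwimage.Boxes([[0, 0, 10, 5]], 'tlbr')
        >>> frac = (pred.isect_area(ignore) / pred.area).clip(0, 1).sum(axis=1)
        >>> # frac is ~0.5, so this box is kept when ovthresh=0.5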
Example: + >>> from netharn.metrics.assignment import * # NOQA + >>> from netharn.metrics.assignment import _filter_ignore_regions >>> import kwimage >>> pred_dets = kwimage.Detections.random(classes=['a']) >>> true_dets = kwimage.Detections.random( @@ -538,10 +540,30 @@ def _filter_ignore_regions(true_dets, pred_dets, ovthresh=0.5, >>> true_dets, pred_dets, ovthresh=ovthresh, ignore_class=ignore_class) >>> print('flags1 = {!r}'.format(flags1)) >>> print('flags2 = {!r}'.format(flags2)) + + + >>> flags3, flags4 = _filter_ignore_regions( + >>> true_dets, pred_dets, ovthresh=ovthresh, + >>> ignore_class=ignore_class.upper()) + >>> assert np.all(flags1 == flags3) + >>> assert np.all(flags2 == flags4) """ true_ignore_flags = np.zeros(len(true_dets), dtype=np.bool) pred_ignore_flags = np.zeros(len(pred_dets), dtype=np.bool) + def _normalize_catname(name, classes): + if classes is None: + return name + if name in classes: + return name + for cname in classes: + if cname.lower() == name.lower(): + return cname + return name + # raise KeyError(name) + + ignore_class = _normalize_catname(ignore_class, true_dets.classes) + # Filter out true detections labeled as "ignore" if true_dets.classes is not None and ignore_class in true_dets.classes: ignore_cidx = true_dets.classes.index(ignore_class) @@ -556,8 +578,13 @@ def _filter_ignore_regions(true_dets, pred_dets, ovthresh=0.5, # Determine which predicted boxes are inside the ignore regions # note: using sum over max is delibrate here. - ignore_overlap = (pred_boxes.isect_area(ignore_boxes) / - pred_boxes.area).clip(0, 1).sum(axis=1) + with warnings.catch_warnings(): + warnings.filterwarnings('ignore', message='invalid .* less') + warnings.filterwarnings('ignore', message='invalid .* greater_equal') + warnings.filterwarnings('ignore', message='invalid .* true_divide') + ignore_overlap = (pred_boxes.isect_area(ignore_boxes) / + pred_boxes.area).clip(0, 1).sum(axis=1) + ignore_overlap = np.nan_to_num(ignore_overlap) ignore_idxs = np.where(ignore_overlap > ovthresh)[0] @@ -584,7 +611,6 @@ def _filter_ignore_regions(true_dets, pred_dets, ovthresh=0.5, overlap = (isect.area / pred_region.area) ignore_overlap[idx] = overlap except Exception as ex: - import warnings warnings.warn('ex = {!r}'.format(ex)) pred_ignore_flags = ignore_overlap > ovthresh return true_ignore_flags, pred_ignore_flags diff --git a/netharn/metrics/clf_report.py b/netharn/metrics/clf_report.py index cff4a4eec62e3e8eb99118e3bbff879031b54140..d8f41e963896a86bbf2c143bdb7959b9c7ef803d 100644 --- a/netharn/metrics/clf_report.py +++ b/netharn/metrics/clf_report.py @@ -377,6 +377,15 @@ def ovr_classification_report(mc_y_true, mc_probs, target_names=None, 1 0.5846 0.6014 0.0000 0.0000 0.2195 5 0.2778 2 0.8000 0.8693 0.2623 0.2652 0.1602 5 0.2778 + Ignore: + >>> y_true = [1, 1, 1] + >>> y_probs = np.random.rand(len(y_true), 3) + >>> target_names = None + >>> sample_weight = None + >>> verbose = True + >>> report = ovr_classification_report(y_true, y_probs) + >>> print(report['ovr']) + """ import pandas as pd import sklearn.metrics @@ -415,8 +424,11 @@ def ovr_classification_report(mc_y_true, mc_probs, target_names=None, true_probs = (bin_probs * bin_truth).sum(axis=1) if 'auc' in metrics: - k_metrics['auc'] = sklearn.metrics.roc_auc_score( - bin_truth, bin_probs, sample_weight=sample_weight) + try: + k_metrics['auc'] = sklearn.metrics.roc_auc_score( + bin_truth, bin_probs, sample_weight=sample_weight) + except ValueError: + k_metrics['auc'] = np.nan if 'ap' in metrics: k_metrics['ap'] = 
sklearn.metrics.average_precision_score( diff --git a/netharn/metrics/confusion_vectors.py b/netharn/metrics/confusion_vectors.py index b0b428c3839c37d721d43e15007254e411f8b1c4..fc3f89d6eca80e050c56c22a14622ee4ed2c83b2 100644 --- a/netharn/metrics/confusion_vectors.py +++ b/netharn/metrics/confusion_vectors.py @@ -604,7 +604,7 @@ class BinaryConfusionVectors(ub.NiceRepr): return kwplot.multi_plot(xdata=xdata, ydata=ydata, color=color) # @ub.memoize_method - def precision_recall(self, stabalize_thresh=7, stabalize_pad=7): + def precision_recall(self, stabalize_thresh=7, stabalize_pad=7, method='sklearn'): """ Example: >>> self = BinaryConfusionVectors.demo(n=11) @@ -637,8 +637,8 @@ class BinaryConfusionVectors(ub.NiceRepr): from sklearn.metrics._ranking import _binary_clf_curve except ImportError: from sklearn.metrics.ranking import _binary_clf_curve - data = self.data + data = self.data y_true = data['is_true'].astype(np.uint8) y_score = data['pred_score'] sample_weight = data._data.get('weight', None) @@ -649,9 +649,13 @@ class BinaryConfusionVectors(ub.NiceRepr): prec = [np.nan] rec = [np.nan] fps = [np.nan] + fns = [np.nan] tps = [np.nan] thresholds = [np.nan] + realpos_total = 0 + realneg_total = 0 + nsupport = 0 else: if len(self) <= stabalize_thresh: # add dummy data to stabalize the computation @@ -661,24 +665,72 @@ class BinaryConfusionVectors(ub.NiceRepr): y_true, y_score, sample_weight = _stabalilze_data( y_true, y_score, sample_weight, npad=npad) - metric_kw = { - 'y_true': y_true, - 'sample_weight': sample_weight, - } + # Get the total weight (typically number of) positive and negative + # examples of this class + if sample_weight is None: + weight = 1 + nsupport = len(y_true) - bool(npad) + else: + weight = sample_weight + nsupport = sample_weight.sum() - bool(npad) + + realpos_total = (y_true * weight).sum() + realneg_total = ((1 - y_true) * weight).sum() - # print('metric_kw = {}'.format(ub.repr2(metric_kw, nl=1))) - # print('y_score = {!r}'.format(y_score)) with warnings.catch_warnings(): warnings.filterwarnings('ignore', message='invalid .* true_divide') - ap = sklearn.metrics.average_precision_score( - y_score=y_score, **metric_kw) + """ + Notes: + Apparently, consistent scoring is really hard to get right. + + For detection problems scoring via + confusion_vectors+sklearn produces noticably different + results than the VOC method. There are a few reasons for + this. The VOC method stops counting true positives after + all assigned predicted boxes have been counted. It simply + remembers the amount of original true positives to + normalize the true positive reate. On the other hand, + confusion vectors maintains a list of these unassigned true + boxes and gives them a predicted index of -1 and a score of + zero. This means that this function sees them as having a + y_true of 1 and a y_score of 0, which allows the + scikit-learn fps and tps counts to effectively get up to + 100% recall when the threshold is zero. The VOC method + simply ignores these and handles them implicitly. The + problem is that if you remove these from the scikit-learn + inputs, it wont see the correct number of positives and it + will incorrectly normalize the recall. In summary: + + VOC: + * remembers realpos_total + * doesn't count unassigned truths as TP when the + threshold is zero. + + CV+SKL: + * counts unassigned truths as TP with score=0. + * Always ensure tpr=1, ppv=0 and ppv=1, tpr=0 cases + exist. 
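                (Editor's illustration of the note above) Two unassigned
                truths entered with score 0 let sklearn-style recall reach
                1.0 at threshold 0, which the VOC method would instead
                ignore:

                >>> import numpy as np
                >>> from sklearn.metrics import precision_recall_curve
                >>> y_true = np.array([1, 1, 1, 0])
                >>> y_score = np.array([0.9, 0.0, 0.0, 0.8])
                >>> prec, rec, thresh = precision_recall_curve(y_true, y_score)
                >>> # rec attains 1.0 only because the score-0 truths count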
+            """
+
+            if method.startswith('voc'):
+                y_score_ = y_score[y_score > 0]
+                y_true_ = y_true[y_score > 0]
+                fps, tps, _thresholds = _binary_clf_curve(
+                    y_true_, y_score_, pos_label=1.0,
+                    sample_weight=sample_weight)
+            elif method == 'sklearn':
+                fps, tps, _thresholds = _binary_clf_curve(
+                    y_true, y_score, pos_label=1.0,
+                    sample_weight=sample_weight)
+            else:
+                raise KeyError(method)
+
+            # Slight tweak to sklearn.metrics.precision_recall_curve
+            fns = realpos_total - tps
+
-            fps, tps, _thresholds = _binary_clf_curve(
-                y_true, y_score, pos_label=1.0,
-                sample_weight=sample_weight)
             precision = tps / (tps + fps)
             precision[np.isnan(precision)] = 0
-            recall = tps / tps[-1]
+            recall = tps / realpos_total

             # stop when full recall attained
             # and reverse the outputs so recall is decreasing
@@ -689,25 +741,13 @@
                 np.r_[recall[sl], 0],
                 _thresholds[sl])

-            # prec, rec, thresholds = sklearn.metrics.precision_recall_curve(
-            #     probas_pred=y_score, **metric_kw)
-
-            # FIXME
-            # USING true == pred IS WRONG.
-            # when pred=-1 and true=0 the score=0, but is_true=False.
-            # THIS CAUSES the total number of TRUE sklearn vecs to be incorrect
-
-            # Get the total weight (typically number of) positive and negative
-            # examples of this class
-            if sample_weight is None:
-                weight = 1
-                nsupport = len(y_true) - bool(npad)
-            else:
-                weight = sample_weight
-                nsupport = sample_weight.sum() - bool(npad)
-
-            realpos_total = (y_true * weight).sum()
-            realneg_total = ((1 - y_true) * weight).sum()
+            if method.startswith('voc'):
+                from netharn.metrics.voc_metrics import _voc_ave_precision
+                ap = _voc_ave_precision(rec[::-1], prec[::-1], method=method)
+            elif method == 'sklearn':
+                ap = sklearn.metrics.average_precision_score(
+                    y_score=y_score, y_true=y_true,
+                    sample_weight=sample_weight)

         prs_info = {
             'ap': ap,
@@ -715,6 +755,7 @@
             'tpr': rec,  # (true positive rate) == (recall)
             'fp_count': fps,
             'tp_count': tps,
+            'fn_count': fns,
             'thresholds': thresholds,
             'nsupport': nsupport,
             'realpos_total': realpos_total,
@@ -869,6 +910,9 @@ class DictProxy(DictLike):
     def keys(self):
         return self.proxy.keys()

+    def __json__(self):
+        return ub.odict(self.proxy)
+

 class ROC_Result(ub.NiceRepr, DictProxy):
     """
diff --git a/netharn/metrics/detect_metrics.py b/netharn/metrics/detect_metrics.py
index a177fa3790d446a8001c7b636fa0f5a905656774..cde905beabfc7ef22498f219e3583716be576d3e 100644
--- a/netharn/metrics/detect_metrics.py
+++ b/netharn/metrics/detect_metrics.py
@@ -351,7 +351,7 @@ class DetectionMetrics(ub.NiceRepr):
         if gids is None:
             gids = sorted(dmet._imgname_to_gid.values())
         # Convert true/pred detections into VOC format
-        vmet = voc_metrics.VOC_Metrics()
+        vmet = voc_metrics.VOC_Metrics(classes=dmet.classes)
         for gid in gids:
             true_dets = dmet.true_detections(gid)
             pred_dets = dmet.pred_detections(gid)
diff --git a/netharn/metrics/voc_metrics.py b/netharn/metrics/voc_metrics.py
index c6c309345aa4f2f97c985de5f738495fbb6f95c5..7f9699aa358cf2d7264d7f49bfe7c9f8e369808d 100644
--- a/netharn/metrics/voc_metrics.py
+++ b/netharn/metrics/voc_metrics.py
@@ -20,9 +20,10 @@ class VOC_Metrics(ub.NiceRepr):
     Each "line" is a list of
     [[<imgname>, <confidence>, <x1>, <y1>, <x2>, <y2>]].
""" - def __init__(self): + def __init__(self, classes=None): self.recs = {} self.cx_to_lines = ub.ddict(list) + self.classes = classes def __nice__(self): info = { @@ -60,14 +61,47 @@ class VOC_Metrics(ub.NiceRepr): def score(self, ovthresh=0.5, bias=1, method='voc2012'): """ Compute VOC scores for every category + + Example: + >>> from netharn.metrics.detect_metrics import DetectionMetrics + >>> from netharn.metrics.voc_metrics import * # NOQA + >>> dmet = DetectionMetrics.demo( + >>> nimgs=1, nboxes=(0, 100), n_fp=(0, 30), n_fn=(0, 30), nclasses=2, score_noise=0.9) + >>> self = VOC_Metrics(classes=dmet.classes) + >>> self.add_truth(dmet.true_detections(0), 0) + >>> self.add_predictions(dmet.pred_detections(0), 0) + >>> voc_scores = self.score() + >>> # xdoctest: +REQUIRES(--show) + >>> import kwplot + >>> kwplot.autompl() + >>> kwplot.figure(fnum=1, doclf=True) + >>> voc_scores['perclass'].draw() + + kwplot.figure(fnum=2) + dmet.true_detections(0).draw(color='green', labels=None) + dmet.pred_detections(0).draw(color='blue', labels=None) + kwplot.autoplt().gca().set_xlim(0, 100) + kwplot.autoplt().gca().set_ylim(0, 100) """ + from netharn.metrics.confusion_vectors import PR_Result + from netharn.metrics.confusion_vectors import PerClass_PR_Result perclass = {} for cx in self.cx_to_lines.keys(): lines = self.cx_to_lines[cx] classname = cx - info = _voc_eval(lines, self.recs, classname, ovthresh=ovthresh, - bias=bias, method=method) - perclass[cx] = info + roc_info = _voc_eval(lines, self.recs, classname, + ovthresh=ovthresh, bias=bias, method=method) + roc_info['cx'] = cx + if self.classes is not None: + catname = self.classes[cx] + roc_info.update({ + 'node': catname, + }) + perclass[catname] = PR_Result(roc_info) + else: + perclass[cx] = PR_Result(roc_info) + + perclass = PerClass_PR_Result(perclass) mAP = np.nanmean([d['ap'] for d in perclass.values()]) voc_scores = { @@ -304,14 +338,22 @@ def _voc_eval(lines, recs, classname, ovthresh=0.5, method='voc2012', ap = _voc_ave_precision(rec=rec, prec=prec, method=method) + # number of supports is the number of real positives + unassigned preds + realneg_total = fp[-1] # number of unassigned predictions + realpos_total = npos # number of truth predictions + nsupport = realneg_total + realpos_total + info = { - 'fp': fp, - 'tp': tp, - 'fn': fn, + 'fp_count': fp, + 'tp_count': tp, + 'fn_count': fn, 'tpr': rec, # (true positive rate) == (recall) 'ppv': prec, # (positive predictive value) == (precision) 'thresholds': thresholds, 'npos': npos, + 'nsupport': nsupport, + 'realpos_total': realpos_total, + 'realneg_total': realneg_total, 'ap': ap, } return info diff --git a/netharn/mixins.py b/netharn/mixins.py index 37804ad65cf205dd1b24f9bafef2af67a3e5990a..51ab82afc51f833b8ef4fd83118d83f7dfae40ba 100644 --- a/netharn/mixins.py +++ b/netharn/mixins.py @@ -146,13 +146,13 @@ def _redump_measures(dpath): _dump_measures(tb_data, out_dpath, mode) -def _dump_measures(tb_data, out_dpath, mode=None, smoothing=0.6, +def _dump_measures(tb_data, out_dpath, mode=None, smoothing=0.0, ignore_outliers=True): """ This is its own function in case we need to modify formatting CommandLine: - xdoctest -m netharn.mixins _dump_measures + xdoctest -m netharn.mixins _dump_measures --out_dpath=. 
Example: >>> # SCRIPT @@ -162,16 +162,22 @@ def _dump_measures(tb_data, out_dpath, mode=None, smoothing=0.6, >>> import json >>> from os.path import join >>> import ubelt as ub + >>> try: + >>> import seaborn as sns + >>> sns.set() + >>> except ImportError: + >>> pass >>> out_dpath = ub.expandpath('~/work/project/fit/nice/nicename/monitor/tensorboard/') >>> out_dpath = ub.argval('--out_dpath', default=out_dpath) - >>> mode = 'iter' + >>> mode = ['epoch', 'iter'] >>> fpath = join(out_dpath, 'tb_data.json') >>> tb_data = json.load(open(fpath, 'r')) - >>> _dump_measures(tb_data, out_dpath) + >>> _dump_measures(tb_data, out_dpath, smoothing=0) """ import ubelt as ub from os.path import join import numpy as np + import kwplot kwplot.autompl() diff --git a/netharn/models/deeplab_v3.py b/netharn/models/deeplab_v3.py index 1b6e764972d9fcc8b29c59c30ce9690950602452..69ba1553fb128e20b8910179551304268d1a8456 100644 --- a/netharn/models/deeplab_v3.py +++ b/netharn/models/deeplab_v3.py @@ -78,7 +78,7 @@ class _Bottleneck(layers.AnalyticModule): **kwargs): """ Example: - >>> # xdoctset: +REQUIRES(--slow) + >>> # xdoctest: +REQUIRES(--slow) >>> from netharn.models.deeplab_v3 import * # NOQA >>> from netharn.models.deeplab_v3 import _Bottleneck >>> import netharn as nh @@ -194,7 +194,7 @@ class _Flatten(layers.AnalyticModule): **kwargs): """ Example: - >>> # xdoctset: +REQUIRES(--slow) + >>> # xdoctest: +REQUIRES(--slow) >>> from netharn.models.deeplab_v3 import * # NOQA >>> from netharn.models.deeplab_v3 import _Flatten >>> import netharn as nh @@ -283,7 +283,7 @@ class _ImagePool(layers.AnalyticModule): **kwargs): """ Example: - >>> # xdoctset: +REQUIRES(--slow) + >>> # xdoctest: +REQUIRES(--slow) >>> from netharn.models.deeplab_v3 import * # NOQA >>> from netharn.models.deeplab_v3 import _ImagePool >>> import netharn as nh @@ -351,7 +351,7 @@ class _ASPP(layers.AnalyticModule): **kwargs): """ Example: - >>> # xdoctset: +REQUIRES(--slow) + >>> # xdoctest: +REQUIRES(--slow) >>> from netharn.models.deeplab_v3 import * # NOQA >>> from netharn.models.deeplab_v3 import _ASPP >>> import netharn as nh @@ -388,8 +388,7 @@ class DeepLabV3(layers.Sequential): DeepLab v3: Dilated ResNet with multi-grid + improved ASPP Example: - >>> # xdoctset: +REQUIRES(--slow) - >>> # xdoctset: +SKIP + >>> # xdoctest: +REQUIRES(--slow) >>> from netharn.models.deeplab_v3 import * # NOQA >>> self = DeepLabV3(classes=21).eval() >>> #### diff --git a/netharn/models/dual_path_net.py b/netharn/models/dual_path_net.py index d5b49871d494fd188f3c734d9ca5d7051b91f860..a2645f5b4f124d817846e52bfc66edcec68d6e09 100644 --- a/netharn/models/dual_path_net.py +++ b/netharn/models/dual_path_net.py @@ -9,7 +9,7 @@ References: import torch import torch.nn as nn import torch.nn.functional as F -# from netharn.output_shape_for import OutputShapeFor +# from netharn.analytic.output_shape_for import OutputShapeFor from netharn.layers import ConvNorm2d # __all__ = ['DPN'] diff --git a/netharn/models/efficientnet.py b/netharn/models/efficientnet.py index 7b07deb0ac9705cbab391dd101a9fce967b67598..1fe0021c7f7028bbe2aae2df2dc7c99a387fe25b 100644 --- a/netharn/models/efficientnet.py +++ b/netharn/models/efficientnet.py @@ -17,11 +17,14 @@ from functools import partial from torch.utils import model_zoo -class Conv2dDynamicSamePadding(nn.Conv2d): +class Conv2dDynamicSamePadding(nn.Conv2d, layers.AnalyticModule): """ 2D Convolutions like TensorFlow, for a dynamic image size """ def __init__(self, in_channels, out_channels, kernel_size, stride=1, dilation=1, 
groups=1, bias=True):
-        super(Conv2dDynamicSamePadding, self).__init__(in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias)
+        super(Conv2dDynamicSamePadding, self).__init__(in_channels,
+                                                       out_channels,
+                                                       kernel_size, stride, 0,
+                                                       dilation, groups, bias)
         self.stride = self.stride if len(self.stride) == 2 else [self.stride[0]] * 2

     def forward(self, x):
@@ -44,8 +47,48 @@ class Conv2dDynamicSamePadding(nn.Conv2d):
         else:
             return partial(Conv2dStaticSamePadding, image_size=image_size)

+    def _analytic_forward(self, inputs, _OutputFor, _Output, _Hidden,
+                          **kwargs):
+        """
+        Example:
+            >>> # xdoctest: +REQUIRES(module:ndsampler)
+            >>> from netharn.models.efficientnet import *  # NOQA
+            >>> import netharn as nh
+            >>> kwargs = layers.AnalyticModule._analytic_shape_kw()
+            >>> globals().update(kwargs)
+            >>> inputs = (1, 3, 224, 224)
+            >>> self = Conv2dDynamicSamePadding(2, 3, 5)
+            >>> outputs = self.output_shape_for(inputs)
+            >>> import ubelt as ub
+            >>> print(nh.util.align(ub.repr2(outputs.hidden, nl=-1), ':'))
+        """
+        hidden = _Hidden()
+        x = inputs
+        ih, iw = _OutputFor.shape(x)[-2:]
+        kh, kw = self.weight.size()[-2:]
+        sh, sw = self.stride
+        oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
+        pad_h = max((oh - 1) * self.stride[0] + (kh - 1) * self.dilation[0] + 1 - ih, 0)
+        pad_w = max((ow - 1) * self.stride[1] + (kw - 1) * self.dilation[1] + 1 - iw, 0)
+        if pad_h > 0 or pad_w > 0:
+            pad = [pad_w // 2, pad_w - pad_w // 2,
+                   pad_h // 2, pad_h - pad_h // 2]
+            x = hidden['dynamic_padding'] = _OutputFor(F.pad)(x, pad)
+
+        weight = self.weight
+        bias = self.bias is not None
+        stride = self.stride
+        padding = self.padding
+        dilation = self.dilation
+        groups = self.groups
+
+        y = hidden['conv'] = _OutputFor(F.conv2d)(x, weight, bias, stride,
+                                                  padding, dilation, groups)
+        outputs = _Output.coerce(y, hidden)
+        return outputs

-class Conv2dStaticSamePadding(nn.Conv2d):
+
+class Conv2dStaticSamePadding(nn.Conv2d, layers.AnalyticModule):
     """ 2D Convolutions like TensorFlow, for a fixed image size"""

     def __init__(self, in_channels, out_channels, kernel_size, image_size=None, **kwargs):
@@ -60,6 +103,10 @@ class Conv2dStaticSamePadding(nn.Conv2d):
         oh, ow = int(math.ceil(ih / sh)), int(math.ceil(iw / sw))
         pad_h = max((oh - 1) * self.stride[0] + (kh - 1) * self.dilation[0] + 1 - ih, 0)
         pad_w = max((ow - 1) * self.stride[1] + (kw - 1) * self.dilation[1] + 1 - iw, 0)
+        self.image_size = image_size
+        self._pad = (pad_h, pad_w)
+        self._pad_w = pad_w
+        self._pad_h = pad_h
         if pad_h > 0 or pad_w > 0:
             self.static_padding = nn.ZeroPad2d((pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2))
         else:
@@ -70,6 +117,30 @@ class Conv2dStaticSamePadding(nn.Conv2d):
         x = F.conv2d(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups)
         return x

+    def _analytic_forward(self, inputs, _OutputFor, _Output, _Hidden,
+                          **kwargs):
+        """
+        Example:
+            >>> # xdoctest: +REQUIRES(module:ndsampler)
+            >>> from netharn.models.efficientnet import *  # NOQA
+            >>> import netharn as nh
+            >>> kwargs = layers.AnalyticModule._analytic_shape_kw()
+            >>> globals().update(kwargs)
+            >>> inputs = (1, 3, 224, 224)
+            >>> self = Conv2dStaticSamePadding(2, 3, 5, image_size=[512, 512])
+            >>> outputs = self.output_shape_for(inputs)
+            >>> import ubelt as ub
+            >>> print(nh.util.align(ub.repr2(outputs.hidden, nl=-1), ':'))
+        """
+        hidden = _Hidden()
+        x = inputs
+        x = hidden['static_padding'] = _OutputFor(self.static_padding)(x)
+        y = hidden['conv'] = _OutputFor(F.conv2d)(
+            x, self.weight, self.bias is not None,
self.stride, self.padding, + self.dilation, self.groups) + outputs = _Output.coerce(y, hidden) + return outputs + ################## # Model definition @@ -80,8 +151,8 @@ class MBConvBlock(layers.AnalyticModule): Mobile Inverted Residual Bottleneck Block Args: - block_args (BlockArgs): see above - global_params (GlobalParam): see above + block_args (BlockArgs): see :class:`Details` + global_params (GlobalParam): see :class:`Details` Attributes: has_se (bool): Whether the block contains a Squeeze and Excitation layer. @@ -108,6 +179,8 @@ class MBConvBlock(layers.AnalyticModule): # Depthwise convolution phase k = self._block_args.kernel_size s = self._block_args.stride + # Note: it is important to set the weight decay to be very low for the + # depthwise convolutions self._depthwise_conv = Conv2d( in_channels=oup, out_channels=oup, groups=oup, # groups makes it depthwise kernel_size=k, stride=s, bias=False) @@ -122,9 +195,12 @@ class MBConvBlock(layers.AnalyticModule): # Output phase final_oup = self._block_args.output_filters self._project_conv = Conv2d(in_channels=oup, out_channels=final_oup, kernel_size=1, bias=False) + # Note that the bn2 layer before the residual add, should be + # initailized with gamma=0 self._bn2 = nn.BatchNorm2d(num_features=final_oup, momentum=self._bn_mom, eps=self._bn_eps) - noli = 'swish' - self._swish = layers.rectify_nonlinearity(noli, dim=2) + self._bn2._residual_bn = True + noli = global_params.noli + self._noli = layers.rectify_nonlinearity(noli, dim=2) def forward(self, inputs, drop_connect_rate=None): """ @@ -136,13 +212,13 @@ class MBConvBlock(layers.AnalyticModule): # Expansion and Depthwise Convolution x = inputs if self._block_args.expand_ratio != 1: - x = self._swish(self._bn0(self._expand_conv(inputs))) - x = self._swish(self._bn1(self._depthwise_conv(x))) + x = self._noli(self._bn0(self._expand_conv(inputs))) + x = self._noli(self._bn1(self._depthwise_conv(x))) # Squeeze and Excitation if self.has_se: x_squeezed = F.adaptive_avg_pool2d(x, 1) - x_squeezed = self._se_expand(self._swish(self._se_reduce(x_squeezed))) + x_squeezed = self._se_expand(self._noli(self._se_reduce(x_squeezed))) x = torch.sigmoid(x_squeezed) * x x = self._bn2(self._project_conv(x)) @@ -155,6 +231,69 @@ class MBConvBlock(layers.AnalyticModule): x = x + inputs # skip connection return x + @classmethod + def demo(MBConvBlock): + layer_block_args, global_params = Details.build_efficientnet_params() + block_args = layer_block_args[0] + self = MBConvBlock(block_args, global_params) + return self + + def _analytic_forward(self, inputs, _OutputFor, _Output, _Hidden, + **kwargs): + """ + Example: + >>> # xdoctest: +REQUIRES(module:ndsampler) + >>> from netharn.models.efficientnet import * # NOQA + >>> import netharn as nh + >>> self = MBConvBlock.demo() + >>> kwargs = self._analytic_shape_kw() + >>> globals().update(kwargs) + >>> input_shape = inputs = (1, 32, 224, 224) + >>> outputs = self.output_shape_for(input_shape) + >>> import ubelt as ub + >>> print(nh.util.align(ub.repr2(outputs.hidden, nl=-1), ':')) + """ + hidden = _Hidden() + + # Expansion and Depthwise Convolution + x = inputs + if self._block_args.expand_ratio != 1: + x = hidden['expand_conv'] = _OutputFor(self._expand_conv)(inputs) + x = hidden['_bn0'] = _OutputFor(self._bn0)(x) + x = hidden['_noli0'] = _OutputFor(self._noli)(x) + + x = hidden['depthwise_conv'] = _OutputFor(self._depthwise_conv)(x) + x = hidden['_bn1'] = _OutputFor(self._bn1)(x) + x = hidden['_noli1'] = _OutputFor(self._noli)(x) + + # Squeeze and 
Excitation + if self.has_se: + x_squeezed = hidden['_se_pool'] = _OutputFor(F.adaptive_avg_pool2d)(x, 1) + x_squeezed = hidden['_se_reduce'] = _OutputFor(self._se_reduce)(x_squeezed) + x_squeezed = hidden['_se_noli'] = _OutputFor(self._noli)(x_squeezed) + x_squeezed = hidden['_se_expand'] = _OutputFor(self._se_expand)(x_squeezed) + x_squeezed = hidden['_se_sigmoid'] = _OutputFor(torch.sigmoid)(x_squeezed) + x = hidden['_se_mul'] = _OutputFor.mul(x_squeezed, x) + + x = hidden['_project'] = _OutputFor(self._project_conv)(x) + x = hidden['_bn2'] = _OutputFor(self._bn2)(x) + + # Skip connection and drop connect + input_filters, output_filters = self._block_args.input_filters, self._block_args.output_filters + if self.id_skip and self._block_args.stride == 1 and input_filters == output_filters: + drop_connect_rate = kwargs.get('drop_connect_rate', 0) + if drop_connect_rate: + try: + x = self.drop_connect(x, p=drop_connect_rate) + except Exception: + pass + hidden['drop_connect'] = x + + # skip connection + x = hidden['skip'] = _OutputFor.add(x, inputs) + outputs = _Output.coerce(x, hidden) + return outputs + def drop_connect(self, inputs, p): """ Drop connect. """ if not self.training: @@ -213,6 +352,8 @@ class EfficientNet(layers.AnalyticModule): tmp['classes'] = self.classes.__json__() self._global_params = type(global_params)(**tmp) + self.image_size = self._global_params._asdict()['image_size'] + # import ubelt as ub # print(ub.repr2(self._global_params._asdict(), nl=-4)) # print(ub.repr2(self._global_params._asdict())) @@ -273,8 +414,8 @@ class EfficientNet(layers.AnalyticModule): self._avg_pooling = nn.AdaptiveAvgPool2d(1) self._dropout = nn.Dropout(self._global_params.dropout_rate) self._fc = nn.Linear(out_channels, self._global_params.num_classes) - noli = 'swish' - self._swish = layers.rectify_nonlinearity(noli, dim=2) + noli = global_params.noli + self._noli = layers.rectify_nonlinearity(noli, dim=2) def round_filters(self, filters): """ Calculate and round number of filters based on depth multiplier. 
""" @@ -306,7 +447,7 @@ class EfficientNet(layers.AnalyticModule): """ # Stem x = self._conv_stem(inputs) - x = self._swish(self._bn0(x)) + x = self._noli(self._bn0(x)) # Blocks for idx, block in enumerate(self._blocks): @@ -316,7 +457,7 @@ class EfficientNet(layers.AnalyticModule): x = block(x, drop_connect_rate=drop_connect_rate) # Head - x = self._swish(self._bn1(self._conv_head(x))) + x = self._noli(self._bn1(self._conv_head(x))) return x @@ -325,21 +466,69 @@ class EfficientNet(layers.AnalyticModule): """ Example: >>> # xdoctest: +REQUIRES(module:ndsampler) + >>> import netharn as nh >>> from netharn.models.efficientnet import * # NOQA >>> self = EfficientNet.from_name('efficientnet-b0') >>> kwargs = self._analytic_shape_kw() >>> globals().update(kwargs) >>> inputs = (1, 3, 224, 224) + >>> inputs = (1, 3, 32, 32) + >>> outputs = self.output_shape_for(inputs) + >>> import ubelt as ub + >>> print(nh.util.align(ub.repr2(outputs.hidden.shallow(1), nl=-1), ':')) + + >>> print(nh.util.align(ub.repr2(outputs.hidden['block_0'].shallow(2), nl=-1), ':')) + >>> print(nh.util.align(ub.repr2(outputs.hidden['block_1'].shallow(2), nl=-1), ':')) + >>> print(nh.util.align(ub.repr2(outputs.hidden['block_2'].shallow(2), nl=-1), ':')) + >>> print(nh.util.align(ub.repr2(outputs.hidden['block_3'].shallow(2), nl=-1), ':')) + >>> print(nh.util.align(ub.repr2(outputs.hidden['block_14'].shallow(2), nl=-1), ':')) + >>> print(nh.util.align(ub.repr2(outputs.hidden['block_15'].shallow(2), nl=-1), ':')) + + >>> self = EfficientNet.from_name('efficientnet-b7') + >>> print('self.image_size = {!r}'.format(self.image_size)) + >>> self = EfficientNet.from_name('efficientnet-b6') + >>> print('self.image_size = {!r}'.format(self.image_size)) + >>> self = EfficientNet.from_name('efficientnet-b3') + >>> print('self.image_size = {!r}'.format(self.image_size)) + >>> self = EfficientNet.from_name('efficientnet-b2') + >>> print('self.image_size = {!r}'.format(self.image_size)) + >>> self = EfficientNet.from_name('efficientnet-b1') + >>> print('self.image_size = {!r}'.format(self.image_size)) + >>> self = EfficientNet.from_name('efficientnet-b0') + >>> print('self.image_size = {!r}'.format(self.image_size)) + + >>> inputs = (1, 3, 224, 224) + >>> self = EfficientNet.from_name('efficientnet-b7') + >>> print('self.image_size = {!r}'.format(self.image_size)) + >>> outputs = self.output_shape_for(inputs) + >>> print(nh.util.align(ub.repr2(outputs.hidden.shallow(1), nl=-1), ':')) + + for name, layer in nh.util.trainable_layers(self, names=1): + if hasattr(layer, 'image_size'): + print('name = {!r}'.format(name)) + print('layer = {!r}'.format(layer)) + print('layer.image_size = {!r}'.format(layer.image_size)) + + >>> inputs = (1, 3, 224, 224) + >>> self = EfficientNet.from_name('efficientnet-b0') + >>> outputs = self.output_shape_for(inputs) + >>> print(nh.util.align(ub.repr2(outputs.hidden.shallow(1), nl=-1), ':')) + + for name, layer in nh.util.trainable_layers(self, names=1): + if hasattr(layer, 'image_size'): + print('name = {!r}'.format(name)) + print('layer = {!r}'.format(layer)) + print('layer.image_size = {!r}'.format(layer.image_size)) """ hidden = _Hidden() # NEEDS MORE BACKEND WORK - bs = inputs.size(0) + bs = _OutputFor.shape(inputs)[0] x = inputs x = hidden['_conv_stem'] = _OutputFor(self._conv_stem)(x) - x = hidden['_swish1'] = _OutputFor(self._swish)(x) + x = hidden['_noli1'] = _OutputFor(self._noli)(x) for idx, block in enumerate(self._blocks): drop_connect_rate = self._global_params.drop_connect_rate @@ -348,11 
+537,11 @@ class EfficientNet(layers.AnalyticModule):
             x = hidden['block_{}'.format(idx)] = _OutputFor(block)(
                 x, drop_connect_rate=drop_connect_rate)

-        x = hidden['_swish2'] = _OutputFor(self._swish)(x)
+        x = hidden['_noli2'] = _OutputFor(self._noli)(x)

         # Pooling and final linear layer
         x = _OutputFor(self._avg_pooling)(x)
-        x = _OutputFor(x.view)(bs, -1)
+        x = _OutputFor.view(x, bs, -1)
         x = _OutputFor(self._dropout)(x)
         x = _OutputFor(self._fc)(x)
         outputs = _Output.coerce(x, hidden)
@@ -376,6 +565,17 @@

     # TODO: Analytic forward

+    @classmethod
+    def from_params(cls, width, depth, size, dropout, **override_params):
+        # note: all models have drop connect rate = 0.2
+        blocks_args, global_params = Details.build_efficientnet_params(
+            width_coefficient=width, depth_coefficient=depth,
+            dropout_rate=dropout, image_size=size)
+        global_params = global_params._replace(**override_params)
+        # NOTE: construct and return the model; assumed to mirror from_name
+        self = cls(blocks_args, global_params)
+        return self
+
     @classmethod
     def from_name(EfficientNet, model_name, override_params=None):
         """
@@ -391,17 +588,28 @@ class EfficientNet(layers.AnalyticModule):
         return self

     @classmethod
-    def from_pretrained(EfficientNet, model_name, advprop=False, override_params=None, in_channels=3):
+    def from_pretrained(EfficientNet, model_name, advprop=False,
+                        override_params=None, in_channels=3):
         """
         Initialize the model from a pretrained state

         Example:
             >>> # xdoctest: +REQUIRES(--download)
             >>> # xdoctest: +REQUIRES(module:ndsampler)
-            >>> from netharn.models.efficentnet import *  # NOQA
+            >>> from netharn.models.efficientnet import *  # NOQA
             >>> model = EfficientNet.from_pretrained('efficientnet-b0')
             >>> inputs = torch.rand(1, 3, 224, 224)
             >>> outputs = model.forward(inputs)
+
+            >>> from netharn.models.efficientnet import *  # NOQA
+            >>> model = EfficientNet.from_pretrained('efficientnet-b0', override_params={'noli': 'mish'}, advprop=True)
+            >>> model = EfficientNet.from_pretrained('efficientnet-b1', override_params={'noli': 'mish'}, advprop=True)
+            >>> model = EfficientNet.from_pretrained('efficientnet-b2', override_params={'noli': 'mish'}, advprop=True)
+            >>> model = EfficientNet.from_pretrained('efficientnet-b3', override_params={'noli': 'mish'}, advprop=True)
+            >>> model = EfficientNet.from_pretrained('efficientnet-b4', override_params={'noli': 'mish'}, advprop=True)
+            >>> model = EfficientNet.from_pretrained('efficientnet-b5', override_params={'noli': 'mish'}, advprop=True)
+            >>> model = EfficientNet.from_pretrained('efficientnet-b6', override_params={'noli': 'mish'}, advprop=True)
+            >>> model = EfficientNet.from_pretrained('efficientnet-b7', override_params={'noli': 'mish'}, advprop=True)
         """
         if override_params is None:
             override_params = {}
@@ -430,27 +638,57 @@ class Details(object):
     """
     # Parameters for the entire model (stem, all blocks, and head)
+    # url_map = {
+    #     'efficientnet-b0': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b0-355c32eb.pth',
+    #     'efficientnet-b1': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b1-f1951068.pth',
+    #     'efficientnet-b2': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b2-8bb594d6.pth',
+    #     'efficientnet-b3': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b3-5fb5a3c3.pth',
+    #     'efficientnet-b4': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b4-6ed6700e.pth',
+    #     'efficientnet-b5': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b5-b6417697.pth',
+
+    # # 'efficientnet-b1':
'https://www.dropbox.com/s/6745ear79b1ltkh/efficientnet-b1-ef6aa7.pth?dl=1', + # # 'efficientnet-b2': 'https://www.dropbox.com/s/0dhtv1t5wkjg0iy/efficientnet-b2-7c98aa.pth?dl=1', + # # 'efficientnet-b3': 'https://www.dropbox.com/s/5uqok5gd33fom5p/efficientnet-b3-bdc7f4.pth?dl=1', + # # 'efficientnet-b4': 'https://www.dropbox.com/s/y2nqt750lixs8kc/efficientnet-b4-3e4967.pth?dl=1', + # # 'efficientnet-b5': 'https://www.dropbox.com/s/qxonlu3q02v9i47/efficientnet-b5-4c7978.pth?dl=1', + + # 'efficientnet-b6': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b6-c76e70fd.pth', + # 'efficientnet-b7': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b7-dcc49843.pth', + # } + + # url_map_advprop = { + # 'efficientnet-b0': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b0-b64d5a18.pth', + # 'efficientnet-b1': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b1-0f3ce85a.pth', + # 'efficientnet-b2': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b2-6e9d97e5.pth', + # 'efficientnet-b3': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b3-cdd7c0f4.pth', + # 'efficientnet-b4': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b4-44fb3a87.pth', + # 'efficientnet-b5': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b5-86493f6b.pth', + # 'efficientnet-b6': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b6-ac80338e.pth', + # 'efficientnet-b7': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b7-4652b6dd.pth', + # 'efficientnet-b8': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b8-22a8fe65.pth', + # } + url_map = { - 'efficientnet-b0': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b0-355c32eb.pth', - 'efficientnet-b1': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b1-f1951068.pth', - 'efficientnet-b2': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b2-8bb594d6.pth', - 'efficientnet-b3': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b3-5fb5a3c3.pth', - 'efficientnet-b4': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b4-6ed6700e.pth', - 'efficientnet-b5': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b5-b6417697.pth', - 'efficientnet-b6': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b6-c76e70fd.pth', - 'efficientnet-b7': 'https://publicmodels.blob.core.windows.net/container/aa/efficientnet-b7-dcc49843.pth', + 'efficientnet-b0': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b0-355c32eb.pth', + 'efficientnet-b1': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b1-f1951068.pth', + 'efficientnet-b2': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b2-8bb594d6.pth', + 'efficientnet-b3': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b3-5fb5a3c3.pth', + 'efficientnet-b4': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b4-6ed6700e.pth', + 'efficientnet-b5': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b5-b6417697.pth', + 'efficientnet-b6': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b6-c76e70fd.pth', + 
'efficientnet-b7': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b7-dcc49843.pth',
     }

     url_map_advprop = {
-        'efficientnet-b0': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b0-b64d5a18.pth',
-        'efficientnet-b1': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b1-0f3ce85a.pth',
-        'efficientnet-b2': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b2-6e9d97e5.pth',
-        'efficientnet-b3': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b3-cdd7c0f4.pth',
-        'efficientnet-b4': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b4-44fb3a87.pth',
-        'efficientnet-b5': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b5-86493f6b.pth',
-        'efficientnet-b6': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b6-ac80338e.pth',
-        'efficientnet-b7': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b7-4652b6dd.pth',
-        'efficientnet-b8': 'https://publicmodels.blob.core.windows.net/container/advprop/efficientnet-b8-22a8fe65.pth',
+        'efficientnet-b0': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/adv-efficientnet-b0-b64d5a18.pth',
+        'efficientnet-b1': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/adv-efficientnet-b1-0f3ce85a.pth',
+        'efficientnet-b2': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/adv-efficientnet-b2-6e9d97e5.pth',
+        'efficientnet-b3': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/adv-efficientnet-b3-cdd7c0f4.pth',
+        'efficientnet-b4': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/adv-efficientnet-b4-44fb3a87.pth',
+        'efficientnet-b5': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/adv-efficientnet-b5-86493f6b.pth',
+        'efficientnet-b6': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/adv-efficientnet-b6-ac80338e.pth',
+        'efficientnet-b7': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/adv-efficientnet-b7-4652b6dd.pth',
+        'efficientnet-b8': 'https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/adv-efficientnet-b8-22a8fe65.pth',
     }

     @classmethod
@@ -583,7 +821,7 @@ class Details(object):
             'batch_norm_momentum', 'batch_norm_epsilon', 'dropout_rate',
             'num_classes', 'width_coefficient', 'depth_coefficient',
             'depth_divisor', 'min_depth', 'drop_connect_rate', 'image_size',
-            'classes'])
+            'classes', 'noli'])

         # Change namedtuple defaults
         GlobalParams.__new__.__defaults__ = (None,) * len(GlobalParams._fields)
@@ -597,7 +835,7 @@ class Details(object):
        Creates EfficientNet parameters

        Example:
-            Details.build_efficientnet_params(0.1, 0.1, image_size=512)
+            Details.build_efficientnet_params(None, None, image_size=512)
        """

        blocks_args = [
@@ -613,7 +851,6 @@ class Details(object):
            batch_norm_epsilon=1e-3,
            dropout_rate=dropout_rate,
            drop_connect_rate=drop_connect_rate,
-            # data_format='channels_last',  # removed, this is always true in PyTorch
            num_classes=num_classes,
            width_coefficient=width_coefficient,
            depth_coefficient=depth_coefficient,
@@ -621,8 +858,8 @@ class Details(object):
            min_depth=None,
            image_size=image_size,
            classes=None,
+            noli='swish'
        )
-
        return blocks_args, global_params

     @staticmethod
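NOTE: the SAME-padding convolutions patched above (Conv2dDynamicSamePadding, Conv2dStaticSamePadding, and their new _analytic_forward methods) all hinge on one piece of arithmetic: pad just enough that the output spatial size is ceil(input / stride). A minimal standalone sketch of that arithmetic, mirroring the expressions in the patch (the `same_pad` helper name is ours for illustration, not part of netharn):

    import math

    def same_pad(ih, iw, kh, kw, sh, sw, dh=1, dw=1):
        # TF-style SAME padding: smallest pad so output dims are ceil(in / stride)
        oh, ow = math.ceil(ih / sh), math.ceil(iw / sw)
        pad_h = max((oh - 1) * sh + (kh - 1) * dh + 1 - ih, 0)
        pad_w = max((ow - 1) * sw + (kw - 1) * dw + 1 - iw, 0)
        # asymmetric split with the extra pixel on the bottom/right,
        # in the same [left, right, top, bottom] order F.pad uses above
        return [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2]

    # A 5x5 kernel at stride 2 on a 224x224 input should produce 112x112
    left, right, top, bot = same_pad(224, 224, 5, 5, 2, 2)
    oh = ((224 + top + bot) - (5 - 1) - 1) // 2 + 1
    assert oh == math.ceil(224 / 2) == 112

The analytic shape methods compute exactly these quantities symbolically, which is why they can predict output sizes without ever running the layer.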
diff --git a/netharn/models/se_resnet.py b/netharn/models/se_resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c4b8ff8c252427e3df932d94b66fe6436124555
--- /dev/null
+++ b/netharn/models/se_resnet.py
@@ -0,0 +1,317 @@
+"""
+References:
+    https://github.com/moskomule/senet.pytorch/blob/master/senet/se_resnet.py
+"""
+import torch.nn as nn
+from torch.hub import load_state_dict_from_url
+from torchvision.models import ResNet
+
+
+class SELayer(nn.Module):
+    def __init__(self, channel, reduction=16):
+        super(SELayer, self).__init__()
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        self.fc = nn.Sequential(
+            nn.Linear(channel, channel // reduction, bias=False),
+            nn.ReLU(inplace=True),
+            nn.Linear(channel // reduction, channel, bias=False),
+            nn.Sigmoid()
+        )
+
+    def forward(self, x):
+        b, c, _, _ = x.size()
+        y = self.avg_pool(x).view(b, c)
+        y = self.fc(y).view(b, c, 1, 1)
+        return x * y.expand_as(x)
+
+
+def conv3x3(in_planes, out_planes, stride=1):
+    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False)
+
+
+class SEBasicBlock(nn.Module):
+    expansion = 1
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
+                 base_width=64, dilation=1, norm_layer=None,
+                 *, reduction=16):
+        super(SEBasicBlock, self).__init__()
+        self.conv1 = conv3x3(inplanes, planes, stride)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.conv2 = conv3x3(planes, planes, 1)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.se = SELayer(planes, reduction)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        residual = x
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.se(out)
+
+        if self.downsample is not None:
+            residual = self.downsample(x)
+
+        out += residual
+        out = self.relu(out)
+
+        return out
+
+
+class SEBottleneck(nn.Module):
+    expansion = 4
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
+                 base_width=64, dilation=1, norm_layer=None,
+                 *, reduction=16):
+        super(SEBottleneck, self).__init__()
+        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
+                               padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(planes * 4)
+        self.relu = nn.ReLU(inplace=True)
+        self.se = SELayer(planes * 4, reduction)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        residual = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.relu(out)
+
+        out = self.conv3(out)
+        out = self.bn3(out)
+        out = self.se(out)
+
+        if self.downsample is not None:
+            residual = self.downsample(x)
+
+        out += residual
+        out = self.relu(out)
+
+        return out
+
+
+def se_resnet18(num_classes=1000):
+    """Constructs a SE-ResNet-18 model.
+
+    Args:
+        num_classes (int): number of output classes
+    """
+    model = ResNet(SEBasicBlock, [2, 2, 2, 2], num_classes=num_classes)
+    model.avgpool = nn.AdaptiveAvgPool2d(1)
+    return model
+
+
+def se_resnet34(num_classes=1000):
+    """Constructs a SE-ResNet-34 model.
+
+    Args:
+        num_classes (int): number of output classes
+    """
+    model = ResNet(SEBasicBlock, [3, 4, 6, 3], num_classes=num_classes)
+    model.avgpool = nn.AdaptiveAvgPool2d(1)
+    return model
+
+
+def se_resnet50(num_classes=1000, pretrained=False):
+    """Constructs a SE-ResNet-50 model.
+
+    Args:
+        num_classes (int): number of output classes
+        pretrained (bool): If True, returns a model pre-trained on ImageNet
+    """
+    model = ResNet(SEBottleneck, [3, 4, 6, 3], num_classes=num_classes)
+    model.avgpool = nn.AdaptiveAvgPool2d(1)
+    if pretrained:
+        model.load_state_dict(load_state_dict_from_url(
+            "https://github.com/moskomule/senet.pytorch/releases/download/archive/seresnet50-60a8950a85b2b.pkl"))
+    return model
+
+
+def se_resnet101(num_classes=1000):
+    """Constructs a SE-ResNet-101 model.
+
+    Args:
+        num_classes (int): number of output classes
+    """
+    model = ResNet(SEBottleneck, [3, 4, 23, 3], num_classes=num_classes)
+    model.avgpool = nn.AdaptiveAvgPool2d(1)
+    return model
+
+
+def se_resnet152(num_classes=1000):
+    """Constructs a SE-ResNet-152 model.
+
+    Args:
+        num_classes (int): number of output classes
+    """
+    model = ResNet(SEBottleneck, [3, 8, 36, 3], num_classes=num_classes)
+    model.avgpool = nn.AdaptiveAvgPool2d(1)
+    return model
+
+
+class CifarSEBasicBlock(nn.Module):
+    def __init__(self, inplanes, planes, stride=1, reduction=16):
+        super(CifarSEBasicBlock, self).__init__()
+        self.conv1 = conv3x3(inplanes, planes, stride)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.conv2 = conv3x3(planes, planes)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.se = SELayer(planes, reduction)
+        if inplanes != planes:
+            self.downsample = nn.Sequential(nn.Conv2d(inplanes, planes, kernel_size=1, stride=stride, bias=False),
+                                            nn.BatchNorm2d(planes))
+        else:
+            self.downsample = lambda x: x
+        self.stride = stride
+
+    def forward(self, x):
+        residual = self.downsample(x)
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.se(out)
+
+        out += residual
+        out = self.relu(out)
+
+        return out
+
+
+class CifarSEResNet(nn.Module):
+    def __init__(self, block, n_size, num_classes=10, reduction=16):
+        super(CifarSEResNet, self).__init__()
+        self.inplane = 16
+        self.conv1 = nn.Conv2d(
+            3, self.inplane, kernel_size=3, stride=1, padding=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(self.inplane)
+        self.relu = nn.ReLU(inplace=True)
+        self.layer1 = self._make_layer(
+            block, 16, blocks=n_size, stride=1, reduction=reduction)
+        self.layer2 = self._make_layer(
+            block, 32, blocks=n_size, stride=2, reduction=reduction)
+        self.layer3 = self._make_layer(
+            block, 64, blocks=n_size, stride=2, reduction=reduction)
+        self.avgpool = nn.AdaptiveAvgPool2d(1)
+        self.fc = nn.Linear(64, num_classes)
+        self.initialize()
+
+    def initialize(self):
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight)
+            elif isinstance(m, nn.BatchNorm2d):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+
+    def _make_layer(self, block, planes, blocks, stride, reduction):
+        strides = [stride] + [1] * (blocks - 1)
+        layers = []
+        for stride in strides:
+            layers.append(block(self.inplane, planes, stride, reduction))
+            self.inplane = planes
+
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+
+        x = self.avgpool(x)
+        x = x.view(x.size(0), -1)
+        x = self.fc(x)
+        return x
+
+
+class CifarSEPreActResNet(CifarSEResNet):
+    def __init__(self, block, n_size, num_classes=10, reduction=16):
+        super(CifarSEPreActResNet, self).__init__(
+            block, n_size, num_classes, reduction)
+        self.bn1 = nn.BatchNorm2d(self.inplane)
+        self.initialize()
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+
+        x = self.bn1(x)
+        x = self.relu(x)
+
+        x = self.avgpool(x)
+        x = x.view(x.size(0), -1)
+        x = self.fc(x)
+        return x
+
+
+def se_resnet20(**kwargs):
+    """Constructs a SE-ResNet-20 model for CIFAR.
+
+    """
+    model = CifarSEResNet(CifarSEBasicBlock, 3, **kwargs)
+    return model
+
+
+def se_resnet32(**kwargs):
+    """Constructs a SE-ResNet-32 model for CIFAR.
+
+    """
+    model = CifarSEResNet(CifarSEBasicBlock, 5, **kwargs)
+    return model
+
+
+def se_resnet56(**kwargs):
+    """Constructs a SE-ResNet-56 model for CIFAR.
+
+    """
+    model = CifarSEResNet(CifarSEBasicBlock, 9, **kwargs)
+    return model
+
+
+def se_preactresnet20(**kwargs):
+    """Constructs a SE-PreAct-ResNet-20 model for CIFAR.
+
+    """
+    model = CifarSEPreActResNet(CifarSEBasicBlock, 3, **kwargs)
+    return model
+
+
+def se_preactresnet32(**kwargs):
+    """Constructs a SE-PreAct-ResNet-32 model for CIFAR.
+
+    """
+    model = CifarSEPreActResNet(CifarSEBasicBlock, 5, **kwargs)
+    return model
+
+
+def se_preactresnet56(**kwargs):
+    """Constructs a SE-PreAct-ResNet-56 model for CIFAR.
+
+    """
+    model = CifarSEPreActResNet(CifarSEBasicBlock, 9, **kwargs)
+    return model
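NOTE: the heart of the new se_resnet.py module above is the squeeze-and-excitation recalibration in SELayer. A self-contained sketch of what that forward pass does to a feature map (the shapes are arbitrary picks for illustration; the module composition is copied from SELayer itself):

    import torch
    import torch.nn as nn

    channel, reduction = 32, 16
    avg_pool = nn.AdaptiveAvgPool2d(1)
    fc = nn.Sequential(
        nn.Linear(channel, channel // reduction, bias=False),
        nn.ReLU(inplace=True),
        nn.Linear(channel // reduction, channel, bias=False),
        nn.Sigmoid(),
    )
    x = torch.rand(2, channel, 8, 8)
    b, c, _, _ = x.size()
    y = avg_pool(x).view(b, c)   # squeeze: one descriptor per channel -> (2, 32)
    y = fc(y).view(b, c, 1, 1)   # excite: per-channel gates in (0, 1)
    out = x * y.expand_as(x)     # recalibrate the original feature map
    assert out.shape == x.shape

The residual blocks apply this gate after their final BatchNorm and before adding the skip connection.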
diff --git a/netharn/models/yolo2/yolo2.py b/netharn/models/yolo2/yolo2.py
index 144dfa16dedc0d120d2cb57bbada34442ba06d6b..d1b7e6b733d5d9956e14c9c72c73dc12a284e66d 100644
--- a/netharn/models/yolo2/yolo2.py
+++ b/netharn/models/yolo2/yolo2.py
@@ -259,11 +259,11 @@ class Yolo2(layers.AnalyticModule):
         >>> batch_dets = self.coder.decode_batch(output)
         >>> dets = batch_dets[0]
         >>> # xdoc: +REQUIRES(--show)
-        >>> import netharn as nh
-        >>> nh.util.autompl()  # xdoc: +SKIP
-        >>> nh.util.imshow(inputs[0], colorspace='rgb', fnum=1, doclf=True)
+        >>> import kwplot
+        >>> kwplot.autompl()  # xdoc: +SKIP
+        >>> kwplot.imshow(inputs[0], colorspace='rgb', fnum=1, doclf=True)
         >>> dets.scale(inputs.shape[-2:][::-1]).draw()
-        >>> nh.util.show_if_requested()
+        >>> kwplot.show_if_requested()
         """
         normed = self.input_norm(inputs)
         out0 = self.layers[0](normed)
@@ -451,10 +451,11 @@ class YoloCoder(object):
         >>> batch_dets = self.decode_batch(output)
         >>> dets = batch_dets[0]
         >>> # xdoctest: +REQUIRES(--show)
-        >>> nh.util.figure(fnum=1, doclf=True)
-        >>> nh.util.imshow(info['rgb255'], colorspace='rgb')
+        >>> import kwplot
+        >>> kwplot.figure(fnum=1, doclf=True)
+        >>> kwplot.imshow(info['rgb255'], colorspace='rgb')
         >>> dets.scale(info['orig_sizes'][0]).draw()
-        >>> nh.util.show_if_requested()
+        >>> kwplot.show_if_requested()
         """
         import kwimage
         # don't modify inplace
@@ -709,12 +710,13 @@ class YoloLoss(layers.common.Loss):
         >>> loss_parts = self.forward(output, target)
         >>> print('loss_parts = {!r}'.format(loss_parts))
         >>> # xdoctest: +REQUIRES(--show)
-        >>> nh.util.figure(fnum=1, doclf=True)
+        >>> import kwplot
+        >>> kwplot.figure(fnum=1, doclf=True)
         >>> sf = info['orig_sizes'][0]
         >>> dets.boxes.scale(sf, inplace=True)
-        >>> nh.util.imshow(info['rgb255'], colorspace='rgb')
+        >>> kwplot.imshow(info['rgb255'], colorspace='rgb')
         >>> dets.draw()
-        >>> nh.util.show_if_requested()
+        >>> kwplot.show_if_requested()
         """
         class_energy = output['class_energy']
         score_energy = output['score_energy']
@@ -1063,10 +1065,10 @@ def find_anchors(dset):
         >>> xy = -anchors / 2
         >>> wh = anchors
         >>> show_boxes = np.hstack([xy, wh])
-        >>> import netharn as nh
-        >>> nh.util.figure(doclf=True, fnum=1)
-        >>> nh.util.autompl()  # xdoc: +SKIP
-        >>> nh.util.draw_boxes(show_boxes, box_format='tlwh')
+        >>> import kwplot
+        >>> kwplot.autompl()  # xdoc: +SKIP
+        >>> kwplot.figure(doclf=True, fnum=1)
+        >>> kwplot.draw_boxes(show_boxes, box_format='tlwh')
         >>> from matplotlib import pyplot as plt
         >>> plt.gca().set_xlim(xy.min() - 1, wh.max() / 2 + 1)
         >>> plt.gca().set_ylim(xy.min() - 1, wh.max() / 2 + 1)
@@ -1147,10 +1148,10 @@ def initial_imagenet_weights():


 def demo_image(inp_size):
-    from netharn import util
+    import kwimage
     import numpy as np
     import cv2
-    rgb255 = util.grab_test_image('astro', 'rgb')
+    rgb255 = kwimage.grab_test_image('astro', 'rgb')
     rgb01 = cv2.resize(rgb255, inp_size).astype(np.float32) / 255
     im_data = torch.FloatTensor([rgb01.transpose(2, 0, 1)])
     return im_data, rgb255
diff --git a/netharn/monitor.py b/netharn/monitor.py
index ff43be1a4b46387881129938fc04668fb9e4db10..34964273a7a595f420956b72d7495732ac44100c 100644
--- a/netharn/monitor.py
+++ b/netharn/monitor.py
@@ -18,7 +18,7 @@ def demodata_monitor():
     n = 300
     losses = (sorted(rng.randint(10, n, size=n)) + rng.randint(0, 20, size=n) - 10)[::-1]
     mious = (sorted(rng.randint(10, n, size=n)) + rng.randint(0, 20, size=n) - 10)
-    monitor = Monitor(minimize=['loss'], maximize=['miou'], smoothing=.6)
+    monitor = Monitor(minimize=['loss'], maximize=['miou'], smoothing=0.0)
     for epoch, (loss, miou) in enumerate(zip(losses, mious)):
         monitor.update(epoch, {'loss': loss, 'miou': miou})
     return monitor
@@ -34,8 +34,7 @@ class Monitor(ub.NiceRepr):
     Attributes:
         minimize (List[str]): measures where a lower is better
        maximize (List[str]): measures where a higher is better
-        smoothing (float): smoothness factor for the moving averages.
-            Currently 0.6, we may change the default to 0.0 in the future.
+        smoothing (float, default=0.0): smoothness factor for moving averages.
         max_epoch (int, default=1000): number of epochs to stop after
         patience (int, default=None): if specified, the number of epochs to
             wait before quitting if the quality metrics are not improving.
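NOTE: on the smoothing default change above: the monitor's internal update rule is not shown in this patch, but a "smoothness factor for moving averages" conventionally acts as an exponential-moving-average coefficient. A hedged sketch of that interpretation (the `smoothed` helper is illustrative only, not netharn API):

    def smoothed(prev, new, smoothing=0.0):
        # smoothing=0.0 tracks the raw metric; larger values weight history more
        if prev is None:
            return new
        return smoothing * prev + (1.0 - smoothing) * new

    ema = None
    for raw_loss in [1.0, 0.8, 0.9, 0.6]:
        ema = smoothed(ema, raw_loss, smoothing=0.6)

With the new default of 0.0, monitored values track the raw per-epoch metrics, which is consistent with the updated best_epochs doctest output below.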
@@ -56,7 +55,7 @@ class Monitor(ub.NiceRepr): >>> monitor.show() """ - def __init__(monitor, minimize=['loss'], maximize=[], smoothing=0.6, + def __init__(monitor, minimize=['loss'], maximize=[], smoothing=0.0, patience=None, max_epoch=1000, min_lr=None): # Internal attributes @@ -321,7 +320,7 @@ class Monitor(ub.NiceRepr): Example: >>> from netharn.monitor import * - >>> monitor = Monitor() + >>> monitor = Monitor(smoothing=0.6) >>> print(monitor.message(ansi=False)) vloss is unevaluated >>> monitor.update(0, {'loss': 1.0}) @@ -378,8 +377,8 @@ class Monitor(ub.NiceRepr): >>> metric_ranks = monitor.best_epochs(5) >>> print(ub.repr2(metric_ranks, with_dtype=False, nl=1)) { - 'loss': np.array([297, 299, 298, 296, 295]), - 'miou': np.array([299, 298, 297, 296, 295]), + 'loss': np.array([297, 296, 299, 295, 298]), + 'miou': np.array([299, 296, 298, 295, 292]), } """ metric_ranks = {} diff --git a/netharn/output_shape_for.py b/netharn/output_shape_for.py index 6b8ff71e758a67e8264ea1dd35f2afde6a0ce108..c59850ab429568e18ef584a3316c8d10cef81fdd 100644 --- a/netharn/output_shape_for.py +++ b/netharn/output_shape_for.py @@ -1,1094 +1,3 @@ -# -*- coding: utf-8 -*- -from __future__ import absolute_import, division, print_function, unicode_literals -import ubelt as ub -import math -import torch -import torch.nn as nn -import torchvision -from collections import OrderedDict -from six.moves import builtins -from netharn import analytic_for -# try: -from netharn.device import DataSerial -# except ImportError: -# DataSerial = None - -REGISTERED_TYPES = [] - - -SHAPE_CLS = tuple # We exepct shapes to be specified as this class - - -def compute_type(*types): - def _wrap(func): - for type in types: - if type is not None: - REGISTERED_TYPES.append((type, func)) - return func - return _wrap - - -def output_shape_of(outputs): - """ - Given a network output, try and find the shape. Works in most standard - cases, but not all cases. - - Args: - outputs (Tensor | Dict | Tuple): some typical torch network output - - Example: - >>> output_shape_of(torch.empty(3, 2)) - (3, 2) - >>> output_shape_of({'a': torch.empty(3, 2)}) - {'a': (3, 2)} - >>> output_shape_of(((torch.empty(3, 2),),)) - [[(3, 2)]] - """ - if torch.is_tensor(outputs): - computed_output_shape = SHAPE_CLS(outputs.shape) - elif isinstance(outputs, dict): - dict_cls = outputs.__class__ # handle odict - computed_output_shape = dict_cls([ - (k, output_shape_of(v)) for k, v in outputs.items()]) - elif isinstance(outputs, tuple): - # Allow outputs to be a tuple of tensors - computed_output_shape = [output_shape_of(o) for o in outputs] - else: - raise TypeError('Cannot find shape of {!r}'.format(type(outputs))) - return computed_output_shape - - -def _brute_force_output_shape_for(self, input_shape): - """ - Computes output shape by actually running the network. Works in most - standard cases, but not all cases. If the batch size is None, we attempt to - be smart about ensuring that that None is propogated in the output. 
- - Example: - >>> module = nn.Conv2d(3, 11, 3, 1, 0) - >>> _brute_force_output_shape_for(module, (None, 3, 256, 256)) - (None, 11, 254, 254) - """ - _input_shape = list(input_shape) - unknown_bsize = _input_shape[0] is None - if unknown_bsize: - bsize = 2 - _input_shape[0] = bsize - device = next(iter(self.state_dict().values())).device - dummy_input = torch.rand(*_input_shape).to(device) - dummy_output = self(dummy_input) - output_shape = output_shape_of(dummy_output) - if torch.is_tensor(dummy_output): - if unknown_bsize: - if output_shape[0] == bsize: - output_shape = list(output_shape) - output_shape[0] = None - output_shape = SHAPE_CLS(output_shape) - else: - raise NotImplementedError('other output types') - return output_shape - - -def _simplify(shape): - import sympy - if isinstance(shape, (tuple, list)): - shape = shape.__class__([_simplify(v) for v in shape]) - elif isinstance(shape, dict): - shape = shape.__class__([(k, _simplify(v)) for k, v in shape.items()]) - elif isinstance(shape, sympy.Expr): - shape = sympy.simplify(shape) - return shape - - -class HiddenShapes(analytic_for.Hidden): - """ - Augments normal hidden shape dicts with a convinience setitem - - Doctest: - >>> from netharn.output_shape_for import * - >>> shape = OutputShape.coerce([None, 3, 32, 32], 'foo') - >>> print(HiddenShapes({'e': shape})) - - >>> hidden = HiddenShapes({'a': 1}) - >>> hidden['b'] = 2 - >>> hidden['c'] = shape - >>> print(hidden) - - """ - pass - - -# class HiddenShapes(OrderedDict, ub.NiceRepr): -# """ -# Augments normal hidden shape dicts with a convinience setitem - -# Doctest: -# >>> from netharn.output_shape_for import * -# >>> shape = OutputShape.coerce([None, 3, 32, 32], 'foo') -# >>> print(HiddenShapes({'e': shape})) -# -# >>> hidden = HiddenShapes({'a': 1}) -# >>> hidden['b'] = 2 -# >>> hidden['c'] = shape -# >>> print(hidden) -# -# """ -# def __nice__(self): -# return ub.repr2(self, nl=0) - -# def __str__(self): -# return ub.NiceRepr.__str__(self) - -# def __repr__(self): -# return ub.NiceRepr.__repr__(self) - -# def __setitem__(self, key, value): -# if getattr(value, 'hidden', None) is not None: -# # When setting a value to an OutputShape object, if that object has -# # a hidden shape, then use that instead. 
-# value = value.hidden -# return OrderedDict.__setitem__(self, key, value) - -# def shallow(self, n=1): -# """ -# Grabs only the shallowest n layers of hidden shapes -# """ -# if n == 0: -# last = self -# while isinstance(last, HiddenShapes): -# values = list(last.values()) -# if len(values): -# last = values[-1] -# else: -# break -# return last -# else: -# output = OrderedDict() -# for key, value in self.items(): -# # if isinstance(value, HiddenShapes): -# if hasattr(value, 'shallow'): -# value = value.shallow(n - 1) -# output[key] = value -# return output - - -class OutputShape(analytic_for.Output): - """ - Mixin class to extend output shapes with extra information - - Doctest: - >>> from netharn.output_shape_for import * - >>> shape = OutputShape.coerce([None, 3, 32, 32], 'foo') - >>> print('shape = {!r}'.format(shape)) - shape = (None, 3, 32, 32) - >>> print('shape.hidden = {!r}'.format(shape.hidden)) - shape.hidden = 'foo' - """ - def __init__(self, data=None, hidden=None): - self.data = data - self.hidden = hidden - - @classmethod - def template(cls, type): - """ Get a specific template for a subclass type """ - if type is tuple: - return OutputShapeTuple - elif type is OrderedDict: - return OutputShapeDict - elif type is dict: - return OutputShapeDict - else: - raise TypeError(type) - - @classmethod - def coerce(cls, data=None, hidden=None): - """ - Create an OutputShape instance of the approriate subclass given the - type of input data. - """ - if isinstance(data, cls): - if hidden is None: - self = data - else: - self = data.__class__(data, hidden) - elif isinstance(data, (tuple, list)): - self = cls.template(tuple)(data, hidden) - elif isinstance(data, dict): - self = cls.template(dict)(data, hidden) - else: - raise TypeError(type(data)) - return self - - -class OutputShapeTuple(tuple, OutputShape): - """ OutputShape templated as a tuple """ - def __new__(cls, data=None, hidden=None): - # tuple subclass is a bit weird - if data is None: - data = tuple() - self = tuple.__new__(OutputShapeTuple, data) - OutputShape.__init__(self, data, hidden) - return self - - -class OutputShapeDict(OrderedDict, OutputShape): - """ OutputShape templated as a dictionary """ - def __init__(self, data=None, hidden=None): - if data is None: - data = OrderedDict() - OrderedDict.__init__(self, data) - OutputShape.__init__(self, data, hidden) - - -class OutputShapeFor(analytic_for.OutputFor): - """ - Compute the output shape for standard torch modules as well as - any custom modules that follow the OutputShapeFor protocol. - - Notes: - The OutputShapeFor protocol is simple. For any custom torch module - define the method `output_shape_for(self, input_shape)`, which is - typically written to mirror the `forward` function. Instead of calling - forward on the custom module's torch members use `OutputShapeFor`. See - netharn.layers for more examples of custom layers that implement this - protocol. A simple example is shown below. 
- - Example: - >>> # Example showing how to implement the OutputShapeFor protocol - >>> class MyCustomNet(nn.Module): - >>> def __init__(self): - >>> super(MyCustomNet, self).__init__() - >>> self.conv1 = nn.Conv2d(1, 5, 3) - >>> self.pool1 = nn.MaxPool2d(2) - >>> self.conv2 = nn.Conv2d(5, 7, 3) - >>> def forward(self, input): - >>> x = input - >>> x = self.conv1(x) - >>> x = self.pool1(x) - >>> x = self.conv2(x) - >>> return x - >>> def output_shape_for(self, input_shape): - >>> x = input_shape - >>> # Note using hidden shapes is optional, but sometimes useful - >>> hidden = HiddenShapes() - >>> # The basic idea is to simply mirror the forward func - >>> # but instead of calling the modules use output shape for - >>> hidden['conv1'] = x = OutputShapeFor(self.conv1)(x) - >>> hidden['pool1'] = x = OutputShapeFor(self.pool1)(x) - >>> hidden['conv2'] = x = OutputShapeFor(self.conv2)(x) - >>> shape = OutputShape.coerce(x, hidden) - >>> return shape - >>> net = MyCustomNet() - >>> # Now it is very easy and efficient to infer the output shape - >>> input_shape = (None, 1, 9, 9) - >>> net.output_shape_for(input_shape) - (None, 7, 1, 1) - >>> # The OutputShapeFor class now recognizes your module as well - >>> # so it can be used to constuct more complex modules while - >>> # still maintaining the ability fo infer the output shape. - >>> OutputShapeFor(net)(input_shape) - (None, 7, 1, 1) - >>> # Note that if you did return an true OutputShape object with - >>> # a populated hidden shape attribute, then you can access it - >>> # to inspect how the shape changes in the hidden layer of the net - >>> print(OutputShapeFor(net)(input_shape).hidden) - - - Example: - >>> # Example showing how this class is used on basic torch Modules - >>> module = nn.Conv2d(3, 11, 3, 1, 0) - >>> OutputShapeFor(module)((1, 3, 256, 256)) - (1, 11, 254, 254) - """ - math = math # for hacking in sympy - - def __init__(self, module, force=False): - """ - Args: - module (nn.Module) : module with output_shape_for func or - with some known registered type (e.g. torch.nn.Conv2d). - - force (bool): if True and no implicit computation is known - try to create a dummy input with input_shape and simply - run it through the network to see what shape it produces. - (Defaults to False). - """ - self._requires_force = False - self.module = module - # First try to lookup the output_shape_for func - self._func = getattr(module, 'output_shape_for', None) - - if self._func is None: - # Lookup shape func if we can't find it - found = [] - for type, _func in REGISTERED_TYPES: - try: - if module is type or isinstance(module, type): - found.append(_func) - except TypeError: - pass - if len(set(found)) == 1: - self._func = found[0] - elif len(found) == 0: - raise TypeError('Unknown (output_shape) module type {}'.format(module)) - else: - raise AssertionError('Ambiguous (output_shape) module {}. 
Found {}'.format(module, found)) - - def __call__(self, *args, **kwargs): - if isinstance(self.module, nn.Module): - # bound methods dont need module - is_bound = hasattr(self._func, '__func__') and getattr(self._func, '__func__', None) is not None - is_bound |= hasattr(self._func, 'im_func') and getattr(self._func, 'im_func', None) is not None - if is_bound: - output_shape = self._func(*args, **kwargs) - else: - # nn.Module with state - output_shape = self._func(self.module, *args, **kwargs) - else: - # a simple pytorch func - output_shape = self._func(*args, **kwargs) - - # Package the output shape up in the appropriate wrapper class - output_shape = OutputShape.coerce(output_shape) - # if self.math.__name__ == 'sympy': - # output_shape = _simplify(output_shape) - # debug = True - # if debug: - # print('{}.output_shape = {}'.format(str(self._func.__name__), output_shape)) - return output_shape - - def _check_consistency(self, input_shape, **kwargs): - """ - Test function to check that expected shape is equal to computed shape. - The kwargs are passed to both output_shape_for and forward, so ensure - that both functions accept the same arguments. - """ - # Run the output shape computation - expected = self(input_shape, **kwargs) - - if isinstance(expected, OutputShape): - expected_output_shape = expected.data - else: - expected_output_shape = expected - - # Create dummy inputs and send them through the network - inputs = torch.randn(input_shape) - with torch.no_grad(): - self.module.eval() - outputs = self.module(inputs, **kwargs) - - if isinstance(outputs, dict): - if not isinstance(expected_output_shape, dict): - raise AssertionError(( - 'if outputs is a dict, then output_shape must also be ' - 'a corresponding dict. Instead we got: ' - 'type(outputs)={} ' - 'type(expected_output_shape)={} ' - ).format(type(outputs), type(expected_output_shape))) - computed_output_shape = output_shape_of(outputs) - - if computed_output_shape != expected_output_shape: - print('expected_output_shape = {}'.format(ub.repr2(expected_output_shape, nl=0))) - print('computed_output_shape = {}'.format(ub.repr2(computed_output_shape, nl=0))) - raise AssertionError( - 'computed shape {!r} != expected shape {!r}'.format( - computed_output_shape, - expected_output_shape, - ) - ) - return expected_output_shape - - @staticmethod - @compute_type(nn.Upsample) - def Upsample(module, input_shape): - r""" - - Input: :math:`(N, C, H_{in}, W_{in})` - - Output: :math:`(N, C, H_{out}, W_{out})` where - :math:`H_{out} = floor(H_{in} * scale\_factor)` - :math:`W_{out} = floor(W_{in} * scale\_factor)` - - Example: - >>> # xdoctest: +SKIP - >>> # There is a torch bug in 1.1.0 that prevents this from working - >>> from netharn.output_shape_for import * - >>> input_shape = (1, 3, 256, 256, 256) - >>> module = nn.Upsample(scale_factor=(2, 3, 4)) - >>> output_shape = OutputShapeFor(module)(input_shape) - >>> print('output_shape = {!r}'.format(output_shape)) - output_shape = (1, 3, 512, 768, 1024) - >>> module = nn.Upsample(size=100) - >>> output_shape = OutputShapeFor(module)(input_shape) - >>> print('output_shape = {!r}'.format(output_shape)) - output_shape = (1, 3, 100, 100, 100) - >>> input_shape = (1, 3, 256, 256) - >>> module = nn.UpsamplingBilinear2d(scale_factor=2) - >>> output_shape = OutputShapeFor(module)(input_shape) - >>> print('output_shape = {!r}'.format(output_shape)) - output_shape = (1, 3, 512, 512) - """ - math = OutputShapeFor.math - # N, C, *DIMS_in = input_shape - N, C = input_shape[0:2] - DIMS_in = 
input_shape[2:] - - if module.size is None: - scale_factor = ensure_iterablen(module.scale_factor, len(DIMS_in)) - int = builtins.int if math.__name__ == 'math' else ub.identity - DIMS_out = [ - int(math.floor(D_in * scale_factor[i])) - for i, D_in in enumerate(DIMS_in) - ] - else: - DIMS_out = ensure_iterablen(module.size, len(DIMS_in)) - - output_shape = SHAPE_CLS([N, C] + list(DIMS_out)) - if math.__name__ == 'sympy': - output_shape = _simplify(output_shape) - return output_shape - - @staticmethod - @compute_type(torch.nn.functional.interpolate) - def interpolate(input_shape, size=None, scale_factor=None, **kwargs): - """ - Example: - >>> from netharn.output_shape_for import * - >>> input_shape = (1, 3, 256, 256) - >>> output_shape = OutputShapeFor(torch.nn.functional.interpolate)(input_shape, size=(32, 32)) - >>> print('output_shape = {!r}'.format(output_shape)) - output_shape = (1, 3, 32, 32) - """ - math = OutputShapeFor.math - # N, C, *DIMS_in = input_shape - N, C = input_shape[0:2] - DIMS_in = input_shape[2:] - - if size is None: - scale_factor = ensure_iterablen(scale_factor, len(DIMS_in)) - int = builtins.int if math.__name__ == 'math' else ub.identity - DIMS_out = [ - int(math.floor(D_in * scale_factor[i])) - for i, D_in in enumerate(DIMS_in) - ] - else: - DIMS_out = ensure_iterablen(size, len(DIMS_in)) - - output_shape = SHAPE_CLS([N, C] + list(DIMS_out)) - if math.__name__ == 'sympy': - output_shape = _simplify(output_shape) - return output_shape - - @staticmethod - @compute_type(nn.ConvTranspose1d) - def conv1dT(module, input_shape): - return OutputShapeFor.convndT(module, input_shape, 1) - - @staticmethod - @compute_type(nn.ConvTranspose2d) - def conv2dT(module, input_shape): - return OutputShapeFor.convndT(module, input_shape, 2) - - @staticmethod - @compute_type(nn.ConvTranspose3d) - def conv3dT(module, input_shape): - return OutputShapeFor.convndT(module, input_shape, 3) - - @staticmethod - @compute_type(nn.Conv1d) - def conv1d(module, input_shape): - return OutputShapeFor.convnd(module, input_shape, 1) - - @staticmethod - @compute_type(nn.Conv2d) - def conv2d(module, input_shape): - return OutputShapeFor.convnd(module, input_shape, 2) - - @staticmethod - @compute_type(nn.Conv3d) - def conv3d(module, input_shape): - return OutputShapeFor.convnd(module, input_shape, 3) - - @staticmethod - @compute_type(nn.MaxPool1d) - def maxpool1d(module, input_shape): - return OutputShapeFor.maxpoolnd(module, input_shape, 1) - - @staticmethod - @compute_type(nn.MaxPool2d) - def maxpool2d(module, input_shape): - return OutputShapeFor.maxpoolnd(module, input_shape, 2) - - @staticmethod - @compute_type(nn.MaxPool3d) - def maxpool3d(module, input_shape): - return OutputShapeFor.maxpoolnd(module, input_shape, 3) - - @staticmethod - @compute_type(nn.AvgPool1d) - def avepool1d(module, input_shape): - return OutputShapeFor.avepoolnd(module, input_shape, 1) - - @staticmethod - @compute_type(nn.AvgPool2d) - def avepool2d(module, input_shape): - return OutputShapeFor.avepoolnd(module, input_shape, 2) - - @staticmethod - @compute_type(nn.AvgPool3d) - def avepool3d(module, input_shape): - return OutputShapeFor.avepoolnd(module, input_shape, 3) - - @staticmethod - @compute_type(nn.modules.pooling._AdaptiveMaxPoolNd, nn.modules.pooling._AdaptiveAvgPoolNd) - def adaptive_poolnd(module, input_shape): - """ - Adaptive pooling is easy because the output-shape is known a-priori - """ - B, C = input_shape[0:2] - in_dims = input_shape[2:] - - n = len(in_dims) - output_dims = 
ensure_iterablen(module.output_size, n) - for i, d in enumerate(output_dims): - if d is None: - output_dims[i] = in_dims[i] - - output_shape = SHAPE_CLS([B, C] + list(output_dims)) - return output_shape - - @staticmethod - def convndT(module, input_shape, n): - r""" - - Input: :math:`(N, C_{in}, H_{in}, W_{in})` - - Output: :math:`(N, C_{out}, H_{out}, W_{out})` where - :math:`H_{out} = (H_{in} - 1) * stride[0] - 2 * padding[0] + kernel\_size[0] + output\_padding[0]` - :math:`W_{out} = (W_{in} - 1) * stride[1] - 2 * padding[1] + kernel\_size[1] + output\_padding[1]` - - Example: - >>> from netharn.output_shape_for import * - >>> input_shape = (1, 3, 256, 256) - >>> module = nn.ConvTranspose2d(input_shape[1], 11, kernel_size=2, stride=2) - >>> output_shape = OutputShapeFor(module)(input_shape) - >>> print('output_shape = {!r}'.format(output_shape)) - output_shape = (1, 11, 512, 512) - - Example: - >>> from netharn.output_shape_for import * - >>> input_shape = (1, 3, 25, 32, 32) - >>> module = nn.Conv3d(in_channels=input_shape[1], out_channels=11, - >>> kernel_size=(3, 3, 3), stride=1, padding=0, - >>> dilation=1, groups=1, bias=True) - >>> output_shape = OutputShapeFor(module)(input_shape) - >>> print('output_shape = {!r}'.format(output_shape)) - output_shape = (1, 11, 23, 30, 30) - """ - # N, C_in, *DIMS_in = input_shape - N, C_in = input_shape[0:2] - DIMS_in = input_shape[2:] - - if len(DIMS_in) != n: - raise ValueError('must have {} dims, but got {} '.format(n, len(DIMS_in))) - - C_out = module.out_channels - stride = module.stride - kernel_size = module.kernel_size - output_padding = module.output_padding - dilation = module.dilation - - padding = module.padding - DIMS_out = [ - # Fix the docs: https://github.com/pytorch/pytorch/issues/14099 - (D_in - 1) * stride[i] - 2 * padding[i] + (kernel_size[i] - 1) * dilation[i] + output_padding[i] + 1 - for i, D_in in enumerate(DIMS_in) - ] - output_shape = SHAPE_CLS([N, C_out] + DIMS_out) - if math.__name__ == 'sympy': - output_shape = _simplify(output_shape) - return output_shape - - @staticmethod - def convnd(module, input_shape, n): - r""" - Notes: - - Input: :math:`(N, C_{in}, H_{in}, W_{in})` - - Output: :math:`(N, C_{out}, H_{out}, W_{out})` where - :math:`H_{out} = floor((H_{in} + 2 * padding[0] - dilation[0] * (kernel\_size[0] - 1) - 1) / stride[0] + 1)` - :math:`W_{out} = floor((W_{in} + 2 * padding[1] - dilation[1] * (kernel\_size[1] - 1) - 1) / stride[1] + 1)` - - Example: - >>> from netharn.output_shape_for import * - >>> input_shape = (1, 3, 256, 256) - >>> module = nn.Conv2d(input_shape[1], 11, 3, 1, 0) - >>> output_shape = OutputShapeFor(module)(input_shape) - >>> print('output_shape = {!r}'.format(output_shape)) - output_shape = (1, 11, 254, 254) - """ - math = OutputShapeFor.math - # N, C_in, *DIMS_in = input_shape - N, C_in = input_shape[0:2] - DIMS_in = input_shape[2:] - - if len(DIMS_in) != n: - raise ValueError('must have {} dims, but got {} '.format(n, len(DIMS_in))) - - C_out = module.out_channels - padding = module.padding - stride = module.stride - dilation = module.dilation - kernel_size = module.kernel_size - - int = builtins.int if math.__name__ == 'math' else ub.identity - DIMS_out = [ - int(math.floor( - (D_in + 2 * padding[i] - dilation[i] * (kernel_size[i] - 1) - 1) / stride[i] + 1 - )) - for i, D_in in enumerate(DIMS_in) - ] - output_shape = SHAPE_CLS([N, C_out] + DIMS_out) - if math.__name__ == 'sympy': - output_shape = _simplify(output_shape) - return output_shape - - @staticmethod - def maxpoolnd(module, 
input_shape, n):
-        r"""
-        CommandLine:
-            python -m xdoctest netharn.output_shape_for OutputShapeFor.maxpoolnd:0
-
-        Example:
-            >>> from netharn.output_shape_for import *
-            >>> input_shape = (1, 3, 256, 256)
-            >>> module = nn.MaxPool2d(kernel_size=2, stride=2)
-            >>> output_shape = tuple(OutputShapeFor(module)(input_shape))
-            >>> print('output_shape = {!r}'.format(output_shape))
-            output_shape = (1, 3, 128, 128)
-
-        Example:
-            >>> from netharn.output_shape_for import *
-            >>> input_shape = (1, 512, 37, 37)
-            >>> module = nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=True)
-            >>> output_shape = tuple(OutputShapeFor(module)(input_shape))
-            >>> print('output_shape = {!r}'.format(output_shape))
-            output_shape = (1, 512, 19, 19)
-
-        Shape:
-            2d Case:
-            Same as the conv2d formula except C2 = C1
-            - Input: :math:`(N, C, H_{in}, W_{in})`
-            - Output: :math:`(N, C, H_{out}, W_{out})` where
-              :math:`H_{out} = floor((H_{in} + 2 * padding[0] - dilation[0] * (kernel\_size[0] - 1) - 1) / stride[0] + 1)`
-              :math:`W_{out} = floor((W_{in} + 2 * padding[1] - dilation[1] * (kernel\_size[1] - 1) - 1) / stride[1] + 1)`
-        """
-        math = OutputShapeFor.math
-        # N, C, *DIMS_in = input_shape
-        N, C = input_shape[0:2]
-        DIMS_in = input_shape[2:]
-
-        padding = ensure_iterablen(module.padding, n)
-        stride = ensure_iterablen(module.stride, n)
-        dilation = ensure_iterablen(module.dilation, n)
-        kernel_size = ensure_iterablen(module.kernel_size, n)
-
-        trunc = math.ceil if module.ceil_mode else math.floor
-
-        int = builtins.int if math.__name__ == 'math' else ub.identity
-
-        DIMS_out = [
-            int(trunc((D_in + 2 * padding[i] - dilation[i] * (kernel_size[i] - 1) - 1) / stride[i] + 1))
-            for i, D_in in enumerate(DIMS_in)
-        ]
-        output_shape = SHAPE_CLS([N, C] + DIMS_out)
-        if math.__name__ == 'sympy':
-            output_shape = _simplify(output_shape)
-        return output_shape
-
-    @staticmethod
-    def avepoolnd(module, input_shape, n):
-        r"""
-        2D case:
-        Shape:
-            - Input: :math:`(N, C, H_{in}, W_{in})`
-            - Output: :math:`(N, C, H_{out}, W_{out})` where
-              :math:`H_{out} = floor((H_{in} + 2 * padding[0] - kernel\_size[0]) / stride[0] + 1)`
-              :math:`W_{out} = floor((W_{in} + 2 * padding[1] - kernel\_size[1]) / stride[1] + 1)`
-        """
-        math = OutputShapeFor.math
-        # N, C, *DIMS_in = input_shape
-        N, C = input_shape[0:2]
-        DIMS_in = input_shape[2:]
-
-        padding = ensure_iterablen(module.padding, n)
-        stride = ensure_iterablen(module.stride, n)
-        kernel_size = ensure_iterablen(module.kernel_size, n)
-
-        int = builtins.int if math.__name__ == 'math' else ub.identity
-
-        DIMS_out = [
-            int(math.floor((D_in + 2 * padding[i] - kernel_size[i]) / stride[i] + 1))
-            for i, D_in in enumerate(DIMS_in)
-        ]
-        output_shape = SHAPE_CLS([N, C] + DIMS_out)
-        if math.__name__ == 'sympy':
-            output_shape = _simplify(output_shape)
-        return output_shape
-
-    @staticmethod
-    @compute_type(nn.Linear)
-    def linear(module, input_shape):
-        r"""
-        Shape:
-            - Input: :math:`(N, *, in\_features)` where `*` means any number
-              of additional dimensions
-            - Output: :math:`(N, *, out\_features)` where all but the last
-              dimension are the same shape as the input.
-        """
-        # N, *other, in_feat = input_shape
-        N = input_shape[0]
-        other = input_shape[1:-1]
-        in_feat = input_shape[-1]  # NOQA
-
-        output_shape = [N] + list(other) + [module.out_features]
-        return SHAPE_CLS(output_shape)
-
-    @staticmethod
-    def identity(input_shape):
-        return SHAPE_CLS(input_shape)
-
-    @staticmethod
-    @compute_type(nn.functional.relu)
-    def relu_func(input_shape):
-        return SHAPE_CLS(input_shape)
-
-    @staticmethod
-    @compute_type(nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d,
-                  nn.modules.normalization.GroupNorm,
-                  nn.modules.normalization.LocalResponseNorm,
-                  nn.modules.normalization.LayerNorm, nn.CrossMapLRN2d,
-                  nn.InstanceNorm1d, nn.InstanceNorm2d, nn.InstanceNorm3d)
-    def normalization(module, input_shape):
-        """
-        import redbaron
-        import torch
-        source = open(torch.nn.modules.instancenorm.__file__, 'r').read()
-        baron = redbaron.RedBaron(source)
-        classes = [item.name for item in baron if item.type == 'class']
-        print(', '.join(['nn.{}'.format(c) for c in classes]))
-
-        source = open(torch.nn.modules.normalization.__file__, 'r').read()
-        baron = redbaron.RedBaron(source)
-        classes = [item.name for item in baron if item.type == 'class']
-        print(', '.join(['nn.{}'.format(c) for c in classes]))
-        """
-        return OutputShapeFor.identity(input_shape)
-
-    @staticmethod
-    @compute_type(nn.Dropout, nn.Dropout2d, nn.Dropout3d, nn.AlphaDropout,
-                  nn.FeatureAlphaDropout)
-    def dropout(module, input_shape):
-        return OutputShapeFor.identity(input_shape)
-
-    @staticmethod
-    @compute_type(nn.Threshold, nn.RReLU, nn.Hardtanh, nn.ReLU6, nn.ReLU,
-                  nn.Sigmoid, nn.Tanh, nn.ELU, nn.CELU, nn.SELU, nn.GLU,
-                  nn.Hardshrink, nn.LeakyReLU, nn.LogSigmoid, nn.Softplus,
-                  nn.Softshrink, nn.PReLU, nn.Softsign, nn.Tanhshrink,
-                  nn.Softmin, nn.Softmax, nn.Softmax2d, nn.LogSoftmax)
-    def nonlinearity(module, input_shape):
-        r"""
-        Ignore:
-            import redbaron
-            import torch
-            source = open(torch.nn.modules.activation.__file__, 'r').read()
-            baron = redbaron.RedBaron(source)
-            classes = [item.name for item in baron if item.type == 'class']
-            print(', '.join(['nn.{}'.format(c) for c in classes]))
-        """
-        return OutputShapeFor.identity(input_shape)
-
-    @staticmethod
-    @compute_type(nn.Sequential)
-    def sequential(module, input_shape):
-        """
-        CommandLine:
-            xdoctest -m netharn.output_shape_for OutputShapeFor.sequential
-
-        Example:
-            >>> from netharn.output_shape_for import *
-            >>> self = nn.Sequential(
-            >>>     nn.Conv2d(2, 3, kernel_size=3),
-            >>>     nn.Conv2d(3, 5, kernel_size=3),
-            >>>     nn.Conv2d(5, 7, kernel_size=3),
-            >>> )
-            >>> shape = OutputShapeFor(self)([1, 1, 7, 11])
-            >>> print('shape = {}'.format(ub.repr2(shape, nl=0)))
-            >>> print('shape.hidden = {}'.format(ub.repr2(shape.hidden, nl=1)))
-            shape = (1, 7, 1, 5)
-            shape.hidden = {
-                '0': (1, 3, 5, 9),
-                '1': (1, 5, 3, 7),
-                '2': (1, 7, 1, 5),
-            }
-        """
-        hidden = HiddenShapes()
-        shape = input_shape
-        for key, child in module._modules.items():
-            hidden[key] = shape = OutputShapeFor(child)(shape)
-        shape = OutputShape.coerce(shape, hidden=hidden)
-        return shape
-
-    @staticmethod
-    @compute_type(torchvision.models.resnet.BasicBlock)
-    def resent_basic_block(module, input_shape):
-        residual_shape = input_shape
-        shape = input_shape
-
-        hidden = HiddenShapes()
-        hidden['conv1'] = shape = OutputShapeFor(module.conv1)(shape)
-        hidden['bn1'] = shape = OutputShapeFor(module.bn1)(shape)
-        hidden['relu1'] = shape = OutputShapeFor(module.relu)(shape)
-
-        hidden['conv2'] = shape = OutputShapeFor(module.conv2)(shape)
-        hidden['bn2'] = shape = OutputShapeFor(module.bn2)(shape)
-        hidden['relu2'] = shape = OutputShapeFor(module.relu)(shape)
-
-        if module.downsample is not None:
-            residual_shape = OutputShapeFor(module.downsample)(residual_shape)
-            hidden['residual'] = residual_shape
-
-        hidden['join'] = shape
-        assert residual_shape[-2:] == shape[-2:], (
-            'cannot add residual {} {}'.format(residual_shape, shape))
-        shape = OutputShapeFor(module.relu)(shape)
-        hidden['relu3'] = shape
-        shape = OutputShape.coerce(shape, hidden=hidden)
-        return shape
-
-    @staticmethod
-    @compute_type(torchvision.models.resnet.Bottleneck)
-    def resent_bottleneck(module, input_shape):
-        residual_shape = input_shape
-        shape = input_shape
-
-        hidden = HiddenShapes()
-        hidden['conv1'] = shape = OutputShapeFor(module.conv1)(shape)
-        hidden['bn1'] = shape = OutputShapeFor(module.bn1)(shape)
-        hidden['relu1'] = shape = OutputShapeFor(module.relu)(shape)
-
-        hidden['conv2'] = shape = OutputShapeFor(module.conv2)(shape)
-        hidden['bn2'] = shape = OutputShapeFor(module.bn2)(shape)
-        hidden['relu2'] = shape = OutputShapeFor(module.relu)(shape)
-
-        hidden['conv3'] = shape = OutputShapeFor(module.conv3)(shape)
-        hidden['bn3'] = shape = OutputShapeFor(module.bn3)(shape)
-
-        if module.downsample is not None:
-            residual_shape = OutputShapeFor(module.downsample)(input_shape)
-            hidden['residual'] = residual_shape
-
-        assert residual_shape[-2:] == shape[-2:], (
-            'cannot add residual {} {}'.format(residual_shape, shape))
-        hidden['join'] = shape
-
-        shape = OutputShapeFor(module.relu)(shape)
-        hidden['relu3'] = shape
-
-        shape = OutputShape.coerce(shape, hidden=hidden)
-        return shape
-
-    @staticmethod
-    @compute_type(torchvision.models.resnet.ResNet)
-    def resnet_model(module, input_shape):
-        """
-        Example:
-            >>> # xdoctest: +REQUIRES(--network)
-            >>> from netharn.output_shape_for import *
-            >>> module = torchvision.models.resnet50()
-            >>> input_shape = (1, 3, 224, 224)
-            >>> field = OutputShapeFor(module)(input_shape=input_shape)
-        """
-        shape = input_shape
-
-        hidden = HiddenShapes()
-        hidden['conv1'] = shape = OutputShapeFor(module.conv1)(shape)
-        hidden['bn1'] = shape = OutputShapeFor(module.bn1)(shape)
-        hidden['relu1'] = shape = OutputShapeFor(module.relu)(shape)
-        hidden['maxpool'] = shape = OutputShapeFor(module.maxpool)(shape)
-
-        hidden['layer1'] = shape = OutputShapeFor(module.layer1)(shape)
-        hidden['layer2'] = shape = OutputShapeFor(module.layer2)(shape)
-        hidden['layer3'] = shape = OutputShapeFor(module.layer3)(shape)
-        hidden['layer4'] = shape = OutputShapeFor(module.layer4)(shape)
-
-        hidden['avgpool'] = shape = OutputShapeFor(module.avgpool)(shape)
-
-        def prod(args):
-            result = args[0]
-            for arg in args[1:]:
-                result = result * arg
-            return result
-        shape = (shape[0], prod(shape[1:]))
-        hidden['view'] = shape
-
-        hidden['fc'] = shape = OutputShapeFor(module.fc)(shape)
-        shape = OutputShape.coerce(shape, hidden=hidden)
-        return shape
-
-    @staticmethod
-    @compute_type(nn.functional.adaptive_avg_pool2d)
-    def adaptive_poolnd_func(input_shape, output_shape):
-        """
-        Adaptive pooling is easy because the output shape is known a priori.
-
-        Example:
-            >>> from netharn.output_shape_for import *
-            >>> input_shape = (1, 3, 256, 256)
-            >>> output_shape = (7, 7)
-            >>> output_shape_ = OutputShapeFor(nn.functional.adaptive_avg_pool2d)(input_shape, output_shape)
-            >>> print('output_shape = {!r}'.format(output_shape_))
-            output_shape = (1, 3, 7, 7)
-        """
-        B, C = input_shape[0:2]
-        in_dims = input_shape[2:]
-
-        n = len(in_dims)
-        output_dims = ensure_iterablen(output_shape, n)
-        for i, d in enumerate(output_dims):
-            if d is None:
-                output_dims[i] = in_dims[i]
-
-        output_shape_ = SHAPE_CLS([B, C] + list(output_dims))
-        return output_shape_
-
-    @staticmethod
-    @compute_type(torch.cat)
-    def cat(input_shapes, dim=0):
-        """
-        Example:
-            >>> from netharn.output_shape_for import *
-            >>> input_shape1 = (1, 3, 256, 256)
-            >>> input_shape2 = (1, 4, 256, 256)
-            >>> input_shapes = [input_shape1, input_shape2]
-            >>> output_shape = OutputShapeFor(torch.cat)(input_shapes, dim=1)
-            >>> print('output_shape = {!r}'.format(output_shape))
-            output_shape = (1, 7, 256, 256)
-        """
-        n_dims = max(map(len, input_shapes))
-        assert n_dims == min(map(len, input_shapes))
-        output_shape = [None] * n_dims
-        for shape in input_shapes:
-            for i, v in enumerate(shape):
-                if output_shape[i] is None:
-                    output_shape[i] = v
-                else:
-                    if i == dim:
-                        output_shape[i] += v
-                    else:
-                        assert output_shape[i] == v, 'inconsistent dims {}'.format(input_shapes)
-        return SHAPE_CLS(output_shape)
-
-    @staticmethod
-    @compute_type(DataSerial)
-    def data_serial(module, *args, **kw):
-        return OutputShapeFor(module.module)(*args, **kw)
-
-    @staticmethod
-    @compute_type(torch.nn.DataParallel)
-    def data_parallel(module, *args, **kw):
-        return OutputShapeFor(module.module)(*args, **kw)
-
-    @staticmethod
-    def getitem(arr):
-        """
-        Wraps getitem calls
-
-        Example:
-            >>> arr = (2, 32, 9, 9)
-            >>> result = OutputShapeFor.getitem(arr)[:, 0:4]
-            >>> assert result == [2, 4, 9, 9]
-        """
-        return _ShapeGetItem(arr)
-
-    @staticmethod
-    def view(arr, *args):
-        """
-        Wraps view calls
-
-        Example:
-            >>> arr = (2, 32, 9, 9)
-            >>> result = OutputShapeFor.view(arr, -1)
-            >>> assert result == (5184,)
-        """
-        from netharn import layers
-        reshape = layers.Reshape(*args)
-        return reshape.output_shape_for(arr)
-
-    @staticmethod
-    def shape(arr):
-        """
-        Wraps shape calls
-
-        Example:
-            >>> arr = (2, 32, 9, 9)
-            >>> result = OutputShapeFor.shape(arr)
-            >>> assert result == arr
-        """
-        return arr
-
-    @staticmethod
-    def add(arr1, arr2):
-        return _output_shape_broadcast(arr1, arr2)
-
-    @staticmethod
-    def mul(arr1, arr2):
-        return _output_shape_broadcast(arr1, arr2)
-
-    @staticmethod
-    def sub(arr1, arr2):
-        return _output_shape_broadcast(arr1, arr2)
-
-    @staticmethod
-    def div(arr1, arr2):
-        return _output_shape_broadcast(arr1, arr2)
-
-
-def _output_shape_broadcast(arr1, arr2):
-    """
-    Args:
-        arr1 (Tuple | scalar): shape of arr1 or a scalar
-        arr2 (Tuple | scalar): shape of arr2 or a scalar
-    """
-    if not ub.iterable(arr1):
-        return arr2
-    if not ub.iterable(arr2):
-        return arr1
-    if tuple(arr1) != tuple(arr2):
-        # TODO: handle broadcast
-        raise NotImplementedError('Full broadcast not implemented {} != {}'.format(arr1, arr2))
-    return arr1
-
-
-class _ShapeGetItem(object):
-    def __init__(self, inp):
-        self.inp = inp
-
-    def __getitem__(self, slices):
-        ellipsis_type = type(Ellipsis)
-        oup = list(self.inp)
-        if isinstance(slices, slice):
-            slices = (slices,)
-
-        if isinstance(slices, tuple):
-            for i, sl in enumerate(slices):
-                if isinstance(sl, ellipsis_type):
-                    assert i == len(slices) - 1
-                    break
-                start, stop, step = sl.indices(oup[i])
-                oup[i] = (stop - start) // step
-        return oup
-
-
-def ensure_iterablen(scalar, n):
-    try:
-        iter(scalar)
-    except TypeError:
-        return [scalar] * n
-    return scalar
+import warnings
+warnings.warn('Deprecated file. Use netharn.analytic.output_shape_for instead', UserWarning)
+from netharn.analytic.output_shape_for import * # NOQA
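For reference, the pooling formulas in the removed `maxpoolnd`/`avepoolnd` helpers above reduce to one line of arithmetic per spatial dimension. A minimal standalone sketch (independent of netharn) that reproduces the two MaxPool2d doctests:

    >>> import math
    >>> def pool_out(D_in, k, s, p=0, d=1, ceil_mode=False):
    ...     trunc = math.ceil if ceil_mode else math.floor
    ...     return int(trunc((D_in + 2 * p - d * (k - 1) - 1) / s + 1))
    >>> pool_out(256, k=2, s=2)   # MaxPool2d(kernel_size=2, stride=2)
    128
    >>> pool_out(37, k=2, s=2, ceil_mode=True)
    19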
diff --git a/netharn/prefit/lr_tests.py b/netharn/prefit/lr_tests.py
index dc2ca6c3cbcc0bd3fccf66710a58e4fd1a5855ae..da91694011a39f547053ef50541e0470a2c9ab16 100644
--- a/netharn/prefit/lr_tests.py
+++ b/netharn/prefit/lr_tests.py
@@ -14,11 +14,15 @@ class TestResult(ub.NiceRepr):


 def lr_range_test(harn, init_value=1e-8, final_value=10., beta=0.98,
-                  explode_factor=10):
+                  explode_factor=10, num_iters=100):
     """
     Implementation of Leslie Smith's LR-range test described in [2],
     based on code found in [1].

+    Args:
+        init_value : initial learning rate
+        beta (float): smoothing parameter
+
     Notes:
         It is critical that `init_value` starts off much lower than the actual
         valid LR-range. This is because this test actually modifies a copy of
@@ -98,7 +102,7 @@ def lr_range_test(harn, init_value=1e-8, final_value=10., beta=0.98,

     tag = 'train'
     loader = harn.loaders[tag]
-    num_epochs = min(100, len(loader))
+    num_epochs = min(num_iters, len(loader))

     # These are the learning rates we will scan through
     learning_rates = np.logspace(
@@ -149,9 +153,9 @@ def lr_range_test(harn, init_value=1e-8, final_value=10., beta=0.98,
             prog.set_extra(' best_lr={:.2g}, curr_lr={:.2g}, best_loss={:.2f}, curr_loss={:.2f}'.format(best_lr, curr_lr, best_loss, curr_loss))

         if bx > 0:
-            # This loss was achieved by a step with the previous lr, so ensure
-            # we are associating the correct lr with the loss that corresponds
-            # to it.
+            # This loss was achieved by a step with the previous lr, so
+            # ensure we are associating the correct lr with the loss that
+            # corresponds to it.
             records['loss_std'].append(metrics.std()['loss'])
             records['loss'].append(curr_loss)
             records['raw_loss'].append(raw_loss)
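The new `num_iters` argument bounds how many batches the LR scan consumes. In outline, the test sweeps exponentially spaced learning rates while tracking a bias-corrected exponential moving average of the loss, and aborts once the loss explodes. A condensed sketch of that loop follows; `train_one_batch` is a hypothetical stand-in for a real optimizer step and is not part of netharn:

    >>> import numpy as np
    >>> init_value, final_value, num_iters, beta = 1e-8, 10., 100, 0.98
    >>> learning_rates = np.logspace(
    ...     np.log10(init_value), np.log10(final_value), num=num_iters)
    >>> avg_loss, best_loss = 0.0, float('inf')
    >>> for bx, lr in enumerate(learning_rates):
    ...     raw_loss = train_one_batch(lr)   # hypothetical SGD step at this lr
    ...     avg_loss = beta * avg_loss + (1 - beta) * raw_loss
    ...     smooth_loss = avg_loss / (1 - beta ** (bx + 1))  # bias correction
    ...     if smooth_loss > 10 * best_loss:
    ...         break                        # explode_factor reached; stop
    ...     best_loss = min(best_loss, smooth_loss)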
diff --git a/netharn/receptive_field_for.py b/netharn/receptive_field_for.py
index 02a14c0ffe8ca74ea16d91575d91a92856f7d16b..b4c88b2300a298b045a4afe2bf53bacebc7efb1f 100644
--- a/netharn/receptive_field_for.py
+++ b/netharn/receptive_field_for.py
@@ -1,1128 +1,3 @@
-# -*- coding: utf-8 -*-
-from __future__ import absolute_import, division, print_function, unicode_literals
-import torch
-import copy
-import six  # NOQA
-import torch.nn as nn
-import torchvision
-import ubelt as ub
-import numpy as np
-from collections import OrderedDict
-from netharn.output_shape_for import OutputShapeFor
-from netharn import analytic_for
-# try:
-#     from netharn.device import MountedModel
-# except ImportError:
-#     MountedModel = None
-
-REGISTERED_TYPES = []
-
-
-def ensure_array_nd(data, n):
-    if ub.iterable(data):
-        return np.array(data)
-    else:
-        return np.array([data] * n)
-
-
-def compute_type(*types):
-    def _wrap(func):
-        for type in types:
-            if type is not None:
-                REGISTERED_TYPES.append((type, func))
-        return func
-    return _wrap
-
-
-class ReceptiveFieldTypeError(TypeError):
-    pass
-
-
-class ReceptiveField(OrderedDict, analytic_for.Output):
-    """
-    Container for holding a receptive field.
-
-    Example:
-        >>> self = ReceptiveField.coerce({
-        >>>     'stride': np.array([4]),
-        >>>     'shape': np.array([1]),
-        >>>     'crop': np.array([0]),
-        >>> })
-        >>> self_copy = copy.deepcopy(self)
-    """
-    def __init__(self, data, hidden=None):
-        # Inheriting from an odict consistently between python 2/3 is weird
-        data2 = OrderedDict(sorted(OrderedDict(data).items()))
-        OrderedDict.__init__(self, data2)
-        self.data = data2
-        self.hidden = hidden
-
-    def __copy__(self):
-        self_copy = ReceptiveField(self.data, self.hidden)
-        return self_copy
-
-    def __deepcopy__(self, memo):
-        data_copy = copy.deepcopy(self.data, memo)
-        hidden_copy = copy.deepcopy(self.hidden, memo)
-        self_copy = ReceptiveField(data_copy, hidden_copy)
-        return self_copy
-
-    @classmethod
-    def coerce(cls, data, hidden=None):
-        """
-        Example:
-            >>> # test weird python2 failure case
-            >>> from netharn.receptive_field_for import *
-            >>> cls = ReceptiveField
-            >>> data = [(0, ReceptiveFieldFor.input())]
-            >>> self = cls.coerce(data)
-            >>> print(ub.repr2(self, with_dtype=False))
-            {
-                0: {
-                    'crop': np.array([0., 0.]),
-                    'shape': np.array([1., 1.]),
-                    'stride': np.array([1., 1.]),
-                },
-            }
-        """
-        # TODO: make this work like OutputShape
-        if data is None:
-            self = ReceptiveFieldFor.input()
-            self.hidden = hidden
-        elif isinstance(data, cls):
-            if hidden is None:
-                self = data
-            else:
-                self = data.__class__(data, hidden)
-        else:
-            self = cls(data, hidden)
-        return self
-
-    # def __getitem__(self, key):
-    #     return self.data[key]
-
-
-class HiddenFields(analytic_for.Hidden):
-    """
-    Augments normal hidden-field dicts with a convenience setitem.
-    """
-    pass
-
-
-class _TorchMixin(object):
-    """
-    Receptive field formulas for PyTorch primitives.
-    """
-
-    @staticmethod
-    def input(input_field=None, n=2):
-        """
-        The basic input receptive field is just a single pixel.
-        """
-        if input_field is not None:
-            raise ValueError('nothing can precede the input')
-        input_field = ReceptiveField.coerce({
-            # The input receptive field stride / scale factor is 1.
-            'stride': ensure_array_nd(1.0, n),
-            # The input receptive field shape is 1 pixel.
-            'shape': ensure_array_nd(1.0, n),
-            # Use the coordinate system where the top left corner is 0, 0
-            # (this is unlike [1], which uses 0.5).
-            'crop': ensure_array_nd(0.0, n),
-        })
-        return input_field
-
-    @staticmethod
-    def _kernelized(module, input_field=None, ndim=None):
-        """
-        Receptive field formula for general sliding-kernel based layers.
-        This works for both convolutional and pooling layers.
-
-        Notes:
-            Baseline formulas are from [1]. Information about how to include
-            dilation (atrous) convolutions can be found in [2, 3]. Better
-            info seems to be available in [4].
-
-            * tensorflow has similar functionality
-            https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/receptive_field/python/util/receptive_field.py
-
-            * To preserve spatial extent, padding should equal `(k - 1) * d / 2`.
-
-        References:
-            [1] https://medium.com/mlreview/a-guide-to-receptive-field-arithmetic-for-convolutional-neural-networks-e0f514068807
-            [2] http://www.erogol.com/dilated-convolution/
-            [3] https://stackoverflow.com/questions/35582521/how-to-calculate-receptive-field-shape
-            [4] https://arxiv.org/pdf/1603.07285.pdf
-
-        Example:
-            >>> module = nn.Conv2d(1, 1, kernel_size=5, stride=2, padding=2, dilation=3)
-            >>> field = ReceptiveFieldFor._kernelized(module)
-            >>> print(ub.repr2(field, nl=0, with_dtype=False))
-            {'crop': np.array([4., 4.]), 'shape': np.array([13., 13.]), 'stride': np.array([2., 2.])}
-
-            >>> module = nn.MaxPool2d(kernel_size=3, stride=2, padding=2, dilation=2)
-            >>> field = ReceptiveFieldFor._kernelized(module)
-            >>> print(ub.repr2(field, nl=0, with_dtype=False))
-            {'crop': np.array([0., 0.]), 'shape': np.array([5., 5.]), 'stride': np.array([2., 2.])}
-
-            >>> module = nn.MaxPool2d(kernel_size=3, stride=2, padding=2, dilation=1)
-            >>> field = ReceptiveFieldFor._kernelized(module)
-            >>> print(ub.repr2(field, nl=0, with_dtype=False))
-            {'crop': np.array([-1., -1.]), 'shape': np.array([3., 3.]), 'stride': np.array([2., 2.])}
-
-            >>> module = nn.AvgPool2d(kernel_size=3, stride=2, padding=2)
-            >>> field = ReceptiveFieldFor._kernelized(module)
-            >>> print(ub.repr2(field, nl=0, with_dtype=False))
-            {'crop': np.array([-1., -1.]), 'shape': np.array([3., 3.]), 'stride': np.array([2., 2.])}
-        """
-        # impl = ReceptiveFieldFor.impl
-        if input_field is None:
-            input_field = ReceptiveFieldFor.input()
-
-        # Hack to get the number of space-time dimensions
-        if ndim is None:
-            try:
-                if module.__class__.__name__.endswith('1d'):
-                    ndim = 1
-                elif module.__class__.__name__.endswith('2d'):
-                    ndim = 2
-                elif module.__class__.__name__.endswith('3d'):
-                    ndim = 3
-            except AttributeError:
-                if module.__name__.endswith('1d'):
-                    ndim = 1
-                elif module.__name__.endswith('2d'):
-                    ndim = 2
-                elif module.__name__.endswith('3d'):
-                    ndim = 3
-        if ndim is None:
-            raise ValueError('Cannot infer ndim from {}'.format(module))
-
-        k = ensure_array_nd(module.kernel_size, ndim)
-        s = ensure_array_nd(module.stride, ndim)
-        p = ensure_array_nd(module.padding, ndim)
-        d = ensure_array_nd(getattr(module, 'dilation', 1), ndim)
-
-        # To calculate the receptive field we first need to find the SUPPORT
-        # of this layer. The support is the number/extent of extra
-        # surrounding pixels adding this layer will take into account. Given
-        # this, we can compute the receptive field wrt the original input by
-        # combining this information with the previous receptive field.
-        #
-        # In the normal case (with no dilation, d=1) the support is (k - 1).
-        # This is because the operation is able to see a window of shape k
-        # in the input, and produces a single output pixel (hence the k).
-        # The center input pixel corresponds with the output, so it does not
-        # expand the receptive field (hence the -1), but all other input
-        # pixels do expand the field (thus the k-1).
-        #
-        # The stride of this layer will not affect the support.
-        #
-        # The dilation of the current layer DOES impact the support.
-        # This expands the effective kernel shape, but it does cause the
-        # data each operation sees to become more diffuse. However, even
-        # though what it sees in that extent is more diffuse, the RF is just
-        # a bound, so we can ignore the diffuseness effect and simply scale
-        # the input kernel shape by the dilation amount. Hence we get
-        support = (k - 1) * d
-
-        """
-        Note the above is correct because:
-
-        import sympy as sym
-        k, d = sym.symbols('k, d')
-
-        # Compute the support from the formula in 5.1 of [4].
-        # To understand the relationship tying the dilation rate d and the
-        # output shape o, it is useful to think of the impact of d on the
-        # effective kernel shape. A kernel of shape k dilated by a factor d
-        # has an effective shape.
-        effective_kernel_size = k + (k - 1) * (d - 1)
-        support_v1 = sym.expand(effective_kernel_size - 1)
-
-        # Compute support from our method
-        support_v2 = sym.expand((k - 1) * d)
-
-        # They are equivalent. QED
-        assert sym.Eq(support_v1, support_v2)
-        """
-
-        # Compute how many pixels this layer takes off the side. Note that
-        # an even-shape kernel results in half pixel crops. This is expected
-        # and correct. To use the crop in practice take the floor / ceil of
-        # the final result, but in this intermediate stage, subpixel crops
-        # are perfectly valid.
-        crop = ((support / 2.0) - p)
-
-        field = ReceptiveField.coerce({
-            # The new stride only depends on the layer stride and the
-            # previous stride.
-            'stride': input_field['stride'] * s,
-
-            # The stride of the current layer does not impact the receptive
-            # field, however the stride of the previous layer does. This is
-            # because each pixel in the incoming layer really corresponds to
-            # `input_field['stride']` pixels in the original input.
-            'shape': input_field['shape'] + support * input_field['stride'],
-
-            # Padding does not influence the RF shape, but it does influence
-            # where the start pixel is (i.e. without the right amount of
-            # padding the edge of the previous layer is cropped).
-            'crop': input_field['crop'] + crop * input_field['stride'],
-        })
-        return field
-
-    @staticmethod
-    def _unchanged(module, input_field=None):
-        """ Formula for layers that do not change the receptive field """
-        if input_field is None:
-            input_field = ReceptiveFieldFor.input()
-        return input_field
-
-    @staticmethod
-    @compute_type(nn.Linear)
-    def linear(module, input_field=None):
-        # Linear layers (sort of) don't change the RF
-        return ReceptiveFieldFor._unchanged(module, input_field)
-        # Perhaps we could do this if we knew the input shape
-        # raise NotImplementedError(
-        #     'Cannot compute receptive field shape on a Linear layer')
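To make the `_kernelized` update concrete, a scalar re-derivation that matches its first doctest (Conv2d with k=5, s=2, p=2, d=3): the support is (5 - 1) * 3 = 12, so starting from the unit input field the shape becomes 1 + 12 = 13, the stride 1 * 2 = 2, and the crop (12 / 2 - 2) * 1 = 4. A minimal sketch of the same arithmetic, using plain floats in place of the array-valued fields:

    >>> def kernelized_rf(prev, k, s, p, d=1):
    ...     support = (k - 1) * d
    ...     return {
    ...         'stride': prev['stride'] * s,
    ...         'shape': prev['shape'] + support * prev['stride'],
    ...         'crop': prev['crop'] + (support / 2.0 - p) * prev['stride'],
    ...     }
    >>> inp = {'stride': 1.0, 'shape': 1.0, 'crop': 0.0}
    >>> kernelized_rf(inp, k=5, s=2, p=2, d=3)
    {'stride': 2.0, 'shape': 13.0, 'crop': 4.0}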
-    @staticmethod
-    def _kernelized_tranpose(module, input_field=None):
-        """
-        Receptive field formula for transposed convolution layers.
-
-        Example:
-            >>> from netharn.receptive_field_for import *
-            >>> from netharn.output_shape_for import *
-            >>> module = nn.ConvTranspose2d(1, 1, kernel_size=3, stride=2, padding=2)
-            >>> ReceptiveFieldFor(module)()
-
-            >>> # This network should effectively invert itself
-            >>> module = nn.Sequential(ub.odict([
-            >>>     #('a', nn.Conv2d(1, 1, kernel_size=3, stride=1, padding=1)),
-            >>>     ('c1', nn.Conv2d(1, 1, kernel_size=3, stride=2)),
-            >>>     ('c2', nn.Conv2d(1, 1, kernel_size=3, stride=2)),
-            >>>     ('c3', nn.Conv2d(1, 1, kernel_size=3, stride=2)),
-            >>>     ('c3T', nn.ConvTranspose2d(1, 1, kernel_size=3, stride=2)),
-            >>>     ('c2T', nn.ConvTranspose2d(1, 1, kernel_size=3, stride=2)),
-            >>>     ('c1T', nn.ConvTranspose2d(1, 1, kernel_size=3, stride=2)),
-            >>> ]))
-            >>> print(ub.repr2(ReceptiveFieldFor(module)()))
-            >>> ReceptiveFieldFor(module)()
-            >>> OutputShapeFor(module)._check_consistency([1, 1, 32, 32])
-
-            >>> module = nn.Sequential(ub.odict([
-            >>>     #('a', nn.Conv2d(1, 1, kernel_size=3, stride=1, padding=1)),
-            >>>     ('c1', nn.Conv2d(1, 1, kernel_size=3, stride=2, dilation=2)),
-            >>>     ('c2', nn.Conv2d(1, 1, kernel_size=3, stride=2, dilation=2)),
-            >>>     ('c3', nn.Conv2d(1, 1, kernel_size=3, stride=2, dilation=2)),
-            >>>     ('c3T', nn.ConvTranspose2d(1, 1, kernel_size=3, stride=2, dilation=2)),
-            >>>     ('c2T', nn.ConvTranspose2d(1, 1, kernel_size=3, stride=2, dilation=2)),
-            >>>     ('c1T', nn.ConvTranspose2d(1, 1, kernel_size=3, stride=2, dilation=2)),
-            >>> ]))
-            >>> print(ub.repr2(ReceptiveFieldFor(module)()))
-
-            >>> # This network is pathological
-            >>> module = nn.Sequential(ub.odict([
-            >>>     #('a', nn.Conv2d(1, 1, kernel_size=3, stride=1, padding=1)),
-            >>>     ('c1', nn.Conv2d(1, 1, kernel_size=3, stride=7, dilation=2)),
-            >>>     ('c2', nn.Conv2d(1, 1, kernel_size=5, stride=6, padding=1)),
-            >>>     ('c3', nn.Conv2d(1, 1, kernel_size=7, stride=5)),
-            >>>     ('c3T', nn.ConvTranspose2d(1, 1, kernel_size=7, stride=6)),
-            >>>     ('c2T', nn.ConvTranspose2d(1, 1, kernel_size=5, stride=7, padding=1)),
-            >>>     ('c1T', nn.ConvTranspose2d(1, 1, kernel_size=3, stride=8, dilation=2)),
-            >>> ]))
-            >>> print(ub.repr2(ReceptiveFieldFor(module)()))
-            >>> ReceptiveFieldFor(module)()
-            >>> OutputShapeFor(module)([1, 1, 900, 900])
-            >>> OutputShapeFor(module)([1, 1, 900, 900]).hidden
-            >>> OutputShapeFor(module)._check_consistency([1, 1, 900, 900])
-
-            >>> module = nn.Sequential(
-            >>>     nn.Conv2d(1, 1, kernel_size=3, stride=2),
-            >>>     nn.Conv2d(1, 1, kernel_size=3, stride=2),
-            >>>     nn.Conv2d(1, 1, kernel_size=3, stride=2),
-            >>>     nn.ConvTranspose2d(1, 1, kernel_size=3, stride=2),
-            >>>     nn.ConvTranspose2d(1, 1, kernel_size=3, stride=2),
-            >>>     nn.ConvTranspose2d(1, 1, kernel_size=3, stride=2),
-            >>> )
-            >>> ReceptiveFieldFor(module)()
-
-            >>> module = nn.Conv2d(1, 1, kernel_size=3, stride=2, padding=1)
-            >>> ReceptiveFieldFor(module)()
-
-            >>> OutputShapeFor(nn.ConvTranspose2d(1, 1, kernel_size=3, stride=2, padding=0, output_padding=(1, 1)))._check_consistency([1, 1, 1, 1])
-
-            >>> # Figure 4.4
-            >>> OutputShapeFor(nn.Conv2d(1, 1, kernel_size=3, stride=1, padding=2))([1, 1, 5, 5])
-            >>> OutputShapeFor(nn.ConvTranspose2d(1, 1, kernel_size=3, stride=1, padding=2))._check_consistency([1, 1, 5, 5])
-            >>> OutputShapeFor(nn.Conv2d(1, 1, kernel_size=3, stride=1, padding=0))([1, 1, 7, 7])
-
-            >>> # Figure 4.5
-            >>> OutputShapeFor(nn.ConvTranspose2d(1, 1, kernel_size=3, stride=2, padding=0))._check_consistency([1, 1, 5, 5])
-            >>> OutputShapeFor(nn.Conv2d(1, 1, kernel_size=3, stride=1, padding=0))([1, 1, 7, 7])
-
-            >>> ReceptiveFieldFor(module)()
-        """
-        # impl = ReceptiveFieldFor.impl
-        if input_field is None:
-            input_field = ReceptiveFieldFor.input()
-
-        # Hack to get the number of space-time dimensions
-        ndim = None
-        try:
-            if module.__name__.endswith('1d'):
-                ndim = 1
-            elif module.__name__.endswith('2d'):
-                ndim = 2
-            elif module.__name__.endswith('3d'):
-                ndim = 3
-        except AttributeError:
-            pass
-
-        if ndim is None:
-            if hasattr(module, '_dim'):
-                ndim = module._dim
-
-        # A non-trivial transpose convolution should:
-        # * decrease the stride (because the stride is fractional)
-        # * have padding equal to the shape of the kernel minus one
-        """
-        From [4]:
-
-        A convolution described by k, s and p has an associated transposed
-        convolution described by:
-            * k' = k,
-            * s' = 1,
-            * p' = k - p - 1,
-            * i' = the shape of the stretched input obtained by adding s - 1
-              zeros between each input unit,
-            * a = (i + 2p - k) % s, which represents the number of zeros
-              added to the bottom and right edges of the input,
-
-        and has output shape:
-            o' = s(i' - 1) + a + k - 2p
-
-        For convT it is always the case that s'=1; however, note that s' is
-        not what we use to compute the new stride of the output, because
-        that is actually a fractional stride.
-        """
-
-        # Definitions:
-        # In the following comments we discuss 3 distinct layers:
-        # (1) The original convolution (conv)
-        # (2) The transpose convolution that inverts the original (convT)
-        # (3) The regular convolution that is equivalent to the transpose
-        #     convolution given a specially transformed input tensor (convE)
-
-        # The parameters of a convT are actually the parameters of conv, the
-        # convolution we are trying to "undo", but we will refer to them as
-        # parameters of convT (because they are that as well).
-        k_ = ensure_array_nd(module.kernel_size, ndim)
-        s_ = ensure_array_nd(module.stride, ndim)
-        p_ = ensure_array_nd(module.padding, ndim)
-        d_ = ensure_array_nd(getattr(module, 'dilation', 1), ndim)
-
-        # TODO: incorporate output padding and right-side padding / cropping.
-        # Note: output padding does not impact the receptive field, however
-        # it does cause some "right-side" cropping, which we are not
-        # computing here yet.
-        out_pad = ensure_array_nd(module.output_padding, ndim)  # NOQA
-        # if not np.all(out_pad == 0):
-        #     raise NotImplementedError('cannot handle nonzero output_padding yet')
-
-        # However, there is an equivalent way of formulating a convT as
-        # convE: a regular conv applied on a specially padded input tensor.
-        # The parameters that define convE are:
-        k = k_
-        d = d_
-        s = 1  # stride is always 1 because of the special input transform
-        # p = k_ - p_ - 1  # NOTE: original formula likely assumed dilation=1
-        p = (k_ - 1) * d_ - p_
-
-        # In order for convE to be equivalent to convT, we need to apply
-        # convE to a specially transformed (padded) input tensor.
-        # The padding applied to the input tensor puts extra zeros between
-        # each row/col. The number of extra zeros is the stride of the convT
-        # minus 1. The left and right sides of the input tensor are also
-        # padded, but that won't factor into the RF calculation.
-        extra_zeros = s_ - 1
-        # This means that the effective support added to the RF shape by
-        # convE will be less than it normally would be, because we don't
-        # count the extra zeros in our transformed input as real pixels.
-        effective_support = (k - 1 - extra_zeros) * d
-        # NOTE: if the stride is larger than the kernel, some output pixels
-        # will actually just be zeros and have no receptive field.
-        effective_support = np.maximum(0, effective_support)
-
-        # This special input transform also has the effect of decreasing the
-        # RF stride. Transposed convolutions are sometimes called
-        # fractional-stride convolutions. This is because they have an
-        # effective stride of 1 / s_.
-        effective_stride = 1 / s_
-
-        # We calculate the support of convE as if it were applied to a
-        # normal input tensor in order to calculate how the start (top-left)
-        # pixel position is modified.
-        support = (k - 1) * d
-
-        # After transformation the effective stride of the input is
-        effective_input_stride = input_field['stride'] * effective_stride
-
-        # How many pixels does this layer crop off the sides of the input?
-        crop = ((support / 2) - p)
-
-        # print('effective_support = {!r}'.format(effective_support))
-
-        field = ReceptiveField.coerce({
-            # The new stride only depends on the layer stride and the
-            # previous stride.
-            'stride': effective_input_stride * s,
-
-            # The stride of the current layer does not impact the receptive
-            # field, however the stride of the previous layer does. This is
-            # because each pixel in the incoming layer really corresponds to
-            # `input_field['stride']` pixels in the original input.
-            'shape': input_field['shape'] + effective_support * input_field['stride'],
-
-            # Padding does not influence the RF shape, but it does influence
-            # where the start pixel is (i.e. without the right amount of
-            # padding the edge of the previous layer is cropped).
-            'crop': input_field['crop'] + crop * effective_input_stride,
-        })
-
-        return field
-        # raise NotImplementedError('todo')
-
-    @compute_type(nn.modules.conv._ConvTransposeMixin)
-    def convT(module, input_field=None):
-        return ReceptiveFieldFor._kernelized_tranpose(module, input_field)
-
-    @compute_type(nn.modules.conv.Conv1d, nn.modules.conv.Conv2d, nn.modules.conv.Conv3d)
-    def convnd(module, input_field=None):
-        return ReceptiveFieldFor._kernelized(module, input_field)
-
-    @staticmethod
-    @compute_type(nn.modules.pooling._MaxPoolNd)
-    def maxpoolnd(module, input_field=None):
-        return ReceptiveFieldFor._kernelized(module, input_field)
-
-    @staticmethod
-    @compute_type(nn.modules.pooling._AvgPoolNd)
-    def avepoolnd(module, input_field=None):
-        return ReceptiveFieldFor._kernelized(module, input_field)
-
-    @staticmethod
-    @compute_type(nn.modules.pooling._AdaptiveMaxPoolNd, nn.modules.pooling._AdaptiveAvgPoolNd)
-    def adaptive_avepoolnd(module, input_field=None):
-        """
-        It is not possible to analytically compute an adaptive receptive
-        field.
-
-        References:
-            https://forums.fast.ai/t/ideas-behind-adaptive-max-pooling/12634/3
-            https://arxiv.org/abs/1406.4729
-        """
-        raise Exception('not possible to compute adaptive RF without knowing the input_shape ahead of time')
-        # return ReceptiveFieldFor._kernelized(module, input_field)
-
-    @staticmethod
-    @compute_type(nn.ReLU)
-    def relu(module, input_field=None):
-        return ReceptiveFieldFor._unchanged(module, input_field)
-
-    @staticmethod
-    @compute_type(nn.ReLU6, nn.PReLU, nn.LeakyReLU, nn.ELU, nn.CELU, nn.SELU)
-    def _unchanged_activation(module, input_field=None):
-        return ReceptiveFieldFor._unchanged(module, input_field)
-
-    @staticmethod
-    @compute_type(nn.functional.relu, nn.functional.relu6)
-    def _unchanged_activation_func(input_field=None):
-        # return ReceptiveFieldFor._unchanged(module, input_field)
-        return ReceptiveFieldFor._unchanged(None, input_field)
-
-    @staticmethod
-    @compute_type(nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d,
-                  nn.modules.normalization.GroupNorm,
-                  nn.modules.normalization.LocalResponseNorm,
-                  nn.modules.normalization.LayerNorm, nn.CrossMapLRN2d,
-                  nn.InstanceNorm1d, nn.InstanceNorm2d, nn.InstanceNorm3d)
-    def normalization(module, input_field=None):
-        return ReceptiveFieldFor._unchanged(module, input_field)
-
-    @staticmethod
-    @compute_type(nn.modules.dropout._DropoutNd)
-    def dropout(module, input_field=None):
-        return ReceptiveFieldFor._unchanged(module, input_field)
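Restating the convE translation above with concrete numbers may help: for a ConvTranspose2d with kernel_size=3, stride=2, padding=0, dilation=1, the equivalent regular convolution sees one inserted zero between input pixels and runs at a fractional stride of 1/2. A quick sketch of just the effective-parameter arithmetic from the comments above:

    >>> k_, s_, p_, d_ = 3, 2, 0, 1          # the convT parameters
    >>> p = (k_ - 1) * d_ - p_               # padding of the equivalent convE
    >>> extra_zeros = s_ - 1                 # zeros inserted between input pixels
    >>> effective_support = max(0, (k_ - 1 - extra_zeros) * d_)
    >>> effective_stride = 1 / s_            # the "fractional stride"
    >>> (p, extra_zeros, effective_support, effective_stride)
    (2, 1, 1, 0.5)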
-    @staticmethod
-    @compute_type(nn.Sequential)
-    def sequential(module, input_field=None):
-        """
-        Example:
-            >>> import netharn as nh
-            >>> self = nn.Sequential(
-            >>>     nn.Conv2d(2, 3, kernel_size=3),
-            >>>     nn.Conv2d(3, 5, kernel_size=3),
-            >>>     nn.Conv2d(5, 7, kernel_size=3),
-            >>> )
-            >>> rfield = nh.ReceptiveFieldFor(self)()
-            >>> print('rfield = {}'.format(ub.repr2(rfield, nl=1, with_dtype=False)))
-            rfield = {
-                'crop': np.array([3., 3.]),
-                'shape': np.array([7., 7.]),
-                'stride': np.array([1., 1.]),
-            }
-        """
-        if input_field is None:
-            input_field = ReceptiveFieldFor.input()
-        rfield = input_field
-        hidden = HiddenFields()
-        for key, child in module._modules.items():
-            if hasattr(child, 'receptive_field_for'):
-                rfield = hidden[key] = child.receptive_field_for(rfield)
-            else:
-                rfield = hidden[key] = ReceptiveFieldFor(child)(rfield)
-        rfield = ReceptiveField.coerce(rfield)
-        rfield.hidden = hidden
-        return rfield
-
-    @staticmethod
-    @compute_type(torch.nn.DataParallel)
-    def data_parallel(module, *args, **kw):
-        return ReceptiveFieldFor(module.module)(*args, **kw)
-
-
-class _TorchvisionMixin(object):
-    """
-    Compute receptive fields for components of torchvision models.
-    """
-
-    @staticmethod
-    @compute_type(torchvision.models.resnet.BasicBlock)
-    def resent_basic_block(module, input_field=None):
-        """
-        Example:
-            >>> # xdoctest: +REQUIRES(--network)
-            >>> import torchvision  # NOQA
-            >>> module = torchvision.models.resnet18().layer1[0]
-            >>> field = ReceptiveFieldFor(module)()
-            >>> print(ub.repr2(field.hidden, nl=1, with_dtype=False))
-            {
-                'conv1': {'crop': np.array([0., 0.]), 'shape': np.array([3., 3.]), 'stride': np.array([1., 1.])},
-                'bn1': {'crop': np.array([0., 0.]), 'shape': np.array([3., 3.]), 'stride': np.array([1., 1.])},
-                'relu1': {'crop': np.array([0., 0.]), 'shape': np.array([3., 3.]), 'stride': np.array([1., 1.])},
-                'conv2': {'crop': np.array([0., 0.]), 'shape': np.array([5., 5.]), 'stride': np.array([1., 1.])},
-                'bn2': {'crop': np.array([0., 0.]), 'shape': np.array([5., 5.]), 'stride': np.array([1., 1.])},
-                'relu2': {'crop': np.array([0., 0.]), 'shape': np.array([5., 5.]), 'stride': np.array([1., 1.])},
-            }
-        """
-        if input_field is None:
-            input_field = ReceptiveFieldFor.input()
-        hidden = HiddenFields()
-
-        rfield = input_field
-
-        rfield = hidden['conv1'] = ReceptiveFieldFor(module.conv1)(rfield)
-        rfield = hidden['bn1'] = ReceptiveFieldFor(module.bn1)(rfield)
-        rfield = hidden['relu1'] = ReceptiveFieldFor(module.relu)(rfield)
-
-        rfield = hidden['conv2'] = ReceptiveFieldFor(module.conv2)(rfield)
-        rfield = hidden['bn2'] = ReceptiveFieldFor(module.bn2)(rfield)
-        rfield = hidden['relu2'] = ReceptiveFieldFor(module.relu)(rfield)
-
-        if module.downsample is not None:
-            hidden['downsample'] = ReceptiveFieldFor(module.downsample)(input_field)
-
-        rfield = ReceptiveFieldFor(module.relu)(rfield)
-        rfield.hidden = hidden
-        return rfield
-
-    @staticmethod
-    @compute_type(torchvision.models.resnet.Bottleneck)
-    def resent_bottleneck(module, input_field=None):
-        """
-        CommandLine:
-            xdoctest -m netharn.receptive_field_for _TorchvisionMixin.resent_bottleneck --network
-
-        Example:
-            >>> # xdoctest: +REQUIRES(--network)
-            >>> import torchvision  # NOQA
-            >>> module = torchvision.models.resnet50().layer1[0]
-            >>> field = ReceptiveFieldFor(module)()
-            >>> print(ub.repr2(field.hidden.shallow(1), nl=1, with_dtype=False))
-            {
-                'conv1': {'crop': ...([0., 0.]), 'shape': ...([1., 1.]), 'stride': ...([1., 1.])},
-                'bn1': {'crop': ...([0., 0.]), 'shape': ...([1., 1.]), 'stride': ...([1., 1.])},
-                'relu1': {'crop': ...([0., 0.]), 'shape': ...([1., 1.]), 'stride': ...([1., 1.])},
-                'conv2': {'crop': ...([0., 0.]), 'shape': ...([3., 3.]), 'stride': ...([1., 1.])},
-                'bn2': {'crop': ...([0., 0.]), 'shape': ...([3., 3.]), 'stride': ...([1., 1.])},
-                'relu2': {'crop': ...([0., 0.]), 'shape': ...([3., 3.]), 'stride': ...([1., 1.])},
-                'conv3': {'crop': ...([0., 0.]), 'shape': ...([3., 3.]), 'stride': ...([1., 1.])},
-                'bn3': {'crop': ...([0., 0.]), 'shape': ...([3., 3.]), 'stride': ...([1., 1.])},
-                'downsample': {'crop': ...([0., 0.]), 'shape': ...([1., 1.]), 'stride': ...([1., 1.])},
-            }
-        """
-        if input_field is None:
-            input_field = ReceptiveFieldFor.input()
-        rfield = input_field
-        hidden = HiddenFields()
-
-        rfield = hidden['conv1'] = ReceptiveFieldFor(module.conv1)(rfield)
-        rfield = hidden['bn1'] = ReceptiveFieldFor(module.bn1)(rfield)
-        rfield = hidden['relu1'] = ReceptiveFieldFor(module.relu)(rfield)
-
-        rfield = hidden['conv2'] = ReceptiveFieldFor(module.conv2)(rfield)
-        rfield = hidden['bn2'] = ReceptiveFieldFor(module.bn2)(rfield)
-        rfield = hidden['relu2'] = ReceptiveFieldFor(module.relu)(rfield)
-
-        rfield = hidden['conv3'] = ReceptiveFieldFor(module.conv3)(rfield)
-        rfield = hidden['bn3'] = ReceptiveFieldFor(module.bn3)(rfield)
-
-        if module.downsample is not None:
-            hidden['downsample'] = ReceptiveFieldFor(module.downsample)(input_field)
-
-        rfield = ReceptiveFieldFor(module.relu)(rfield)
-        rfield.hidden = hidden
-        return rfield
-
-    @staticmethod
-    @compute_type(torchvision.models.resnet.ResNet)
-    def resnet_model(module, input_field=None, input_shape=None):
-        """
-        CommandLine:
-            xdoctest -m netharn.receptive_field_for _TorchvisionMixin.resnet_model --network
-
-        Example:
-            >>> # DISABLE_DOCTEST
-            >>> # Note: newest torchvision breaks this
-            >>> # xdoctest: +REQUIRES(--network)
-            >>> from netharn.receptive_field_for import *
-            >>> module = torchvision.models.resnet50()
-            >>> input_shape = (1, 3, 224, 224)
-            >>> field = ReceptiveFieldFor(module)(input_shape=input_shape)
-            >>> print(ub.repr2(field.hidden.shallow(1), nl=1, with_dtype=False))
-            {
-                'conv1': {'crop': ...([0., 0.]), 'shape': ...([7., 7.]), 'stride': ...([2., 2.])},
-                'bn1': {'crop': ...([0., 0.]), 'shape': ...([7., 7.]), 'stride': ...([2., 2.])},
-                'relu1': {'crop': ...([0., 0.]), 'shape': ...([7., 7.]), 'stride': ...([2., 2.])},
-                'maxpool': {'crop': ...([0., 0.]), 'shape': ...([11., 11.]), 'stride': ...([4., 4.])},
-                'layer1': {'crop': ...([0., 0.]), 'shape': ...([35., 35.]), 'stride': ...([4., 4.])},
-                'layer2': {'crop': ...([0., 0.]), 'shape': ...([91., 91.]), 'stride': ...([8., 8.])},
-                'layer3': {'crop': ...([0., 0.]), 'shape': ...([267., 267.]), 'stride': ...([16., 16.])},
-                'layer4': {'crop': ...([0., 0.]), 'shape': ...([427., 427.]), 'stride': ...([32., 32.])},
-                'avgpool': {'crop': ...([96., 96.]), 'shape': ...([619., 619.]), 'stride': ...([32., 32.])},
-                'flatten': {'crop': ...([96., 96.]), 'shape': ...([811., 811.]), 'stride': ...([32., 32.])},
-                'fc': {'crop': ...([96., 96.]), 'shape': ...([811., 811.]), 'stride': ...([32., 32.])},
-            }
-        """
-        if input_field is None:
-            input_field = ReceptiveFieldFor.input()
-        rfield = input_field
-        hidden = HiddenFields()
-        rfield = hidden['conv1'] = ReceptiveFieldFor(module.conv1)(rfield)
-        rfield = hidden['bn1'] = ReceptiveFieldFor(module.bn1)(rfield)
-        rfield = hidden['relu1'] = ReceptiveFieldFor(module.relu)(rfield)
-        rfield = hidden['maxpool'] = ReceptiveFieldFor(module.maxpool)(rfield)
-
-        rfield = hidden['layer1'] = ReceptiveFieldFor(module.layer1)(rfield)
-        rfield = hidden['layer2'] = ReceptiveFieldFor(module.layer2)(rfield)
-        rfield = hidden['layer3'] = ReceptiveFieldFor(module.layer3)(rfield)
-        rfield = hidden['layer4'] = ReceptiveFieldFor(module.layer4)(rfield)
-
-        rfield = hidden['avgpool'] = ReceptiveFieldFor(module.avgpool)(rfield)
-
-        if input_shape is None:
-            raise ValueError('input shape is required')
-
-        output_shape = OutputShapeFor(module)(input_shape)
-        avgpool_shape = output_shape.hidden.shallow(1)['layer4']
-        spatial_shape = np.array(avgpool_shape[2:])
-
-        # Keep everything the same, except increase the RF shape based on
-        # how many output pixels there are.
-        rfield_flatten = ReceptiveField.coerce(dict(**rfield))
-        # not sure if this is 100% correct
-        rfield_flatten['shape'] = rfield['shape'] + (spatial_shape - 1) * rfield['stride']
-        rfield = hidden['flatten'] = rfield_flatten
-
-        # The reshape operation will blend the receptive fields of the
-        # inputs, but it will depend on the output shape of the layer.
-        # rfield = (rfield[0], prod(rfield[1:]))
-
-        rfield = hidden['fc'] = ReceptiveFieldFor(module.fc)(rfield)
-        rfield.hidden = hidden
-        return rfield
-
-
-class ReceptiveFieldFor(analytic_for.OutputFor, _TorchMixin, _TorchvisionMixin):
-    """
-    Knows how to compute the receptive fields for many pytorch primitives
-    and some torchvision components.
-
-    References:
-        https://medium.com/mlreview/a-guide-to-receptive-field-arithmetic-for-convolutional-neural-networks-e0f514068807
-
-    Returns:
-        Tuple[object, Dict]:
-            fields: object: The hidden layer receptive fields (can be
-                complex due to nesting)
-            field: Dict: a dictionary containing receptive field information.
-
-    Notes:
-        A 1-D Pixel
-            +-----+
-            ^  ^  ^
-       left |  L  | right
-              center
-
-    Example:
-        >>> # Case where we have a registered func
-        >>> from netharn.receptive_field_for import *
-        >>> self = nn.Sequential(
-        >>>     nn.Conv2d(2, 3, kernel_size=3),
-        >>>     nn.Conv2d(3, 5, kernel_size=3),
-        >>> )
-        >>> rfield = ReceptiveFieldFor(self)()
-        >>> print('rfield.hidden = {}'.format(ub.repr2(rfield.hidden, nl=3, with_dtype=False)))
-        >>> print('rfield = {}'.format(ub.repr2(rfield, nl=1, with_dtype=False)))
-        rfield.hidden = {
-            '0': {
-                'crop': np.array([1., 1.]),
-                'shape': np.array([3., 3.]),
-                'stride': np.array([1., 1.]),
-            },
-            '1': {
-                'crop': np.array([2., 2.]),
-                'shape': np.array([5., 5.]),
-                'stride': np.array([1., 1.]),
-            },
-        }
-        rfield = {
-            'crop': np.array([2., 2.]),
-            'shape': np.array([5., 5.]),
-            'stride': np.array([1., 1.]),
-        }
-
-    Example:
-        >>> # Case where we haven't registered a func
-        >>> self = nn.Conv2d(2, 3, kernel_size=3)
-        >>> rfield = ReceptiveFieldFor(self)()
-        >>> print('rfield = {}'.format(ub.repr2(rfield, nl=1, with_dtype=False)))
-        rfield = {
-            'crop': np.array([1., 1.]),
-            'shape': np.array([3., 3.]),
-            'stride': np.array([1., 1.]),
-        }
-
-    Example:
-        >>> # xdoctest: +REQUIRES(--network)
-        >>> import torchvision  # NOQA
-        >>> module = torchvision.models.alexnet().features
-        >>> field = ReceptiveFieldFor(module)()
-        >>> print(ub.repr2(field, nl=1, with_dtype=False))
-        {
-            'crop': np.array([31., 31.]),
-            'shape': np.array([195., 195.]),
-            'stride': np.array([32., 32.]),
-        }
-    """
-    # impl = math  # for hacking in sympy
-
-    def __init__(self, module):
-        self.module = module
-        self._func = getattr(module, 'receptive_field_for', None)
-        if self._func is None:
-            # Lookup rfield func if we can't find it
-            found = []
-            for type, _func in REGISTERED_TYPES:
-                try:
-                    if module is type or isinstance(module, type):
-                        found.append(_func)
-                except TypeError:
-                    pass
-            if len(found) == 1:
-                self._func = found[0]
-            elif len(found) == 0:
-                raise ReceptiveFieldTypeError('Unknown (rf) module type {}'.format(module))
-            else:
-                raise AssertionError('Ambiguous (rf) module {}. Found {}'.format(module, found))
-
-    def __call__(self, *args, **kwargs):
-        if isinstance(self.module, nn.Module):
-            # bound methods don't need the module
-            is_bound = hasattr(self._func, '__func__') and getattr(self._func, '__func__', None) is not None
-            is_bound |= hasattr(self._func, 'im_func') and getattr(self._func, 'im_func', None) is not None
-            if is_bound:
-                rfield = self._func(*args, **kwargs)
-            else:
-                # nn.Module with state
-                rfield = self._func(self.module, *args, **kwargs)
-        else:
-            # a simple pytorch func
-            rfield = self._func(*args, **kwargs)
-
-        rfield = ReceptiveField.coerce(rfield)
-        return rfield
-
-    # @staticmethod
-    # def view(arr, *args):
-    #     """
-    #     Wraps view calls
-
-    #     Example:
-    #         >>> arr = (2, 32, 9, 9)
-    #         >>> result = OutputShapeFor.view(arr, -1)
-    #         >>> assert result == (5184,)
-    #     """
-    #     from netharn import layers
-    #     reshape = layers.Reshape(*args)
-    #     return reshape.output_shape_for(arr)
-
-    # @staticmethod
-    def shape(arr):
-        """
-        Wraps shape calls
-        """
-        raise ReceptiveFieldTypeError('RF is currently unable to inspect output shape')
-
-    @staticmethod
-    def _elementwise(field1, field2):
-        # Combines two receptive fields in an elementwise fashion
-        field = ReceptiveField({
-            'shape': np.maximum(field1['shape'], field2['shape']),
-            'crop': np.maximum(field1['crop'], field2['crop']),
-            'stride': np.maximum(field1['stride'], field2['stride']),
-        })
-        return field
-
-    @staticmethod
-    def add(field1, field2):
-        return ReceptiveFieldFor._elementwise(field1, field2)
-
-    @staticmethod
-    def mul(field1, field2):
-        return ReceptiveFieldFor._elementwise(field1, field2)
-
-    @staticmethod
-    def sub(field1, field2):
-        return ReceptiveFieldFor._elementwise(field1, field2)
-
-    @staticmethod
-    def div(field1, field2):
-        return ReceptiveFieldFor._elementwise(field1, field2)
-
-
-def effective_receptive_feild(module, inputs, output_key=None, sigma=0,
-                              thresh=1.00, ignore_norms=True,
-                              ignore_extra=None):
-    """
-    Empirically measures the effective receptive field of a network.
-
-    Method from [0], implementation loosely based on [1].
-
-    Args:
-        module (torch.nn.Module): the network
-
-        inputs (torch.nn.Tensor): the input to the network. Must share the
-            same device as `module`.
-
-        output_key (None | str | Callable): If the network outputs a
-            non-tensor then this should be a function that does
-            postprocessing and returns a relevant Tensor that can be used to
-            compute gradients. If the output is a dictionary then this can
-            also be a string-based key used to look up the appropriate
-            output.
-
-        sigma (float, default=0): smoothness factor (via gaussian blur)
-
-        thresh (float, default=1.00): only consider this fraction of the
-            data as meaningful (i.e. find the effective RF shape that
-            explains 95% of the data). A threshold of 1.0 or greater does
-            nothing.
-
-        ignore_norms (bool, default=True): if True, ignores normalization
-            layers like batch and group norm, which add a negligible but
-            non-zero impact everywhere and cause the ERF shape estimation to
-            be dramatically greater than it should be (although the impact
-            still makes sense).
-
-        ignore_extra (List[type], optional): if specified, any layer that is
-            a subclass of one of these types is also ignored.
-
-    Returns:
-        dict: containing keys
-            'shape' containing the effective RF shape and
-            'impact' which contains the thresholded distribution
-
-    References:
-        [0] https://arxiv.org/pdf/1701.04128.pdf
-        [1] https://github.com/rogertrullo/Receptive-Field-in-Pytorch/blob/master/compute_RF.py
-
-    Example:
-        >>> from netharn.receptive_field_for import *
-        >>> import torchvision  # NOQA
-        >>> module = nn.Sequential(*[nn.Conv2d(1, 1, 3) for i in range(10)])
-        >>> inputs = torch.rand(1, 1, 200, 200)
-        >>> emperical_field = effective_receptive_feild(module, inputs)
-        >>> theoretic_field = ReceptiveFieldFor(module)()
-        >>> # The empirical results should never be bigger than the theoretical
-        >>> assert np.all(emperical_field['shape'] <= theoretic_field['shape'])
-
-        >>> # xdoctest: +REQUIRES(--slow)
-        >>> module = torchvision.models.alexnet().features
-        >>> inputs = torch.rand(1, 3, 224, 224)
-        >>> emperical_field = effective_receptive_feild(module, inputs)
-        >>> theoretic_field = ReceptiveFieldFor(module)()
-        >>> # The empirical results should never be bigger than the theoretical
-        >>> assert np.all(emperical_field['shape'] <= theoretic_field['shape'])
-
-        >>> # xdoctest: +REQUIRES(--slow)
-        >>> import netharn as nh
-        >>> xpu = nh.XPU.coerce('auto')
-        >>> module = xpu.move(torchvision.models.vgg11_bn().features)
-        >>> inputs = xpu.move(torch.rand(1, 3, 224, 224))
-        >>> emperical_field = effective_receptive_feild(module, inputs)
-        >>> theoretic_field = ReceptiveFieldFor(module)()
-        >>> # The empirical results should never be bigger than the theoretical
-        >>> assert np.all(emperical_field['shape'] <= theoretic_field['shape'])
-
-        >>> # xdoctest: +REQUIRES(--show)
-        >>> import kwplot
-        >>> kwplot.autompl()
-        >>> kwplot.imshow(emperical_field['impact'], doclf=True)
-
-    Ignore:
-        >>> xpu = nh.XPU.coerce('auto')
-        >>> module = xpu.move(torchvision.models.resnet50())
-        >>> inputs = xpu.move(torch.rand(8, 3, 224, 224))
-        >>> emperical_field = effective_receptive_feild(module, inputs)
-        >>> import kwplot
-        >>> kwplot.autompl()
-        >>> kwplot.imshow(emperical_field['impact'], doclf=True)
-    """
-    import netharn as nh
-
-    # zero gradients
-    for p in module.parameters():
-        if p.grad is not None:
-            p.grad.detach_()
-            p.grad.zero_()
-
-    if inputs.grad is not None:
-        inputs.grad.detach_()
-        inputs.grad.zero_()
-
-    inputs.requires_grad = True
-    # if inputs.grad is not None:
-    #     raise ValueError('inputs already has accumulated gradients')
-
-    # Completely ignore BatchNorm layers as they will give the entire input
-    # some negligible but non-zero effect on the receptive field.
-    ignored = []
-    if ignore_norms:
-        ignored += [
-            nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d,
-            nn.modules.normalization.GroupNorm,
-            nn.modules.normalization.LocalResponseNorm,
-            nn.modules.normalization.LayerNorm, nn.CrossMapLRN2d,
-            nn.InstanceNorm1d, nn.InstanceNorm2d, nn.InstanceNorm3d,
-            nh.layers.L2Norm,
-        ]
-    if ignore_extra:
-        ignored += ignore_extra
-    with nh.util.IgnoreLayerContext(module, tuple(ignored)):
-        outputs = module(inputs)
-
-    # Note: grab a single (likely FCN) output channel
-    if callable(output_key):
-        output_y = output_key(outputs)
-    elif output_key is None:
-        output_y = outputs
-    else:
-        output_y = outputs[output_key]
-    # elif isinstance(output_key, (six.string_types, int)):
-    # else:
-    #     raise TypeError('output_key={} is not understood'.format(output_key))
-
-    if not isinstance(output_y, torch.Tensor):
-        raise TypeError(
-            'The output is a {}, not a tensor. Please specify '
-            'output_key and ensure it returns a Tensor.'.format(type(outputs)))
-
-    # Note: this still does the right thing if there is no spatial
-    # component, because all outputs are center outputs.
-    center_dims = (np.array(output_y.shape[2:]) // 2).tolist()
-    center_slice = [slice(None), slice(None)] + center_dims
-
-    # We don't need to compute a loss because we can explicitly set
-    # gradients. Yay torch!
-    # Explicitly set ∂l/∂y[:] = 0
-    # Explicitly set ∂l/∂y[center] = 1
-    grad_loss_wrt_y = torch.zeros_like(output_y)
-    grad_loss_wrt_y[...] = 0
-    grad_loss_wrt_y[center_slice] = 1
-
-    # Backpropagate as if the grad of the loss wrt y[center] was 1.
-    # Note: this can take a long time on the CPU (sometimes?)
-    output_y.backward(gradient=grad_loss_wrt_y)
-
-    # The input gradient is now a measure of how much it can impact the
-    # output.
-    impact = inputs.grad.abs()
-
-    # Average the impact over all batches and all channels
-    average_impact = impact.mean(dim=0).mean(dim=0)
-
-    if isinstance(average_impact, torch.Tensor):
-        average_impact = average_impact.data.cpu().numpy()
-
-    idx_nonzeros = np.where(average_impact != 0)
-    rf_bounds = [(0, 0) if len(idx) == 0 else (idx.min(), idx.max()) for idx in idx_nonzeros]
-    rf_shape = [(mx - mn + 1) for mn, mx in rf_bounds]
-    rf_slice = tuple([slice(mn, mx + 1) for mn, mx in rf_bounds])
-
-    # Crop out the average impact zone for visualization.
-    # Normalize to have a maximum value of 1.0.
-    rf_impact = average_impact[rf_slice]
-    rf_impact /= rf_impact.max()
-
-    rf_impact = torch.FloatTensor(rf_impact)
-    if sigma > 0:
-        # Smooth things out
-        _blur = nh.layers.GaussianBlurNd(dim=1, num_features=1, sigma=sigma)
-        _blur.to(rf_impact.device)
-        rf_impact = _blur(rf_impact[None, None])[0, 0]
-
-    if thresh < 1:
-        density = rf_impact.contiguous().view(-1).cpu().numpy().copy()
-        density.sort()
-        density = density[::-1]
-        # Find the value threshold that explains thresh (e.g. 95%) of the data
-        idx = np.where(density.cumsum() > thresh * density.sum())[0]
-        lowval = float(density[idx[0]])
-
-        effective_impact = rf_impact * (rf_impact > lowval).float()
-        effective_idx_nonzeros = np.where(effective_impact != 0)
-        effective_rf_bounds = [(idx.min(), idx.max()) for idx in effective_idx_nonzeros]
-        effective_shape = [(mx - mn + 1) for mn, mx in effective_rf_bounds]
-    else:
-        effective_impact = rf_impact
-        effective_rf_bounds = rf_shape
-        effective_shape = rf_shape
-
-    emperical_field = {
-        'shape': effective_shape,
-        'impact': effective_impact,
-        'thresh': thresh,
-    }
-    return emperical_field
-
-
-if __name__ == '__main__':
-    """
-    CommandLine:
-        xdoctest -m netharn.receptive_field_for all --network
-    """
-    import xdoctest
-    xdoctest.doctest_module(__file__)
+import warnings
+warnings.warn('Deprecated file. Use netharn.analytic.receptive_field_for instead', UserWarning)
+from netharn.analytic.receptive_field_for import * # NOQA
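The gradient trick used by `effective_receptive_feild` fits in a few lines. A condensed sketch of the core measurement (no smoothing, thresholding, or norm-layer handling), which should run as-is with torch installed:

    >>> import torch
    >>> import torch.nn as nn
    >>> module = nn.Sequential(*[nn.Conv2d(1, 1, 3) for _ in range(10)])
    >>> inputs = torch.rand(1, 1, 64, 64, requires_grad=True)
    >>> outputs = module(inputs)
    >>> grad = torch.zeros_like(outputs)
    >>> grad[:, :, outputs.shape[2] // 2, outputs.shape[3] // 2] = 1
    >>> outputs.backward(gradient=grad)   # dL/dy = 1 at the center pixel only
    >>> impact = inputs.grad.abs().mean(dim=0).mean(dim=0)
    >>> rf_width = int((impact.sum(dim=0) != 0).sum())
    >>> assert rf_width <= 21             # can never exceed the theoretical RF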
Use netharn.analytic.receptive_field_for instead', UserWarning) +from netharn.analytic.receptive_field_for import * # NOQA diff --git a/netharn/util/__init__.py b/netharn/util/__init__.py index bda3e9e5f89da76618545c76799245d2ea512776..11aa68fe4d5659fc4df1402a12018c8e8de287f2 100644 --- a/netharn/util/__init__.py +++ b/netharn/util/__init__.py @@ -40,14 +40,17 @@ from .util_filesys import (get_file_info,) from .util_fname import (align_paths, check_aligned, dumpsafe, shortest_unique_prefixes, shortest_unique_suffixes,) from .util_idstr import (compact_idstr, make_idstr, make_short_idstr,) +from .util_inspect import (default_kwargs,) from .util_io import (read_arr, read_h5arr, write_arr, write_h5arr,) from .util_iter import (roundrobin,) -from .util_json import (LossyJSONEncoder, NumpyEncoder, read_json, walk_json, +from .util_json import (LossyJSONEncoder, NumpyEncoder, + ensure_json_serializable, read_json, walk_json, write_json,) -from .util_misc import (SupressPrint, FlatIndexer, strip_ansi) +from .util_misc import (FlatIndexer, SupressPrint, align, align_lines, + strip_ansi,) from .util_resources import (ensure_ulimit,) -from .util_slider import (SlidingIndexDataset, SlidingSlices, SlidingWindow, - Stitcher,) +from .util_slider import (SlidingWindow, Stitcher,) +from .util_slider_dep import (SlidingIndexDataset, SlidingSlices,) from .util_subextreme import (argsubmax, argsubmaxima,) from .util_tensorboard import (read_tensorboard_scalars,) from .util_torch import (BatchNormContext, DisableBatchNorm, @@ -62,83 +65,100 @@ from kwarray import (ArrayAPI, DataFrameArray, DataFrameLight, LocLight, group_consecutive_indices, group_indices, group_items, isect_flags, iter_reduce_ufunc, maxvalue_assignment, mincost_assignment, mindist_assignment, one_hot_embedding, - random_combinations, random_product, seed_global, shuffle, - standard_normal, standard_normal32, standard_normal64, - stats_dict, uniform, uniform32,) -from kwimage import (Boxes, Coords, Detections, Heatmap, Mask, MaskList, - MultiPolygon, Points, PointsList, Polygon, PolygonList, + one_hot_lookup, random_combinations, random_product, + seed_global, shuffle, standard_normal, standard_normal32, + standard_normal64, stats_dict, uniform, uniform32,) +from kwimage import (BASE_COLORS, Boxes, CSS4_COLORS, Color, Coords, + Detections, Heatmap, Mask, MaskList, MultiPolygon, Points, + PointsList, Polygon, PolygonList, Segmentation, + SegmentationList, TABLEAU_COLORS, + TORCH_GRID_SAMPLE_HAS_ALIGN, XKCD_COLORS, atleast_3channels, available_nms_impls, convert_colorspace, daq_spatial_nms, decode_run_length, - draw_boxes_on_image, draw_text_on_image, - encode_run_length, ensure_alpha_channel, ensure_float01, - ensure_uint255, gaussian_patch, grab_test_image, - grab_test_image_fpath, imread, imscale, imwrite, - make_channels_comparable, non_max_supression, + draw_boxes_on_image, draw_clf_on_image, + draw_text_on_image, draw_vector_field, encode_run_length, + ensure_alpha_channel, ensure_float01, ensure_uint255, + fourier_mask, gaussian_patch, grab_test_image, + grab_test_image_fpath, imread, imresize, imscale, imwrite, + load_image_shape, make_channels_comparable, make_heatmask, + make_orimask, make_vector_field, non_max_supression, num_channels, overlay_alpha_images, overlay_alpha_layers, - rle_translate, smooth_prob, stack_images, - stack_images_grid, subpixel_accum, subpixel_align, - subpixel_getvalue, subpixel_maximum, subpixel_minimum, - subpixel_set, subpixel_setvalue, subpixel_slice, - subpixel_translate, warp_points, warp_tensor,) + 
radial_fourier_mask, rle_translate, smooth_prob, + stack_images, stack_images_grid, subpixel_accum, + subpixel_align, subpixel_getvalue, subpixel_maximum, + subpixel_minimum, subpixel_set, subpixel_setvalue, + subpixel_slice, subpixel_translate, warp_points, + warp_tensor,) from kwplot import (Color, PlotNums, autompl, autoplt, distinct_colors, - distinct_markers, draw_boxes, draw_clf_on_image, - draw_line_segments, ensure_fnum, figure, imshow, legend, - make_conv_images, make_heatmask, make_orimask, - make_vector_field, multi_plot, next_fnum, - plot_convolutional_features, plot_matrix, plot_surface3d, - set_figtitle, set_mpl_backend, show_if_requested,) + distinct_markers, draw_boxes, draw_boxes_on_image, + draw_clf_on_image, draw_line_segments, draw_text_on_image, + ensure_fnum, figure, imshow, legend, make_conv_images, + make_heatmask, make_orimask, make_vector_field, multi_plot, + next_fnum, plot_convolutional_features, plot_matrix, + plot_surface3d, set_figtitle, set_mpl_backend, + show_if_requested,) -__all__ = ['ArrayAPI', 'BatchNormContext', 'Boxes', 'CacheStamp', 'Color', - 'Coords', 'CumMovingAve', 'DataFrameArray', 'DataFrameLight', - 'Detections', 'DisableBatchNorm', 'ExpMovingAve', 'Heatmap', +__all__ = ['ArrayAPI', 'BASE_COLORS', 'BatchNormContext', 'Boxes', + 'CSS4_COLORS', 'CacheStamp', 'Color', 'Color', 'Coords', + 'CumMovingAve', 'DataFrameArray', 'DataFrameLight', 'Detections', + 'DisableBatchNorm', 'ExpMovingAve', 'FlatIndexer', 'Heatmap', 'IS_PROFILING', 'IgnoreLayerContext', 'InternalRunningStats', - 'LocLight', 'LossyJSONEncoder', 'Mask', 'MaskList', 'ModuleMixin', - 'MovingAve', 'MultiPolygon', 'NumpyEncoder', 'PlotNums', 'Points', - 'PointsList', 'Polygon', 'PolygonList', 'RunningStats', + 'LocLight', 'LossyJSONEncoder', 'Mask', 'MaskList', + 'ModuleMixin', 'MovingAve', 'MultiPolygon', 'NumpyEncoder', + 'PlotNums', 'Points', 'PointsList', 'Polygon', 'PolygonList', + 'RunningStats', 'Segmentation', 'SegmentationList', 'SlidingIndexDataset', 'SlidingSlices', 'SlidingWindow', 'Stitcher', - 'SupressPrint', 'WindowedMovingAve', 'absdev', 'adjust_gamma', - 'adjust_subplots', 'aggensure', 'align_paths', 'apply_grouping', - 'arglexmax', 'argmaxima', 'argminima', 'argsubmax', 'argsubmaxima', - 'atleast_3channels', 'atleast_nd', 'autompl', 'autoplt', - 'available_nms_impls', 'axes_extent', 'boolmask', 'check_aligned', - 'colorbar', 'colorbar_image', 'compact_idstr', 'convert_colorspace', + 'SupressPrint', 'TABLEAU_COLORS', 'TORCH_GRID_SAMPLE_HAS_ALIGN', + 'WindowedMovingAve', 'XKCD_COLORS', 'absdev', 'adjust_gamma', + 'adjust_subplots', 'aggensure', 'align', 'align_lines', + 'align_paths', 'apply_grouping', 'arglexmax', 'argmaxima', + 'argminima', 'argsubmax', 'argsubmaxima', 'atleast_3channels', + 'atleast_nd', 'autompl', 'autoplt', 'available_nms_impls', + 'axes_extent', 'boolmask', 'check_aligned', 'colorbar', + 'colorbar_image', 'compact_idstr', 'convert_colorspace', 'copy_figure_to_clipboard', 'daq_spatial_nms', 'dataframe_light', - 'decode_run_length', 'distinct_colors', 'distinct_markers', - 'draw_border', 'draw_boxes', 'draw_boxes_on_image', + 'decode_run_length', 'default_kwargs', 'distinct_colors', + 'distinct_markers', 'draw_border', 'draw_boxes', + 'draw_boxes_on_image', 'draw_boxes_on_image', 'draw_clf_on_image', 'draw_clf_on_image', 'draw_line_segments', 'draw_text_on_image', - 'dumpsafe', 'encode_run_length', 'ensure_alpha_channel', - 'ensure_float01', 'ensure_fnum', 'ensure_grayscale', 'ensure_rng', - 'ensure_uint255', 'ensure_ulimit', 'extract_axes_extents', 
-           'freeze_params', 'gaussian_patch', 'get_file_info',
-           'get_num_channels', 'grab_test_image', 'grab_test_image_fpath',
-           'grad_context', 'group_consecutive', 'group_consecutive_indices',
-           'group_indices', 'group_items', 'image_slices', 'imread', 'imscale',
+           'draw_text_on_image', 'draw_vector_field', 'dumpsafe',
+           'encode_run_length', 'ensure_alpha_channel', 'ensure_float01',
+           'ensure_fnum', 'ensure_grayscale', 'ensure_json_serializable',
+           'ensure_rng', 'ensure_uint255', 'ensure_ulimit',
+           'extract_axes_extents', 'figure', 'fourier_mask', 'freeze_params',
+           'gaussian_patch', 'get_file_info', 'get_num_channels',
+           'grab_test_image', 'grab_test_image_fpath', 'grad_context',
+           'group_consecutive', 'group_consecutive_indices', 'group_indices',
+           'group_items', 'image_slices', 'imread', 'imresize', 'imscale',
            'imshow', 'imwrite', 'interpolated_colormap', 'isect_flags',
            'iter_reduce_ufunc', 'legend', 'load_image_paths',
-           'make_channels_comparable', 'make_conv_images', 'make_heatmask',
-           'make_idstr', 'make_legend_img', 'make_orimask', 'make_short_idstr',
-           'make_vector_field', 'maxvalue_assignment', 'mincost_assignment',
-           'mindist_assignment', 'multi_plot', 'next_fnum',
-           'non_max_supression', 'num_channels', 'number_of_parameters',
-           'one_hot_embedding', 'one_hot_embedding', 'one_hot_lookup',
-           'overlay_alpha_images', 'overlay_alpha_layers', 'overlay_colorized',
-           'pandas_plot_matrix', 'plot_convolutional_features', 'plot_matrix',
-           'plot_surface3d', 'profile', 'profile_now', 'profiler', 'qtensure',
-           'random_combinations', 'random_product', 'read_arr', 'read_h5arr',
-           'read_json', 'read_tensorboard_scalars', 'render_figure_to_image',
-           'reverse_colormap', 'rle_translate', 'roundrobin', 'save_parts',
-           'savefig2', 'scores_to_cmap', 'scores_to_color', 'seed_global',
-           'set_figtitle', 'set_mpl_backend', 'shortest_unique_prefixes',
-           'shortest_unique_suffixes', 'show_if_requested', 'shuffle',
-           'smooth_prob', 'split_archive', 'stack_images', 'stack_images_grid',
-           'standard_normal', 'standard_normal32', 'standard_normal64',
-           'stats_dict', 'subpixel_accum', 'subpixel_align',
+           'load_image_shape', 'make_channels_comparable', 'make_conv_images',
+           'make_heatmask', 'make_heatmask', 'make_idstr', 'make_legend_img',
+           'make_orimask', 'make_orimask', 'make_short_idstr',
+           'make_vector_field', 'make_vector_field', 'maxvalue_assignment',
+           'mincost_assignment', 'mindist_assignment', 'multi_plot',
+           'next_fnum', 'non_max_supression', 'num_channels',
+           'number_of_parameters', 'one_hot_embedding', 'one_hot_embedding',
+           'one_hot_lookup', 'one_hot_lookup', 'overlay_alpha_images',
+           'overlay_alpha_layers', 'overlay_colorized', 'pandas_plot_matrix',
+           'plot_convolutional_features', 'plot_matrix', 'plot_surface3d',
+           'profile', 'profile_now', 'profiler', 'qtensure',
+           'radial_fourier_mask', 'random_combinations', 'random_product',
+           'read_arr', 'read_h5arr', 'read_json', 'read_tensorboard_scalars',
+           'render_figure_to_image', 'reverse_colormap', 'rle_translate',
+           'roundrobin', 'save_parts', 'savefig2', 'scores_to_cmap',
+           'scores_to_color', 'seed_global', 'set_figtitle', 'set_mpl_backend',
+           'shortest_unique_prefixes', 'shortest_unique_suffixes',
+           'show_if_requested', 'shuffle', 'smooth_prob', 'split_archive',
+           'stack_images', 'stack_images_grid', 'standard_normal',
+           'standard_normal32', 'standard_normal64', 'stats_dict',
+           'strip_ansi', 'subpixel_accum', 'subpixel_align',
            'subpixel_getvalue', 'subpixel_maximum', 'subpixel_minimum',
            'subpixel_set', 'subpixel_setvalue', 'subpixel_slice',
            'subpixel_translate', 'torch_ravel_multi_index',
            'trainable_layers', 'uniform', 'uniform32', 'util_dataframe',
            'walk_json', 'warp_points', 'warp_tensor', 'wide_strides_1d',
            'write_arr',
-           'write_h5arr', 'write_json', 'zopen', 'FlatIndexer', 'strip_ansi']
+           'write_h5arr', 'write_json', 'zopen']
 #
diff --git a/netharn/util/layer_rotation.py b/netharn/util/layer_rotation.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2021bf5c5000ca2aada59f52ec36f411f26a045
--- /dev/null
+++ b/netharn/util/layer_rotation.py
@@ -0,0 +1,78 @@
+"""
+Implementation of "Layer rotation: a surprisingly powerful indicator of
+generalization in deep networks?"
+
+References:
+    https://arxiv.org/pdf/1806.01603.pdf
+    https://github.com/vfdev-5/LayerRotation-pytorch/blob/master/code/handlers/layer_rotation.py
+"""
+import ubelt as ub
+import numpy as np
+import torch
+
+
+def _get_named_params(model, copy=False):
+    """
+    Example:
+        >>> import netharn as nh
+        >>> model = nh.models.ToyNet2d()
+        >>> dict(_get_named_params(model)).keys()
+    """
+    def fn(p):
+        p = p.cpu().detach()
+        if copy:
+            p = p.clone()
+        return p
+    named_params = [
+        (key, fn(p)) for key, p in model.named_parameters()
+        if 'weight' in key
+    ]
+    return named_params
+
+
+def layer_rotation(current_params, init_params):
+    """
+    Example:
+        >>> import netharn as nh
+        >>> model = nh.models.ToyNet2d()
+        >>> model2 = nh.models.ToyNet2d()
+        >>> init_params = _get_named_params(model)
+        >>> current_params = _get_named_params(model2)
+        >>> ret = layer_rotation(current_params, init_params)
+    """
+    ret = []
+    for (n1, p1), (n2, p2) in zip(current_params, init_params):
+        assert n1 == n2, "{} vs {}".format(n1, n2)
+        sim = torch.cosine_similarity(p1.reshape(-1), p2.reshape(-1), dim=0).item()
+        dist = 1.0 - sim
+        ret.append((n1, dist))
+    return ret
+
+
+class LayerRotation(ub.NiceRepr):
+    """
+    Example:
+        >>> import netharn as nh
+        >>> model = nh.models.ToyNet2d()
+        >>> self = LayerRotation(model)
+        >>> self.measure()
+        >>> nh.initializers.KaimingNormal()(self.model)
+        >>> self.measure()
+    """
+    def __init__(self, model):
+        self.model = model
+        self.init_params = _get_named_params(model, copy=True)
+        self.stats = None
+
+    def __nice__(self):
+        return ub.repr2(self.stats)
+
+    def measure(self):
+        import kwarray
+        current_params = _get_named_params(self.model)
+        ret = layer_rotation(current_params, self.init_params)
+        values = np.array([v for n, v in ret])
+        self.stats = kwarray.stats_dict(values, median=True)
+        return self.stats
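For context, the new LayerRotation monitor above can also be driven by hand
outside of FitHarn. A minimal sketch using only the API defined in this file
(ToyNet2d stands in for a real model; parameter updates are elided):

    import netharn as nh
    from netharn.util.layer_rotation import LayerRotation

    model = nh.models.ToyNet2d()
    monitor = LayerRotation(model)  # snapshots a copy of the initial 'weight' params
    # ... run some training steps that update the model parameters ...
    stats = monitor.measure()  # kwarray.stats_dict summary of cosine distances
    print(stats)

Each call to measure() compares the current 'weight' tensors against the
copies captured at construction time, so the distances start at zero and grow
as training rotates the weights away from their initialization.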
diff --git a/netharn/util/util_averages.py b/netharn/util/util_averages.py
index e2400b14a210021a6d59f5858dfab6e56bdadac8..9057e972df699a86197de46de29564be21230145 100644
--- a/netharn/util/util_averages.py
+++ b/netharn/util/util_averages.py
@@ -329,13 +329,14 @@ class ExpMovingAve(MovingAve):
         return self
 
 
-class RunningStats(object):
+class RunningStats(ub.NiceRepr):
     """
     Dynamically records per-element array statistics and can summarize them
     per-element, across channels, or globally.
 
     TODO:
-        - [ ] This may need a few API tweaks and good documentation
+        - [ ] Move to kwarray
 
     SeeAlso:
         InternalRunningStats
@@ -363,6 +364,16 @@ class RunningStats(object):
         run.raw_squares = 0
         run.n = 0
 
+    def __nice__(self):
+        return '{}'.format(self.shape)
+
+    @property
+    def shape(run):
+        try:
+            return run.raw_total.shape
+        except Exception:
+            return None
+
     def update(run, img):
         run.n += 1
         # Update stats across images
diff --git a/netharn/util/util_inspect.py b/netharn/util/util_inspect.py
new file mode 100644
index 0000000000000000000000000000000000000000..ffa451290d04e5100a23885b17534bad6dae1ed8
--- /dev/null
+++ b/netharn/util/util_inspect.py
@@ -0,0 +1,44 @@
+import six
+
+
+def default_kwargs(cls):
+    """
+    Grab initkw defaults from the constructor
+
+    Args:
+        cls (type | callable): a class or function
+
+    Example:
+        >>> from netharn.util.util_inspect import *  # NOQA
+        >>> import netharn as nh
+        >>> import torch
+        >>> import ubelt as ub
+        >>> cls = torch.optim.Adam
+        >>> default_kwargs(cls)
+        >>> cls = nh.initializers.KaimingNormal
+        >>> print(ub.repr2(default_kwargs(cls), nl=0))
+        {'mode': 'fan_in', 'param': 0}
+        >>> cls = nh.initializers.NoOp
+        >>> default_kwargs(cls)
+        {}
+
+    SeeAlso:
+        xinspect.get_func_kwargs(cls)
+    """
+    if six.PY2:
+        if cls.__init__ is object.__init__:
+            # hack for python2 classes without __init__
+            return {}
+        else:
+            import funcsigs
+            sig = funcsigs.signature(cls)
+    else:
+        import inspect
+        sig = inspect.signature(cls)
+
+    default_kwargs = {
+        k: p.default
+        for k, p in sig.parameters.items()
+        if p.default is not p.empty
+    }
+    return default_kwargs
diff --git a/netharn/util/util_json.py b/netharn/util/util_json.py
index 3584676642746049c62271f3db3ac6ff4151244d..34aab8ab6d65ddd9635f7cb3758625ac61c117f0
--- a/netharn/util/util_json.py
+++ b/netharn/util/util_json.py
@@ -1,9 +1,12 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import, division, print_function, unicode_literals
+import copy
 import json
 import six
+import torch
 import numpy as np
 import ubelt as ub
+from collections import OrderedDict
 
 
 def walk_json(node):
@@ -123,3 +126,136 @@ def read_json(fpath):
         return json.load(open(fpath, 'r'))
     else:
         return json.load(fpath)
+
+
+def ensure_json_serializable(dict_, normalize_containers=False, verbose=0):
+    """
+    Attempt to convert common types (e.g. numpy) into something JSON compliant
+
+    Converts numpy arrays and tuples into lists
+
+    Args:
+        normalize_containers (bool, default=False):
+            if True, normalizes dict containers to be standard python
+            structures.
+
+    Example:
+        >>> from netharn.hyperparams import *  # NOQA
+        >>> data = ub.ddict(lambda: int)
+        >>> data['foo'] = ub.ddict(lambda: int)
+        >>> data['bar'] = np.array([1, 2, 3])
+        >>> data['foo']['a'] = 1
+        >>> data['foo']['b'] = torch.FloatTensor([1, 2, 3])
+        >>> result = ensure_json_serializable(data, normalize_containers=True)
+        >>> assert type(result) is dict
+    """
+    dict_ = copy.deepcopy(dict_)
+
+    def _norm_container(c):
+        if isinstance(c, dict):
+            # Cast to a normal dictionary
+            if isinstance(c, OrderedDict):
+                if type(c) is not OrderedDict:
+                    c = OrderedDict(c)
+            else:
+                if type(c) is not dict:
+                    c = dict(c)
+        return c
+
+    # inplace convert any ndarrays to lists
+    def _walk_json(data, prefix=[]):
+        items = None
+        if isinstance(data, list):
+            items = enumerate(data)
+        elif isinstance(data, tuple):
+            items = enumerate(data)
+        elif isinstance(data, dict):
+            items = data.items()
+        else:
+            raise TypeError(type(data))
+
+        root = prefix
+        level = {}
+        for key, value in items:
+            level[key] = value
+
+        # yield a dict so the user can choose to not walk down a path
+        yield root, level
+
+        for key, value in level.items():
+            if isinstance(value, (dict, list, tuple)):
+                path = prefix + [key]
+                for _ in _walk_json(value, prefix=path):
+                    yield _
+
+    def _convert(dict_, root, key, new_value):
+        d = dict_
+        for k in root:
+            d = d[k]
+        d[key] = new_value
+
+    def _flatmap(func, data):
+        if isinstance(data, list):
+            return [_flatmap(func, item) for item in data]
+        else:
+            return func(data)
+
+    to_convert = []
+    for root, level in ub.ProgIter(_walk_json(dict_), desc='walk json',
+                                   verbose=verbose):
+        for key, value in level.items():
+            if isinstance(value, tuple):
+                # Convert tuples on the fly so they become mutable
+                new_value = list(value)
+                _convert(dict_, root, key, new_value)
+            elif isinstance(value, np.ndarray):
+                new_value = value.tolist()
+                if 0:
+                    if len(value.shape) == 1:
+                        if value.dtype.kind in {'i', 'u'}:
+                            new_value = list(map(int, new_value))
+                        elif value.dtype.kind in {'f'}:
+                            new_value = list(map(float, new_value))
+                        elif value.dtype.kind in {'c'}:
+                            new_value = list(map(complex, new_value))
+                        else:
+                            pass
+                else:
+                    if value.dtype.kind in {'i', 'u'}:
+                        new_value = _flatmap(int, new_value)
+                    elif value.dtype.kind in {'f'}:
+                        new_value = _flatmap(float, new_value)
+                    elif value.dtype.kind in {'c'}:
+                        new_value = _flatmap(complex, new_value)
+                    else:
+                        pass
+                        # raise TypeError(value.dtype)
+                to_convert.append((root, key, new_value))
+            elif isinstance(value, torch.Tensor):
+                new_value = value.data.cpu().numpy().tolist()
+                to_convert.append((root, key, new_value))
+            elif isinstance(value, (np.int16, np.int32, np.int64,
+                                    np.uint16, np.uint32, np.uint64)):
+                new_value = int(value)
+                to_convert.append((root, key, new_value))
+            elif isinstance(value, (np.float32, np.float64)):
+                new_value = float(value)
+                to_convert.append((root, key, new_value))
+            elif isinstance(value, (np.complex64, np.complex128)):
+                new_value = complex(value)
+                to_convert.append((root, key, new_value))
+            elif hasattr(value, '__json__'):
+                new_value = value.__json__()
+                to_convert.append((root, key, new_value))
+            elif normalize_containers:
+                if isinstance(value, dict):
+                    new_value = _norm_container(value)
+                    to_convert.append((root, key, new_value))
+
+    for root, key, new_value in to_convert:
+        _convert(dict_, root, key, new_value)
+
+    if normalize_containers:
+        # normalize the outer layer
+        dict_ = _norm_container(dict_)
+    return dict_
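One detail of ensure_json_serializable worth calling out: besides the numpy
and torch conversions, it checks hasattr(value, '__json__') and substitutes
the return value of that method, so custom types can opt in to serialization.
A small sketch (the MyPoint class here is hypothetical, not part of netharn):

    import json
    from netharn.util.util_json import ensure_json_serializable

    class MyPoint(object):
        def __init__(self, x, y):
            self.x, self.y = x, y

        def __json__(self):
            # ensure_json_serializable calls this as it walks the container
            return {'x': self.x, 'y': self.y}

    data = {'pt': MyPoint(1, 2)}
    print(json.dumps(ensure_json_serializable(data)))
    # {"pt": {"x": 1, "y": 2}}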
diff --git a/netharn/util/util_misc.py b/netharn/util/util_misc.py
index 0ac2d320493192e8d9a4a60ed039f92d2b2bf9d9..2611363830205560d073fb5af23e01708b3048fa 100644
--- a/netharn/util/util_misc.py
+++ b/netharn/util/util_misc.py
@@ -106,3 +106,158 @@ def strip_ansi(text):
     ansi_escape3 = re.compile(r'(\x9B|\x1B\[)[0-?]*[ -/]*[@-~]', flags=re.IGNORECASE)
     text = ansi_escape3.sub('', text)
     return text
+
+
+def align(text, character='=', replchar=None, pos=0):
+    r"""
+    Left justifies text on the left side of a character
+
+    Args:
+        text (str): text to align
+        character (str): character to align at
+        replchar (str): replacement character (default=None)
+
+    Returns:
+        str: new_text
+
+    Example:
+        >>> character = '='
+        >>> text = 'a = b=\none = two\nthree = fish\n'
+        >>> print(text)
+        >>> result = (align(text, '='))
+        >>> print(result)
+        a     = b=
+        one   = two
+        three = fish
+    """
+    line_list = text.splitlines()
+    new_lines = align_lines(line_list, character, replchar, pos=pos)
+    new_text = '\n'.join(new_lines)
+    return new_text
+
+
+def align_lines(line_list, character='=', replchar=None, pos=0):
+    r"""
+    Left justifies text on the left side of a character
+
+    TODO:
+        clean up and move to ubelt?
+
+    Args:
+        line_list (list of strs):
+        character (str):
+        pos (int or list or None): index of the occurrence of `character`
+            to align on. Can be a list to align multiple occurrences.
+            If pos is None, then all occurrences are aligned.
+
+    Returns:
+        list: new_lines
+
+    Example:
+        >>> line_list = 'a = b\none = two\nthree = fish'.split('\n')
+        >>> character = '='
+        >>> new_lines = align_lines(line_list, character)
+        >>> result = ('\n'.join(new_lines))
+        >>> print(result)
+        a     = b
+        one   = two
+        three = fish
+
+    Example:
+        >>> line_list = 'foofish:\n a = b\n one = two\n three = fish'.split('\n')
+        >>> character = '='
+        >>> new_lines = align_lines(line_list, character)
+        >>> result = ('\n'.join(new_lines))
+        >>> print(result)
+        foofish:
+         a     = b
+         one   = two
+         three = fish
+
+    Example:
+        >>> import ubelt as ub
+        >>> character = ':'
+        >>> text = ub.codeblock('''
+            {'max': '1970/01/01 02:30:13',
+             'mean': '1970/01/01 01:10:15',
+             'min': '1970/01/01 00:01:41',
+             'range': '2:28:32',
+             'std': '1:13:57',}''').split('\n')
+        >>> new_lines = align_lines(text, ':', ' :')
+        >>> result = '\n'.join(new_lines)
+        >>> print(result)
+        {'max'   : '1970/01/01 02:30:13',
+         'mean'  : '1970/01/01 01:10:15',
+         'min'   : '1970/01/01 00:01:41',
+         'range' : '2:28:32',
+         'std'   : '1:13:57',}
+
+    Example:
+        >>> line_list = 'foofish:\n a = b = c\n one = two = three\nthree=4= fish'.split('\n')
+        >>> character = '='
+        >>> # align the second occurrence of a character
+        >>> new_lines = align_lines(line_list, character, pos=None)
+        >>> print(('\n'.join(line_list)))
+        >>> result = ('\n'.join(new_lines))
+        >>> print(result)
+        foofish:
+         a   = b   = c
+         one = two = three
+        three=4    = fish
+    """
+
+    # FIXME: continue to fix ansi
+    if pos is None:
+        # Align all occurrences
+        num_pos = max([line.count(character) for line in line_list])
+        pos = list(range(num_pos))
+
+    # Allow multiple alignments
+    if isinstance(pos, list):
+        pos_list = pos
+        # recursive calls
+        new_lines = line_list
+        for pos in pos_list:
+            new_lines = align_lines(new_lines, character=character,
+                                    replchar=replchar, pos=pos)
+        return new_lines
+
+    # base case
+    if replchar is None:
+        replchar = character
+
+    # the pos-th character to align
+    lpos = pos
+    rpos = lpos + 1
+
+    tup_list = [line.split(character) for line in line_list]
+
+    handle_ansi = True
+    if handle_ansi:
+        # Remove ansi from length calculation
+        # References: http://stackoverflow.com/questions/14693701remove-ansi
+        ansi_escape = re.compile(r'\x1b[^m]*m')
+
+    # Find how much padding is needed
+    maxlen = 0
+    for tup in tup_list:
+        if len(tup) >= rpos + 1:
+            if handle_ansi:
+                tup = [ansi_escape.sub('', x) for x in tup]
+            left_lenlist = list(map(len, tup[0:rpos]))
+            left_len = sum(left_lenlist) + lpos * len(replchar)
+            maxlen = max(maxlen, left_len)
+
+    # Pad each line to align the pos-th occurrence of the chosen character
+    new_lines = []
+    for tup in tup_list:
+        if len(tup) >= rpos + 1:
+            lhs = character.join(tup[0:rpos])
+            rhs = character.join(tup[rpos:])
+            # pad the new line with requested justification
+            newline = lhs.ljust(maxlen) + replchar + rhs
+            new_lines.append(newline)
+        else:
+            new_lines.append(replchar.join(tup))
+    return new_lines
diff --git a/netharn/util/util_torch.py b/netharn/util/util_torch.py
index 94b56dddb9da7999a3342cd0331a936587feaa34..e244513f8b44fdd1313cb3aa7e6b3d5ea0011466
--- a/netharn/util/util_torch.py
+++ b/netharn/util/util_torch.py
@@ -10,6 +10,18 @@ class ModuleMixin(object):
     """
     Adds convenience functions to a torch module
     """
+
+    def trainable_layers(self, names=False):
+        """
+        Get the layers netharn identifies as "trainable"
+
+        Example:
+            >>> import torchvision
+            >>> model = torchvision.models.AlexNet()
+            >>> list(ModuleMixin.trainable_layers(model, names=True))
+        """
+        return trainable_layers(self, names=names)
+
     def number_of_parameters(self, trainable=True):
         """
         Tally the number of model parameters.
@@ -310,7 +322,7 @@ def trainable_layers(model, names=False):
     """
     Note:
         This was moved to netharn.initializers.functional.
-        Is this still needed in util?
+        Move it back here, or do some other refactoring.
 
     Example:
         >>> import torchvision
diff --git a/requirements/optional.txt b/requirements/optional.txt
index 77efca010e08bef7f1fc1628557650d245f0118d..0d5296d3a8b8f887ea61b277694428c46bd3a9cc
--- a/requirements/optional.txt
+++ b/requirements/optional.txt
@@ -20,3 +20,5 @@ ndsampler >= 0.5.0
 
 # pyqt5>= 5.11.2;python_version>'2.7'
 # colormath
+
+torch-optimizer >= 0.0.1a9 ;python_version>='3.6'
diff --git a/requirements/runtime.txt b/requirements/runtime.txt
index 176d0bba53073668f54ebad3cf51d35683e6e4b9..39edb49259b60babcb017d3e881a2ef18178e5f3
--- a/requirements/runtime.txt
+++ b/requirements/runtime.txt
@@ -1,5 +1,13 @@
+# Note: torch is not published on pypi for windows
+# We could add the line:
+# --find-links=https://download.pytorch.org/whl/torch_stable.html
+# but that doesn't play well with setuptools. The current recommendation is to
+# install torch yourself on windows. Hopefully there will be a better alternative
+# in the future.
+
 torch >= 1.0.0
 torchvision >= 0.2.0
+six >= 1.11.0
 numpy >= 1.9.0
 ubelt >= 0.8.4
diff --git a/run_developer_setup.sh b/run_developer_setup.sh
index 7c7592f18599a6e20d5b9a415173b6aab19a12c4..e4c4923251055dbc0254ec9e814ccabbc8b35b77 100755
--- a/run_developer_setup.sh
+++ b/run_developer_setup.sh
@@ -4,4 +4,5 @@ pip install -r requirements.txt
 
 # Install netharn in developer mode
 #pip install -e .
-python setup.py clean && python setup.py develop
+#python setup.py clean && python setup.py develop
+pip install -e .
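As the new requirements/runtime.txt comment notes, Windows users need to bring
their own torch before installing netharn. One way to do that (a sketch, not
an officially supported path) is to point pip at the wheel index mentioned in
that comment and then install netharn in developer mode:

    pip install torch torchvision --find-links=https://download.pytorch.org/whl/torch_stable.html
    pip install -e .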
diff --git a/setup.py b/setup.py
index 05602610a5f0e5ce14568d8296df9a1c1378352f..6be4f3cd694fa2d973bb7f4ca13d32f7e85297f1 100755
--- a/setup.py
+++ b/setup.py
@@ -67,6 +67,9 @@ def parse_requirements(fname='requirements.txt', with_version=False):
     Returns:
         List[str]: list of requirements items
 
+    References:
+        https://pip.readthedocs.io/en/1.1/requirements.html
+
     CommandLine:
         python -c "import setup; print(setup.parse_requirements())"
         python -c "import setup; print(chr(10).join(setup.parse_requirements(with_version=True)))"
@@ -79,7 +82,12 @@ def parse_requirements(fname='requirements.txt', with_version=False):
         """
         Parse information from a line in a requirements text file
         """
-        if line.startswith('-r '):
+        if line.startswith(('-f ', '--find-links ', '--index-url ')):
+            import warnings
+            warnings.warn(
+                'requirements file specified alternative index urls, but '
+                'there is currently no way to support this in setuptools')
+        elif line.startswith('-r '):
             # Allow specifying requirements in other files
             new_fname = line.split(' ')[1]
             new_fpath = join(base, new_fname)
@@ -248,12 +256,15 @@ if __name__ == '__main__':
             # https://pypi.python.org/pypi?%3Aaction=list_classifiers
             'Development Status :: 4 - Beta',
             'Intended Audience :: Developers',
+            'Intended Audience :: Science/Research',
+            'Topic :: Scientific/Engineering',
+            'Topic :: Scientific/Engineering :: Artificial Intelligence',
+            'Topic :: Software Development',
             'Topic :: Software Development :: Libraries :: Python Modules',
             'Topic :: Utilities',
             # This should be interpreted as Apache License v2.0
             'License :: OSI Approved :: Apache Software License',
             # Supported Python versions
-            'Programming Language :: Python :: 2.7',
             'Programming Language :: Python :: 3',
         ],
     )
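The easiest way to sanity-check the new parse_requirements guard is the
command already listed in its docstring; run from the repo root:

    python -c "import setup; print(setup.parse_requirements())"

With the guard in place, a --find-links or --index-url line in a requirements
file now emits a warning and is skipped, instead of being passed through to
setuptools as a malformed requirement.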
diff --git a/super_setup.py b/super_setup.py
index 8234cdcbdd550bf060d06ee6897957a882702b95..21c4f55ead2651d1aec41d511577c44f01f716d6 100755
--- a/super_setup.py
+++ b/super_setup.py
@@ -296,8 +296,12 @@ class Repo(ub.NiceRepr):
         Args:
             protocol (str): can be ssh or https
         """
+        # Update base url to use the requested protocol
         gurl = GitURL(self.url)
         self.url = gurl.format(protocol)
+        # Update all remote urls to use the requested protocol
+        for key in list(self.remotes.keys()):
+            self.remotes[key] = GitURL(self.remotes[key]).format(protocol)
 
     def info(repo, msg):
         repo._logged_lines.append(('INFO', 'INFO: ' + msg))
@@ -346,12 +350,21 @@ class Repo(ub.NiceRepr):
         return repo._pygit
 
     def develop(repo):
-        devsetup_script_fpath = join(repo.dpath, 'run_developer_setup.sh')
-        if not exists(devsetup_script_fpath):
-            raise AssertionError('Assume we always have run_developer_setup.sh: repo={!r}'.format(repo))
-        repo._cmd(devsetup_script_fpath, cwd=repo.dpath)
+        if ub.WIN32:
+            # We can't run a shell file on win32, so let's hope this works
+            import warnings
+            warnings.warn('super_setup develop may not work on win32')
+            repo._cmd('pip install -e .', cwd=repo.dpath)
+        else:
+            devsetup_script_fpath = join(repo.dpath, 'run_developer_setup.sh')
+            if not exists(devsetup_script_fpath):
+                raise AssertionError('Assume we always have run_developer_setup.sh: repo={!r}'.format(repo))
+            repo._cmd(devsetup_script_fpath, cwd=repo.dpath)
 
     def doctest(repo):
+        if ub.WIN32:
+            raise NotImplementedError('doctest does not yet work on windows')
+
         devsetup_script_fpath = join(repo.dpath, 'run_doctests.sh')
         if not exists(devsetup_script_fpath):
             raise AssertionError('Assume we always have run_doctests.sh: repo={!r}'.format(repo))
@@ -534,6 +547,11 @@ class Repo(ub.NiceRepr):
 
     def pull(repo):
         repo._assert_clean()
+        # TODO: In past runs I've gotten the error:
+        #     Your configuration specifies to merge with the ref
+        #     'refs/heads/dev/0.0.2' from the remote, but no such ref was fetched.
+        # Doing an ensure seemed to fix it. We should do something to handle
+        # this case elegantly.
         repo._cmd('git pull')
 
     def status(repo):
@@ -668,11 +686,11 @@ def make_netharn_registry():
 
         # The util libs
        CommonRepo(
-            name='kwarray', branch='dev/0.5.4', remote='public',
+            name='kwarray', branch='dev/0.5.7', remote='public',
             remotes={'public': 'git@gitlab.kitware.com:computer-vision/kwarray.git'},
         ),
         CommonRepo(
-            name='kwimage', branch='dev/0.6.0', remote='public',
+            name='kwimage', branch='dev/0.6.2', remote='public',
             remotes={'public': 'git@gitlab.kitware.com:computer-vision/kwimage.git'},
         ),
         # CommonRepo(  # TODO
@@ -680,24 +698,38 @@
         #     remotes={'public': 'git@gitlab.kitware.com:computer-vision/kwannot.git'},
         # ),
         CommonRepo(
-            name='kwplot', branch='dev/0.4.3', remote='public',
+            name='kwcoco', branch='dev/0.1.1', remote='public',
+            remotes={'public': 'git@gitlab.kitware.com:computer-vision/kwcoco.git'},
+        ),
+        CommonRepo(
+            name='kwplot', branch='dev/0.4.4', remote='public',
             remotes={'public': 'git@gitlab.kitware.com:computer-vision/kwplot.git'},
         ),
+
+        # Pytorch deployer / exporter
+        CommonRepo(
+            name='liberator', branch='dev/0.0.2', remote='public',
+            remotes={'public': 'git@gitlab.kitware.com:python/liberator.git'},
+        ),
+        CommonRepo(
+            name='torch_liberator', branch='dev/0.0.3', remote='public',
+            remotes={'public': 'git@gitlab.kitware.com:computer-vision/torch_liberator.git'},
+        ),
+
+        # For example data and CLI
         CommonRepo(
-            name='scriptconfig', branch='dev/0.5.4', remote='public',
+            name='scriptconfig', branch='dev/0.5.6', remote='public',
             remotes={'public': 'git@gitlab.kitware.com:utils/scriptconfig.git'},
         ),
         CommonRepo(
-            name='ndsampler', branch='dev/0.5.4', remote='public',
+            name='ndsampler', branch='dev/0.5.8', remote='public',
             remotes={'public': 'git@gitlab.kitware.com:computer-vision/ndsampler.git'},
         ),
 
         # netharn - training harness
         CommonRepo(
-            name='netharn', branch='dev/0.5.4', remote='public',
+            name='netharn', branch='dev/0.5.5', remote='public',
             remotes={'public': 'git@gitlab.kitware.com:computer-vision/netharn.git'},
         ),
     ]
@@ -812,6 +844,21 @@ def main():
     cli_group()
 
 
+_DOCKER_DEBUGGING = """
+DOCKER_IMAGE=circleci/python
+docker run -v $PWD:/io --rm -it $DOCKER_IMAGE bash
+
+mkdir -p $HOME/code
+cd $HOME/code
+git clone -b dev/0.5.5 https://gitlab.kitware.com/computer-vision/netharn.git
+cd $HOME/code/netharn
+
+pip install -r requirements/super_setup.txt
+python super_setup.py ensure --serial
+
+"""
+
+
 if __name__ == '__main__':
     """
     For autocomplete you must run in bash