Source code for tensorpack.callbacks.inference_runner

# -*- coding: utf-8 -*-
# File: inference_runner.py


import itertools
import sys
from contextlib import contextmanager
import tqdm
from tensorflow.python.training.monitored_session import _HookedSession as HookedSession

from ..compat import tfv1 as tf
from ..dataflow.base import DataFlow
from ..input_source import FeedInput, InputSource, QueueInput, StagingInput
from ..tfutils.tower import PredictTowerContext
from ..utils import logger
from ..utils.utils import get_tqdm_kwargs
from .base import Callback
from .group import Callbacks
from .inference import Inferencer

__all__ = ['InferenceRunnerBase', 'InferenceRunner',
           'DataParallelInferenceRunner']


def _device_from_int(dev):
    return '/gpu:{}'.format(dev) if dev >= 0 else '/cpu:0'


class InferencerToHook(tf.train.SessionRunHook):
    """ Wrap an :class:`Inferencer` in a ``SessionRunHook``: its fetches are
        added to every ``run`` call and the results are fed back to it. """
    def __init__(self, inf, fetches):
        self._inf = inf
        self._fetches = fetches

    def before_run(self, _):
        return tf.train.SessionRunArgs(fetches=self._fetches)

    def after_run(self, _, run_values):
        self._inf.on_fetches(run_values.results)


@contextmanager
def _inference_context():
    msg = "You might need to check your input implementation."
    try:
        yield
    except (StopIteration, tf.errors.CancelledError):
        logger.error(
            "[InferenceRunner] input stopped before reaching its __len__()! " + msg)
        raise
    except tf.errors.OutOfRangeError:   # tf.data reaches an end
        pass


class InferenceRunnerBase(Callback):
    """ Base class for inference runner.

    Note:

    1. InferenceRunner will use `input.size()` to determine how many iterations to run,
       so you're responsible for ensuring that `input.size()` is accurate.

    2. Only works with instances of `TowerTrainer`.
    """

    def __init__(self, input, infs):
        """
        Args:
            input (InputSource): the input to use. Must have an accurate ``size()``.
            infs (list[Inferencer]): list of :class:`Inferencer` to run.
        """
        self._input_source = input
        if not isinstance(infs, list):
            self.infs = [infs]
        else:
            self.infs = infs
        for v in self.infs:
            assert isinstance(v, Inferencer), v
        try:
            self._size = input.size()
        except NotImplementedError:
            self._size = 0

        self._hooks = []

    def register_hook(self, hook):
        """
        Args:
            hook (tf.train.SessionRunHook):
        """
        self._hooks.append(hook)

    def _before_train(self):
        self._hooked_sess = HookedSession(self.trainer.sess, self._hooks)
        self._input_callbacks.before_train()
        if self._size > 0:
            logger.info("[InferenceRunner] Will eval {} iterations".format(self._size))
        else:
            logger.warn("[InferenceRunner] Got an InputSource with unknown size! Will iterate until OutOfRangeError!")

    def _after_train(self):
        self._input_callbacks.after_train()
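
# Usage sketch (illustrative, not part of the library source): extra
# `tf.train.SessionRunHook`s can be attached via `register_hook` and will run
# alongside the hooks built from the inferencers. `val_df` and `CountRunsHook`
# below are hypothetical:
#
#     class CountRunsHook(tf.train.SessionRunHook):
#         def __init__(self):
#             self.count = 0
#
#         def after_run(self, run_context, run_values):
#             self.count += 1
#
#     runner = InferenceRunner(val_df, [ScalarStats('total_cost')])
#     runner.register_hook(CountRunsHook())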


class InferenceRunner(InferenceRunnerBase):
    """
    A callback that runs a list of :class:`Inferencer` on some :class:`InputSource`.
    """

    def __init__(self, input, infs, tower_name='InferenceTower',
                 tower_func=None, device=0):
        """
        Args:
            input (InputSource or DataFlow): The :class:`InputSource` to run
                inference on. If given a DataFlow, it will be wrapped in :class:`FeedInput`.
            infs (list): a list of :class:`Inferencer` instances.
            tower_name (str): the name scope of the tower to build.
                If multiple InferenceRunner are used, each needs a different tower_name.
            tower_func (tfutils.TowerFunc or None): the tower function to be used to
                build the graph. Defaults to `trainer.tower_func`, called under a
                `training=False` TowerContext, but you can change it to a different
                tower function if you need to run inference with several different graphs.
            device (int): the device to use
        """
        if isinstance(input, DataFlow):
            # use infinite=False so that a dataflow without size will stop normally
            # TODO a better way to handle inference size
            input = FeedInput(input, infinite=False)
        assert isinstance(input, InputSource), input
        assert not isinstance(input, StagingInput), input

        self._tower_name = tower_name
        self._device_id = device
        self._device = _device_from_int(device)
        self._tower_func = tower_func
        super(InferenceRunner, self).__init__(input, infs)

    def _build_hook(self, inf):
        out_names = inf.get_fetches()
        fetches = self._tower_handle.get_tensors(out_names)
        return InferencerToHook(inf, fetches)

    def _setup_graph(self):
        if self._tower_func is None:
            assert self.trainer.tower_func is not None, "You must set tower_func of the trainer to use InferenceRunner!"
            self._tower_func = self.trainer.tower_func
        input_callbacks = self._input_source.setup(self._tower_func.input_signature)

        vs_name = self.trainer._vs_name_for_predictor(self._device_id)
        logger.info("[InferenceRunner] Building tower '{}' on device {} {}...".format(
            self._tower_name, self._device,
            "with variable scope '{}'".format(vs_name) if vs_name else ''))
        with tf.variable_scope(tf.get_variable_scope(), reuse=True), \
                tf.device(self._device), \
                PredictTowerContext(self._tower_name, vs_name=vs_name):
            self._tower_func(*self._input_source.get_input_tensors())
            self._tower_handle = self._tower_func.towers[-1]

        for h in [self._build_hook(inf) for inf in self.infs]:
            self.register_hook(h)
        # trigger_{step,epoch}, {before,after}_epoch are ignored.
        # We assume that InputSource callbacks won't use these methods.
        self._input_callbacks = Callbacks(input_callbacks)
        for h in self._input_callbacks.get_hooks():
            self.register_hook(h)

        for inf in self.infs:
            inf.setup_graph(self.trainer)
        self._input_callbacks.setup_graph(self.trainer)

    def _trigger(self):
        for inf in self.infs:
            inf.before_epoch()

        self._input_source.reset_state()
        # iterate over the data, and run the hooked session
        with _inference_context(), \
                tqdm.tqdm(total=self._size, **get_tqdm_kwargs()) as pbar:
            num_itr = self._size if self._size > 0 else sys.maxsize
            for _ in range(num_itr):
                self._hooked_sess.run(fetches=[])
                pbar.update()
        for inf in self.infs:
            inf.trigger_epoch()
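
# Usage sketch (illustrative; `val_df` is an assumed validation DataFlow, and
# `ScalarStats` is the stock Inferencer that averages scalar tensors over the
# whole input):
#
#     from tensorpack.callbacks import InferenceRunner, ScalarStats
#
#     callbacks = [
#         InferenceRunner(val_df, [ScalarStats('total_cost')]),
#     ]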


class DataParallelInferenceRunner(InferenceRunnerBase):
    """
    Inference with data-parallel support on multiple GPUs.
    It will build one predict tower on each GPU, and run prediction
    with a large total batch in parallel on all GPUs.
    It will run the remainder (when the total size of input is not a multiple of #GPU)
    sequentially.
    """

    def __init__(self, input, infs, gpus, tower_name='InferenceTower', tower_func=None):
        """
        Args:
            input (DataFlow or QueueInput)
            gpus (int or list[int]): #gpus, or list of GPU id
            tower_name (str): the name scope of the tower to build.
                If multiple InferenceRunner are used, each needs a different tower_name.
            tower_func (tfutils.TowerFunc or None): the tower function to be used to
                build the graph. The tower function will be called under a
                `training=False` TowerContext. The default is `trainer.tower_func`,
                but you can change it to a different tower function if you need to
                run inference with several different models.
        """
        if isinstance(gpus, int):
            gpus = list(range(gpus))
        self._devices = [_device_from_int(k) for k in gpus]
        self._tower_names = ['{}{}'.format(tower_name, k) for k in range(len(gpus))]

        if isinstance(input, DataFlow):
            input = QueueInput(input)
        assert isinstance(input, QueueInput), input
        super(DataParallelInferenceRunner, self).__init__(input, infs)
        assert self._size > 0, "Input for DataParallelInferenceRunner must have a size!"

        self._hooks = []
        self._hooks_parallel = []
        self._tower_func = tower_func

    def _setup_graph(self):
        self._handles = []

        if self._tower_func is None:
            assert self.trainer.tower_func is not None, "You must set tower_func of the trainer to use InferenceRunner!"
            self._tower_func = self.trainer.tower_func
        input_callbacks = self._input_source.setup(self._tower_func.input_signature)
        with tf.variable_scope(tf.get_variable_scope(), reuse=True):
            for idx, dev in enumerate(self._devices):
                vs_name = self.trainer._vs_name_for_predictor(idx)
                with tf.device(dev), PredictTowerContext(
                        self._tower_names[idx], vs_name=vs_name):
                    logger.info("[InferenceRunner] Building tower '{}' on device {} {}...".format(
                        self._tower_names[idx], dev,
                        "with variable scope '{}'".format(vs_name) if vs_name else ''))
                    # TODO log for tower creation, here or in tower.py?
                    self._tower_func(*self._input_source.get_input_tensors())
                    self._handles.append(self._tower_func.towers[-1])

        # setup callbacks and hooks
        self._input_callbacks = Callbacks(input_callbacks)

        # TODO InputSource might have hooks which break us.
        # e.g. hooks from StagingInput will force the consumption
        # of nr_tower datapoints in every run.
        input_hooks = self._input_callbacks.get_hooks()
        self._hooks.extend([self._build_hook(inf) for inf in self.infs] + input_hooks)
        self._hooks_parallel.extend([self._build_hook_parallel(inf) for inf in self.infs] + input_hooks)

        for inf in self.infs:
            inf.setup_graph(self.trainer)
        self._input_callbacks.setup_graph(self.trainer)

    def register_hook(self, h):
        logger.info(
            "[DataParallelInferenceRunner] Registering hook {} on both parallel and sequential inference.".format(h))
        self._hooks.append(h)
        self._hooks_parallel.append(h)

    class _InferencerToHookDataParallel(InferencerToHook):
        def __init__(self, inf, fetches, size):
            """
            Args:
                size (int): number of tensors to fetch per tower
            """
            super(DataParallelInferenceRunner._InferencerToHookDataParallel, self).__init__(inf, fetches)
            assert len(self._fetches) % size == 0
            self._sz = size

        def after_run(self, _, run_values):
            res = run_values.results
            for i in range(0, len(res), self._sz):
                vals = res[i:i + self._sz]
                self._inf.on_fetches(vals)

    def _build_hook_parallel(self, inf):
        out_names = inf.get_fetches()
        sz = len(out_names)
        fetches = list(itertools.chain(*[t.get_tensors(out_names) for t in self._handles]))
        return self._InferencerToHookDataParallel(inf, fetches, sz)

    def _build_hook(self, inf):
        out_names = inf.get_fetches()
        fetches = self._handles[0].get_tensors(out_names)
        return InferencerToHook(inf, fetches)

    def _before_train(self):
        super(DataParallelInferenceRunner, self)._before_train()
        self._parallel_hooked_sess = HookedSession(self.trainer.sess, self._hooks_parallel)

    def _trigger(self):
        for inf in self.infs:
            inf.before_epoch()

        total = self._size
        nr_tower = len(self._devices)
        self._input_source.reset_state()
        with _inference_context():
            with tqdm.tqdm(total=total, **get_tqdm_kwargs()) as pbar:
                while total >= nr_tower:
                    self._parallel_hooked_sess.run(fetches=[])
                    pbar.update(nr_tower)
                    total -= nr_tower
                # take care of the rest
                for _ in range(total):
                    self._hooked_sess.run(fetches=[])
                    pbar.update(1)
        for inf in self.infs:
            inf.trigger_epoch()
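
# Usage sketch (illustrative; `val_df` is an assumed DataFlow with a known
# size). On GPUs 0 and 1, full pairs of datapoints run through the two
# predict towers in parallel, and the remainder (size % 2) runs sequentially
# on the first tower:
#
#     runner = DataParallelInferenceRunner(
#         val_df, [ScalarStats('total_cost')], gpus=[0, 1])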