# Source code for finetune.config

import logging
import os
import os.path
import subprocess
import warnings
from collections import namedtuple
from functools import lru_cache

import numpy as np
from nltk.metrics.distance import edit_distance

import finetune
from finetune.errors import FinetuneError
from finetune.base_models import GPTModel, GPT2Model, BERT

# Module-level logger, obtained under the name 'finetune'.
LOGGER = logging.getLogger('finetune')


def finetune_model_path(path):
    """
    Resolve `path` against the `model` directory located next to the installed `finetune` package.

    :param path: Path fragment appended to `<finetune package directory>/model`.
    :return: Absolute, normalized filesystem path.
    """
    package_dir = os.path.dirname(finetune.__file__)
    joined = os.path.join(package_dir, 'model', path)
    return os.path.abspath(joined)


def nvidia_device_ids():
    """
    Run `nvidia-smi -L` and map each GPU index to its description.

    Each output line is expected to look like
    `GPU 0: GeForce GTX 980 (UUID: GPU-...)`.

    :return: Dict mapping the integer GPU index to the text that follows the first `:` on
        that GPU's line (returned exactly as printed, including the leading space).
    :raises OSError: if the `nvidia-smi` executable cannot be launched.
    :raises ValueError: if `nvidia-smi` prints nothing, or a line does not carry the
        expected sequential GPU index.
    """
    # Pass an argument vector and skip the shell: a one-element list combined with
    # `shell=True` only behaves as intended on POSIX.
    sp = subprocess.Popen(
        ['nvidia-smi', '-L'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    response = sp.communicate()[0]
    listing = response.decode('utf-8').strip()
    if not listing:
        raise ValueError("`nvidia-smi -L` did not list any GPUs.")

    device_ids = {}
    for i, gpu in enumerate(listing.splitlines()):
        # May be worth logging GPU description
        device_id_str, _, description = gpu.partition(':')
        # Explicit check rather than `assert`, which is stripped under `python -O`.
        if int(device_id_str.split(' ')[-1]) != i:
            raise ValueError(
                "Unexpected line in `nvidia-smi -L` output: {!r}".format(gpu)
            )
        device_ids[i] = description
    return device_ids


@lru_cache()
def all_gpus(visible_gpus=None):
    """
    Get integer ids of all available GPUs.

    Sample response from nvidia-smi -L:
        GPU 0: GeForce GTX 980 (UUID: GPU-2d683060-957f-d5ad-123c-a5b49b0116d9)
        GPU 1: GeForce GTX 980 (UUID: GPU-7b8496dc-3eaf-8db7-01e7-c4a884f66acf)
        GPU 2: GeForce GTX TITAN X (UUID: GPU-9e01f108-e7de-becd-2589-966dcc1c778f)

    The result is memoized with `lru_cache`, which has three consequences:

    * `visible_gpus` must be hashable (e.g. a tuple rather than a list);
    * the value of `CUDA_VISIBLE_DEVICES` is only read on the first call for a given argument;
    * the returned list is the cached object, so callers should not mutate it.

    :param visible_gpus: Optional iterable of GPU ids (anything accepted by `int`) used to
        restrict the result. `None` applies no restriction.
    :return: List of integer device ids. When `CUDA_VISIBLE_DEVICES` is set, the ids are
        re-numbered from 0 in ascending order of the physical ids that remain visible.
        An empty list is returned (with a warning) if GPU discovery fails.
    """
    if visible_gpus is not None:
        visible_gpus = [int(gpu) for gpu in visible_gpus]
    try:
        cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
        device_ids = nvidia_device_ids()
        mapping = None
        # restricting GPUs based on env vars
        if cuda_visible_devices:
            # Parse the variable once, tolerating whitespace around the commas.
            env_device_ids = {
                entry.strip() for entry in cuda_visible_devices.split(',')
            }
            device_ids = {
                device_id: description
                for device_id, description in device_ids.items()
                if str(device_id) in env_device_ids
            }
            mapping = {dev_id: i for i, dev_id in enumerate(sorted(device_ids))}

        # restricting GPUs based on config
        if visible_gpus is not None:
            device_ids = {
                device_id: description
                for device_id, description in device_ids.items()
                if device_id in visible_gpus
            }

        LOGGER.info(" Visible GPUs: {{{}}}".format(
            ", ".join([
                "{}:{}".format(device_id, description.split('(')[0]).strip()
                for device_id, description in device_ids.items()
            ])
        ))

        if mapping is not None:
            # Resolve these to internal tensorflow device ids.
            # These are equivalent if no visible_devices masking is used
            device_ids = {
                mapping[device_id]: description
                for device_id, description in device_ids.items()
            }

        device_ids = list(device_ids.keys())
    except Exception:
        # Failed to parse out available GPUs properly.  `Exception` (not a bare `except`)
        # so that KeyboardInterrupt / SystemExit still propagate.
        warnings.warn("Failed to find available GPUS.  Falling back to CPU only mode.")
        device_ids = []

    return device_ids

# Pairs a setting's `default` value with an `iterator` of alternative values.
GridSearchable = namedtuple("GridSearchable", ["default", "iterator"])


class Settings(dict):
    """
    Model configuration options

    Settings are stored as dictionary items and are also readable / writable as attributes.
    Assigning a `GridSearchable` stores its `default` as the setting value and records its
    `iterator` under the same key in `grid_searchable`.

    :param batch_size: Number of examples per batch, defaults to `2`.
    :param visible_gpus: List of integer GPU ids to spread out computation across, defaults to all available GPUs.
    :param n_epochs: Number of iterations through training data, defaults to `3`.
    :param seed: Random seed to use for repeatability purposes, defaults to `42`.
    :param max_length:  Maximum number of subtokens per sequence. Examples longer than this number will be truncated
        (unless `chunk_long_sequences=True` for SequenceLabeler models). Defaults to `512`.
    :param weight_stddev: Standard deviation of initial weights.  Defaults to `0.02`.
    :param chunk_long_sequences: When True, use a sliding window approach to predict on
        examples that are longer than max length.  Defaults to `False`.
    :param low_memory_mode: When True, only store partial gradients on forward pass
        and recompute remaining gradients incrementally in order to save memory.  Defaults to `False`.
    :param interpolate_pos_embed: Interpolate positional embeddings when `max_length` differs from its original value of
        `512`. Defaults to `False`.
    :param embed_p_drop: Embedding dropout probability.  Defaults to `0.1`.
    :param attn_p_drop: Attention dropout probability.  Defaults to `0.1`.
    :param resid_p_drop: Residual layer fully connected network dropout probability.  Defaults to `0.1`.
    :param clf_p_drop: Classifier dropout probability.  Defaults to `0.1`.
    :param l2_reg: L2 regularization coefficient. Defaults to `0.01`.
    :param vector_l2: Whether to apply weight decay regularization to vectors (biases, normalization etc..). Defaults to False.
    :param optimizer: Optimizer to use, current options include AdamW or AdamaxW.
    :param b1: Adam b1 parameter.  Defaults to `0.9`.
    :param b2: Adam b2 parameter.  Defaults to `0.999`.
    :param epsilon: Adam epsilon parameter: Defaults to `1e-8`.
    :param lr_schedule: Learning rate schedule -- see `finetune/optimizers.py` for more options.
    :param lr: Learning rate.  Defaults to `6.25e-5`.
    :param lr_warmup: Learning rate warmup (percentage of all batches to warmup for).  Defaults to `0.002`.
    :param max_grad_norm: Clip gradients larger than this norm. Defaults to `1.0`.
    :param accum_steps: Number of updates to accumulate before applying. This is used to simulate a higher batch size.
    :param lm_loss_coef: Language modeling loss coefficient -- a value between `0.0` - `1.0`
        that indicates how to trade off between language modeling loss
        and target model loss.  Usually not beneficial to turn on unless
        dataset size exceeds a few thousand examples.  Defaults to `0.0`.
    :param summarize_grads: Include gradient summary information in tensorboard.  Defaults to `False`.
    :param val_size: Validation set size if int. Validation set size as percentage of all training data if float.  Validation will not be run by default if n_examples < 50.
        If n_examples > 50, defaults to max(5, min(100, 0.05 * n_examples))
    :param val_interval: Evaluate on validation set after `val_interval` batches.
        Defaults to 4 * val_size / batch_size to ensure that too much time is not spent on validation.
    :param lm_temp: Language model temperature -- a value of `0.0` corresponds to greedy maximum likelihood predictions
        while a value of `1.0` corresponds to random predictions. Defaults to `0.2`.
    :param seq_num_heads: Number of attention heads of final attention layer. Defaults to `16`.
    :param subtoken_predictions: Return predictions at subtoken granularity or token granularity?  Defaults to `False`.
    :param multi_label_sequences: Use a multi-labeling approach to sequence labeling to allow overlapping labels.
    :param multi_label_threshold: Threshold of sigmoid unit in multi label classifier.
        Can be increased or lowered to trade off precision / recall. Defaults to `0.5`.
    :param autosave_path: Save current best model (as measured by validation loss) to this location. Defaults to `None`.
    :param tensorboard_folder: Directory for tensorboard logs. Tensorboard logs will not be written
        unless tensorboard_folder is explicitly provided. Defaults to `None`.
    :param log_device_placement: Log which device each operation is placed on for debugging purposes.  Defaults to `False`.
    :param soft_device_placement: Allow tf to allocate an operation to a different device if a device is unavailable.  Defaults to `True`.
    :param save_adam_vars: Save adam parameters when calling `model.save()`.  Defaults to `True`.
    :param num_layers_trained: How many layers to finetune.  Specifying a value less than 12 will train layers starting from model output. Defaults to `12`.
    :param train_embeddings: Should embedding layer be finetuned? Defaults to `True`.
    :param class_weights: One of 'log', 'linear', or 'sqrt'. Auto-scales gradient updates based on class frequency.  Can also be a dictionary that maps from true class name to loss coefficient. Defaults to `None`.
    :param oversample: Should rare classes be oversampled?  Defaults to `False`.
    :param params_device: Which device should gradient updates be aggregated on?
        If you are using a single GPU and have more than 4Gb of GPU memory you should set this to GPU PCI number (0, 1, 2, etc.). Defaults to `"cpu"`.
    :param eval_acc: if True, calculates accuracy and writes it to the tensorboard summary files for validation runs.
    :param save_dtype: specifies what precision to save model weights with.  Defaults to `np.float32`.
    :param regression_loss: the loss to use for regression models. One of `L1` or `L2`, defaults to `L2`.
    :param prefit_init: if True, fit target model weights before finetuning the entire model. Defaults to `False`.
    :param debugging_logs: if True, output tensorflow logs and turn off TQDM logging. Defaults to `False`.
    :param val_set: Where it is necessary to use an explicit validation set, provide it here as a tuple (text, labels)
    :param per_process_gpu_memory_fraction: fraction of the overall amount of memory that each visible GPU should be allocated, defaults to `0.95`.
    :param adapter_size: width of adapter module from 'Parameter Efficient Transfer Learning' paper, if defined. defaults to 'None'.
    """

    def get_grid_searchable(self):
        """Return the mapping of setting name to the `iterator` recorded for it by `__setitem__`."""
        return self.grid_searchable

    def __init__(self, **kwargs):
        """Store every keyword argument as a setting, unwrapping `GridSearchable` values."""
        super().__init__()
        # `__setattr__` routes to `__setitem__`, so this is stored as the
        # dictionary item 'grid_searchable' rather than as an instance attribute.
        self.grid_searchable = {}
        for key, value in kwargs.items():
            self[key] = value

    def __getattr__(self, attr):
        """
        Expose settings as attributes.

        :raises AttributeError: for dunder names and for settings that are not present, so that
            `hasattr` and `getattr(obj, name, default)` behave normally.
        """
        # Dunder lookups are never served from the dictionary.
        if attr.startswith('__'):
            raise AttributeError(attr)

        try:
            value = self[attr]
        except KeyError:
            raise AttributeError(attr) from None

        # `base_model_path` resolves to the copy under the package's model directory when
        # that file exists; otherwise the stored value is returned untouched.  A `None`
        # value cannot be joined onto a path, so it is returned as-is.
        if attr == "base_model_path" and value is not None:
            full_path = finetune_model_path(value)
            if os.path.exists(full_path):
                return full_path

        return value

    def __setitem__(self, key, value):
        """Store `value` under `key`; a `GridSearchable` contributes its default and registers its iterator."""
        if isinstance(value, GridSearchable):
            self.grid_searchable[key] = value.iterator
            value = value.default
        return super().__setitem__(key, value)

    def __setattr__(self, k, v):
        """Attribute assignment is item assignment."""
        return self.__setitem__(k, v)

    # Attribute deletion is item deletion.
    __delattr__ = dict.__delitem__


def did_you_mean(keyword, keyword_pool):
    """
    Pick the entry of `keyword_pool` with the smallest edit distance to `keyword`.

    :param keyword: The string to find a suggestion for.
    :param keyword_pool: Iterable of candidate strings.
    :return: The closest candidate; on ties, the first one in iteration order of the pool.
    """
    options = list(keyword_pool)
    return min(options, key=lambda option: edit_distance(keyword, option))


def assert_valid_config(**kwargs):
    """
    Check that every keyword argument names a setting present in the default config.

    :param **kwargs: Candidate setting overrides.
    :raises FinetuneError: for the first keyword that is not a key of the default config;
        the message suggests the closest valid key.
    """
    expected_keys = set(get_default_config().keys())
    for kwarg in kwargs:
        if kwarg in expected_keys:
            continue
        suggestion = did_you_mean(kwarg, expected_keys)
        raise FinetuneError(
            "Unexpected setting configuration: `{}` is an invalid keyword. "
            "Did you mean `{}`?".format(kwarg, suggestion)
        )


def get_default_config():
    """
    Build a `Settings` object populated with the default value of every setting.

    The defaults are grouped by topic below and merged in that order before being
    handed to `Settings`.

    :return: Config object.
    """
    general = {
        "low_memory_mode": False,
        "interpolate_pos_embed": False,
        "save_adam_vars": True,
        "shuffle_buffer_size": 100,
        "dataset_size": None,
        "batch_size": 2,
        "visible_gpus": None,  # defaults to all available
        "n_epochs": GridSearchable(3, [1, 2, 3, 4]),
        "seed": 42,
        "max_length": 512,
        "weight_stddev": 0.02,
        "save_dtype": None,
        "val_set": None,
        "per_process_gpu_memory_fraction": 0.95,
        "adapter_size": None,  # from Parameter Efficient Transfer Learning paper
    }
    regularization = {
        "embed_p_drop": 0.1,
        "attn_p_drop": 0.1,
        "resid_p_drop": 0.1,
        "clf_p_drop": 0.1,
        "l2_reg": GridSearchable(0.01, [0.0, 0.1, 0.01, 0.001]),
        "vector_l2": False,
    }
    early_stopping_and_validation = {
        "autosave_path": None,
        "keep_best_model": False,
        "early_stopping_steps": None,
        "min_secs_between_eval": 60,
        "eval_acc": False,
        "val_size": None,
        "val_interval": None,
    }
    debugging = {
        "log_device_placement": False,
        "soft_device_placement": True,
        "tensorboard_folder": None,
        "summarize_grads": False,
        "debugging_logs": False,
    }
    partial_fitting = {
        "num_layers_trained": 12,
        "train_embeddings": True,
    }
    class_imbalance = {
        "class_weights": None,
        "oversample": False,
        "params_device": "cpu",
    }
    optimization = {
        "optimizer": "AdamW",
        "b1": 0.9,
        "b2": 0.999,
        "epsilon": 1e-8,
        "lr_schedule": 'warmup_linear',
        "lr": GridSearchable(6.25e-5, [6.25e-4, 6.25e-5, 6.25e-6]),
        "lr_warmup": 0.002,
        "max_grad_norm": 1.0,
        "prefit_init": False,
        "accum_steps": 1,
    }
    multi_task = {
        "tasks": None,
        "dont_optimize_zero_gradients": False,
    }
    language_model = {
        "lm_loss_coef": 0.0,
        "lm_temp": 0.2,
    }
    sequence_labeling = {
        "seq_num_heads": 16,
        "pad_token": "<PAD>",
        "pad_idx": None,
        "subtoken_predictions": False,
        "multi_label_sequences": False,
        "multi_label_threshold": 0.5,
        "chunk_long_sequences": False,
    }
    regression = {
        "regression_loss": "L2",
    }
    association = {
        "viable_edges": None,
        "association_types": None,
        "assocation_loss_weight": 100.0,
    }
    # Location of model weights
    model_weights = {
        "base_model": GPTModel,
        "base_model_path": None,
    }
    # Possible `SourceModel` specific settings
    source_model = {
        "n_heads": None,
        "n_layer": None,
        "act_fn": None,
        "n_embed": None,
    }
    # for TextCNN SourceModel only
    text_cnn = {
        "kernel_sizes": None,
        "num_filters_per_size": None,
        # needed because the dimensions CNN output are different from the embedding dimensions
        "n_embed_featurizer": None,
    }
    bert_only = {
        "bert_intermediate_size": None,
    }

    defaults = {}
    for section in (
        general,
        regularization,
        early_stopping_and_validation,
        debugging,
        partial_fitting,
        class_imbalance,
        optimization,
        multi_task,
        language_model,
        sequence_labeling,
        regression,
        association,
        model_weights,
        source_model,
        text_cnn,
        bert_only,
    ):
        defaults.update(section)

    # Going through the constructor (rather than `dict.update` on an instance) lets
    # `Settings` unwrap the `GridSearchable` entries.
    return Settings(**defaults)


def get_config(**kwargs):
    """
    Build a config object from the defaults, the base model's settings, and explicit overrides.

    Values are layered in this order, later layers winning: the default config, the
    `settings` of the selected base model, then `kwargs`.

    :param **kwargs: Keyword arguments to override default values.
    :return: Config object.
    :raises FinetuneError: if a keyword is not a recognised setting.
    """
    assert_valid_config(**kwargs)
    config = get_default_config()
    config["base_model"] = kwargs.get('base_model', config["base_model"])
    model_settings = config["base_model"].settings
    config.update(model_settings)
    config.update(kwargs)
    return config


def cpu_config(**kwargs):
    """
    Build a config object whose `visible_gpus` setting is an empty list.

    `kwargs` are re-applied after `visible_gpus` is cleared, so an explicit
    `visible_gpus` keyword takes precedence over the empty list.

    :param **kwargs: Keyword arguments to override default values.
    :return: Config object.
    """
    config = get_config(**kwargs)
    config["visible_gpus"] = []
    config.update(kwargs)
    return config