Source code for finetune.config

import logging
import os
import os.path
import subprocess
import warnings
from collections import namedtuple
from functools import lru_cache

import numpy as np
from nltk.metrics.distance import edit_distance

import finetune
from finetune.errors import FinetuneError
from finetune.base_models import GPTModel, GPT2Model, BERT

LOGGER = logging.getLogger('finetune')


def finetune_model_path(path):
    return os.path.abspath(
        os.path.join(
            os.path.dirname(finetune.__file__),
            'model',
            path
        )
    )
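
# Example (illustrative -- the filename below is hypothetical, and the resolved
# prefix depends on where finetune is installed):
#     >>> finetune_model_path('example-weights.jl')
#     '/.../site-packages/finetune/model/example-weights.jl'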


def nvidia_device_ids():
    sp = subprocess.Popen('nvidia-smi -L', stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
    response = sp.communicate()[0]
    gpu_list = response.decode('utf-8').strip().split('\n')
    device_ids = {}
    for i, gpu in enumerate(gpu_list):
        # May be worth logging GPU description
        device_id_str, _, description = gpu.partition(':')
        assert int(device_id_str.split(' ')[-1]) == i
        device_ids[i] = description
    return device_ids
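
# Example of the parse above, using the first sample line from the all_gpus
# docstring below (partition splits only on the first ':'):
#     >>> "GPU 0: GeForce GTX 980 (UUID: GPU-2d683060-957f-d5ad-123c-a5b49b0116d9)".partition(':')
#     ('GPU 0', ':', ' GeForce GTX 980 (UUID: GPU-2d683060-957f-d5ad-123c-a5b49b0116d9)')
#     >>> int('GPU 0'.split(' ')[-1])
#     0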


@lru_cache()
def all_gpus(visible_gpus=None):
    """
    Get integer ids of all available GPUs.

    Sample response from nvidia-smi -L:
        GPU 0: GeForce GTX 980 (UUID: GPU-2d683060-957f-d5ad-123c-a5b49b0116d9)
        GPU 1: GeForce GTX 980 (UUID: GPU-7b8496dc-3eaf-8db7-01e7-c4a884f66acf)
        GPU 2: GeForce GTX TITAN X (UUID: GPU-9e01f108-e7de-becd-2589-966dcc1c778f)
    """
    if visible_gpus is not None:
        visible_gpus = [int(gpu) for gpu in visible_gpus]
    try:
        cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES")
        device_ids = nvidia_device_ids()
        mapping = None
        # restricting GPUs based on env vars
        if cuda_visible_devices:
            device_ids = {
                device_id: description
                for device_id, description in device_ids.items()
                if str(device_id) in cuda_visible_devices.split(',')
            }
            mapping = {dev_id: i for i, (dev_id, _) in enumerate(sorted(device_ids.items()))}
            
        # restricting GPUs based on config
        if visible_gpus is not None:
            device_ids = {
                device_id: description
                for device_id, description in device_ids.items()
                if device_id in visible_gpus
            }

        LOGGER.info(" Visible GPUs: {{{}}}".format(
            ", ".join([
                "{}:{}".format(device_id, description.split('(')[0]).strip()
                for device_id, description in device_ids.items()
            ])
        ))

        if mapping is not None:
            # Resolve these to internal tensorflow device ids.
            # These are equivalent if no visible_devices masking is used
            device_ids = {
                mapping[device_id]: description
                for device_id, description in device_ids.items()
            }

        device_ids = list(device_ids.keys())
    except Exception:
        # Failed to parse out available GPUs properly
        warnings.warn("Failed to find available GPUs. Falling back to CPU-only mode.")
        device_ids = []

    return device_ids
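
# Example (illustrative sketch on a hypothetical machine with GPUs 0, 1 and 2):
# with CUDA_VISIBLE_DEVICES="1,2", physical devices 1 and 2 pass the env-var
# filter and are remapped to TensorFlow-internal ids 0 and 1:
#     >>> os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"
#     >>> all_gpus()
#     [0, 1]
# Note that `visible_gpus` must be hashable (e.g. a tuple) because of @lru_cache.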

GridSearchable = namedtuple("GridSearchable", "default iterator")
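
# Example (illustrative): a GridSearchable pairs a default value with an iterator
# of candidate values for hyperparameter search, as used in get_default_config below:
#     >>> lr = GridSearchable(6.25e-5, [6.25e-4, 6.25e-5, 6.25e-6])
#     >>> lr.default
#     6.25e-05
#     >>> lr.iterator
#     [0.000625, 6.25e-05, 6.25e-06]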


class Settings(dict):
    """
    Model configuration options

    :param batch_size: Number of examples per batch, defaults to `2`.
    :param visible_gpus: List of integer GPU ids to spread out computation across, defaults to all available GPUs.
    :param n_epochs: Number of iterations through training data, defaults to `3`.
    :param seed: Random seed to use for repeatability purposes, defaults to `42`.
    :param max_length: Maximum number of subtokens per sequence. Examples longer than this number will be truncated
        (unless `chunk_long_sequences=True` for SequenceLabeler models). Defaults to `512`.
    :param weight_stddev: Standard deviation of initial weights. Defaults to `0.02`.
    :param chunk_long_sequences: When True, use a sliding window approach to predict on
        examples that are longer than max length. Defaults to `False`.
    :param low_memory_mode: When True, only store partial gradients on the forward pass
        and recompute remaining gradients incrementally in order to save memory. Defaults to `False`.
    :param interpolate_pos_embed: Interpolate positional embeddings when `max_length` differs from its
        original value of `512`. Defaults to `False`.
    :param embed_p_drop: Embedding dropout probability. Defaults to `0.1`.
    :param attn_p_drop: Attention dropout probability. Defaults to `0.1`.
    :param resid_p_drop: Residual layer fully connected network dropout probability. Defaults to `0.1`.
    :param clf_p_drop: Classifier dropout probability. Defaults to `0.1`.
    :param l2_reg: L2 regularization coefficient. Defaults to `0.01`.
    :param vector_l2: Whether to apply weight decay regularization to vectors (biases, normalization etc.). Defaults to `False`.
    :param optimizer: Optimizer to use; current options include AdamW or AdamaxW.
    :param b1: Adam b1 parameter. Defaults to `0.9`.
    :param b2: Adam b2 parameter. Defaults to `0.999`.
    :param epsilon: Adam epsilon parameter. Defaults to `1e-8`.
    :param lr_schedule: Learning rate schedule -- see `finetune/optimizers.py` for more options.
    :param lr: Learning rate. Defaults to `6.25e-5`.
    :param lr_warmup: Learning rate warmup (percentage of all batches to warm up for). Defaults to `0.002`.
    :param max_grad_norm: Clip gradients larger than this norm. Defaults to `1.0`.
    :param accum_steps: Number of updates to accumulate before applying. This is used to simulate a larger effective batch size.
    :param lm_loss_coef: Language modeling loss coefficient -- a value between `0.0` and `1.0`
        that indicates how to trade off between language modeling loss and target model loss.
        Usually not beneficial to turn on unless dataset size exceeds a few thousand examples. Defaults to `0.0`.
    :param summarize_grads: Include gradient summary information in tensorboard. Defaults to `False`.
    :param val_size: Validation set size as an absolute number if int, or as a percentage of all training data if float.
        Validation will not be run by default if n_examples < 50.
        If n_examples > 50, defaults to max(5, min(100, 0.05 * n_examples)).
    :param val_interval: Evaluate on the validation set after every `val_interval` batches.
        Defaults to 4 * val_size / batch_size to ensure that too much time is not spent on validation.
    :param lm_temp: Language model temperature -- a value of `0.0` corresponds to greedy maximum likelihood predictions
        while a value of `1.0` corresponds to random predictions. Defaults to `0.2`.
    :param seq_num_heads: Number of attention heads of the final attention layer. Defaults to `16`.
    :param subtoken_predictions: Return predictions at subtoken granularity or token granularity? Defaults to `False`.
    :param multi_label_sequences: Use a multi-labeling approach to sequence labeling to allow overlapping labels. Defaults to `False`.
    :param multi_label_threshold: Threshold of the sigmoid unit in the multi-label classifier.
        Can be increased or lowered to trade off precision / recall. Defaults to `0.5`.
    :param autosave_path: Save the current best model (as measured by validation loss) to this location. Defaults to `None`.
    :param tensorboard_folder: Directory for tensorboard logs. Tensorboard logs will not be written
        unless tensorboard_folder is explicitly provided. Defaults to `None`.
    :param log_device_placement: Log which device each operation is placed on for debugging purposes. Defaults to `False`.
    :param allow_soft_placement: Allow tf to allocate an operation to a different device if a device is unavailable. Defaults to `True`.
    :param save_adam_vars: Save Adam parameters when calling `model.save()`. Defaults to `True`.
    :param num_layers_trained: How many layers to finetune. Specifying a value less than 12 will train layers starting from the model output. Defaults to `12`.
    :param train_embeddings: Should the embedding layer be finetuned? Defaults to `True`.
    :param class_weights: One of 'log', 'linear', or 'sqrt'. Auto-scales gradient updates based on class frequency.
        Can also be a dictionary that maps from true class name to loss coefficient. Defaults to `None`.
    :param oversample: Should rare classes be oversampled? Defaults to `False`.
    :param params_device: Which device should gradient updates be aggregated on?
        If you are using a single GPU and have more than 4Gb of GPU memory you should set this to the GPU PCI number (0, 1, 2, etc.). Defaults to `"cpu"`.
    :param eval_acc: If True, calculates accuracy and writes it to the tensorboard summary files for validation runs.
    :param save_dtype: Specifies what precision to save model weights with. Defaults to `np.float32`.
    :param regression_loss: The loss to use for regression models. One of `L1` or `L2`; defaults to `L2`.
    :param prefit_init: If True, fit target model weights before finetuning the entire model. Defaults to `False`.
    :param debugging_logs: If True, output tensorflow logs and turn off TQDM logging. Defaults to `False`.
    :param val_set: Where it is necessary to use an explicit validation set, provide it here as a tuple (text, labels).
    :param per_process_gpu_memory_fraction: Fraction of the overall amount of memory that each visible GPU should be allocated. Defaults to `0.95`.
    :param adapter_size: Width of the adapter module from the 'Parameter Efficient Transfer Learning' paper, if defined. Defaults to `None`.
    """

    def get_grid_searchable(self):
        return self.grid_searchable

    def __init__(self, **kwargs):
        super().__init__()
        self.grid_searchable = {}
        for key, value in kwargs.items():
            self[key] = value

    def __getattr__(self, attr):
        if attr.startswith('__'):
            raise AttributeError
        if attr == "base_model_path":
            full_path = finetune_model_path(self["base_model_path"])
            if os.path.exists(full_path):
                return full_path
        return self[attr]

    def __setitem__(self, key, value):
        if isinstance(value, GridSearchable):
            self.grid_searchable[key] = value.iterator
            value = value.default
        return super().__setitem__(key, value)

    def __setattr__(self, k, v):
        return self.__setitem__(k, v)

    __delattr__ = dict.__delitem__
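
# Example (illustrative): Settings is a dict with attribute-style access;
# GridSearchable values are unwrapped to their defaults on assignment:
#     >>> s = Settings(batch_size=4, n_epochs=GridSearchable(3, [1, 2, 3, 4]))
#     >>> s.batch_size
#     4
#     >>> s['n_epochs']
#     3
#     >>> s.get_grid_searchable()
#     {'n_epochs': [1, 2, 3, 4]}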


def did_you_mean(keyword, keyword_pool):
    candidates = list(keyword_pool)
    closest_match_idx = np.argmin([
        edit_distance(keyword, candidate) for candidate in candidates
    ])
    return candidates[closest_match_idx]


def assert_valid_config(**kwargs):
    expected_keys = set(get_default_config().keys())
    for kwarg in kwargs:
        if kwarg not in expected_keys:
            raise FinetuneError(
                "Unexpected setting configuration: `{}` is an invalid keyword. "
                "Did you mean `{}`?".format(kwarg, did_you_mean(kwarg, expected_keys))
            )


def get_default_config():
    """
    Gets a config object containing all the default parameters for each variant of the model.

    :return: Config object.
    """
    settings = Settings(
        # General Settings
        low_memory_mode=False,
        interpolate_pos_embed=False,
        save_adam_vars=True,
        shuffle_buffer_size=100,
        dataset_size=None,
        batch_size=2,
        visible_gpus=None,  # defaults to all available
        n_epochs=GridSearchable(3, [1, 2, 3, 4]),
        seed=42,
        max_length=512,
        weight_stddev=0.02,
        save_dtype=None,
        val_set=None,
        per_process_gpu_memory_fraction=0.95,
        adapter_size=None,  # from the 'Parameter Efficient Transfer Learning' paper

        # Regularization
        embed_p_drop=0.1,
        attn_p_drop=0.1,
        resid_p_drop=0.1,
        clf_p_drop=0.1,
        l2_reg=GridSearchable(0.01, [0.0, 0.1, 0.01, 0.001]),
        vector_l2=False,

        # Early Stopping and Validation
        autosave_path=None,
        keep_best_model=False,
        early_stopping_steps=None,
        min_secs_between_eval=60,
        eval_acc=False,
        val_size=None,
        val_interval=None,

        # Debugging
        log_device_placement=False,
        soft_device_placement=True,
        tensorboard_folder=None,
        summarize_grads=False,
        debugging_logs=False,

        # Partial Fitting
        num_layers_trained=12,
        train_embeddings=True,

        # Class Imbalance
        class_weights=None,
        oversample=False,
        params_device="cpu",

        # Optimization Params
        optimizer="AdamW",
        b1=0.9,
        b2=0.999,
        epsilon=1e-8,
        lr_schedule='warmup_linear',
        lr=GridSearchable(6.25e-5, [6.25e-4, 6.25e-5, 6.25e-6]),
        lr_warmup=0.002,
        max_grad_norm=1.0,
        prefit_init=False,
        accum_steps=1,

        # MTL
        tasks=None,
        dont_optimize_zero_gradients=False,

        # Language Model Settings
        lm_loss_coef=0.0,
        lm_temp=0.2,

        # Sequence Labeling
        seq_num_heads=16,
        pad_token="<PAD>",
        pad_idx=None,
        subtoken_predictions=False,
        multi_label_sequences=False,
        multi_label_threshold=0.5,
        chunk_long_sequences=False,

        # Regression Params
        regression_loss="L2",

        # Association Params
        viable_edges=None,
        association_types=None,
        assocation_loss_weight=100.0,

        # Location of model weights
        base_model=GPTModel,
        base_model_path=None,

        # Possible `SourceModel` specific settings
        n_heads=None,
        n_layer=None,
        act_fn=None,
        n_embed=None,

        # for TextCNN SourceModel only
        kernel_sizes=None,
        num_filters_per_size=None,
        n_embed_featurizer=None,  # needed because the dimensions of the CNN output differ from the embedding dimensions

        # BERT only
        bert_intermediate_size=None,
    )
    return settings


def get_config(**kwargs):
    """
    Gets a config object containing all the default parameters for each variant of the model,
    with any provided keyword arguments overriding those defaults.

    :param **kwargs: Keyword arguments to override default values.
    :return: Config object.
    """
    assert_valid_config(**kwargs)
    config = get_default_config()
    config.base_model = kwargs.get('base_model', config.base_model)
    config.update(config.base_model.settings)
    config.update(kwargs)
    return config


def cpu_config(**kwargs):
    config = get_config(**kwargs)
    config.visible_gpus = []
    config.update(kwargs)
    return config
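

# Example (illustrative): keyword arguments override the defaults, and a
# misspelled key raises a FinetuneError with a did_you_mean suggestion:
#     >>> config = get_config(n_epochs=2)
#     >>> config.n_epochs
#     2
#     >>> get_config(n_epoch=2)
#     Traceback (most recent call last):
#         ...
#     FinetuneError: Unexpected setting configuration: `n_epoch` is an invalid keyword. Did you mean `n_epochs`?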