Source code for trlx.trlx

import os
import warnings
from typing import Callable, Dict, Iterable, List, Optional, Tuple

from trlx.data.configs import TRLConfig
from trlx.data.default_configs import (
    default_ilql_config,
    default_ppo_config,
    default_sft_config,
)
from trlx.utils import set_seed
from trlx.utils.loading import get_pipeline, get_trainer


def train(  # noqa: C901
    model_path: Optional[str] = None,
    reward_fn: Optional[Callable[[List[str], List[str], List[str]], List[float]]] = None,
    dataset: Optional[Iterable[Tuple[str, float]]] = None,
    samples: Optional[List[str]] = None,
    rewards: Optional[List[float]] = None,
    prompts: Optional[List[str]] = None,
    eval_prompts: Optional[List[str]] = None,
    metric_fn: Optional[Callable[[List[str], List[str], List[str]], Dict[str, List[float]]]] = None,
    config: Optional[TRLConfig] = None,
    stop_sequences: Optional[List[str]] = [],
):
    """
    Runs online or offline reinforcement learning training, or supervised finetuning,
    depending on the provided arguments. `reward_fn` and `prompts` are required for
    online training; `samples` and `rewards` are required for offline training.

    Args:
        model_path (`Optional[str]`):
            Path to either a Hugging Face Hub checkpoint or a local directory.

        config (`Optional[TRLConfig]`):
            Training configuration object.

        reward_fn (`Optional[Callable[[List[str], List[str], List[str]], List[float]]]`):
            A function to rate batches of generated samples. Its required arguments are
            (`samples`, `prompts`, `outputs`) and the return is a list of scalar rewards,
            one per sample in the batch.

        dataset (`List[Union[str, List[str]]], List[float]`):
            Lists of samples and rewards for offline training. (Deprecated: use `samples`
            and `rewards` instead.)

        samples (`List[Union[str, List[str]]]`):
            List of strings, or a list of prompts (questions or environment states) and
            outputs which are meant to be optimized. In the latter case the following form
            is expected: (prompt_0: str, output_0: str, prompt_1: str, output_1: str, ...).
            Giving a single string `s` for a sample is shorthand for
            (`tokenizer.bos_token`, `s`).

        rewards (`List[float]`):
            List of scalar rewards, one per sample in `samples`.

        prompts (`Union[List[str], List[Dict[str, Any]]]`):
            Prompts to use for generations during online training. If a dict is passed as
            a prompt, it must have a required key `"prompt"`; all extra keys are passed
            along with the generation for that prompt as keyword arguments to the reward
            function.

        eval_prompts (`Union[List[str], List[Dict[str, Any]]]`):
            Prompts to use for periodic validation during training.

        metric_fn (`Optional[Callable[[List[str], List[str], List[str]], Dict[str, List[float]]]]`):
            Function to compute statistics on batches of generated samples. Its arguments
            are the same as in `reward_fn` (`samples`, `prompts`, `outputs`), but the return
            is a dictionary mapping each metric's name to a list of scalar values, one per
            sample in the batch.

        stop_sequences (`Optional[List[str]]`):
            String sequences at which to trim generations (both during experience
            generation and evaluation). Generations will not contain them and will also be
            right-stripped.
""" if config is None: warnings.warn( "Passing the `config` argument implicitly is depreciated, use or" "adapt some from `trlx/data/default_configs.py` instead" ) if reward_fn: config = default_ppo_config() elif rewards: config = default_ilql_config() else: config = default_sft_config() set_seed(config.train.seed) if dataset: warnings.warn("the `dataset` argument is being depreciated, split it into `samples` and `rewards` instead") samples, rewards = dataset if model_path: config.model.model_path = model_path trainer = get_trainer(config.train.trainer)( config=config, reward_fn=reward_fn, metric_fn=metric_fn, stop_sequences=stop_sequences, **config.train.trainer_kwargs, ) batch_size = config.train.batch_size * int(os.environ.get("WORLD_SIZE", 1)) max_prompt_length = config.train.seq_length - config.method.gen_kwargs["max_new_tokens"] # Online training against a reward function (e.g. PPO, RFT) if reward_fn: prompts = prompts or [trainer.tokenizer.bos_token] * batch_size if eval_prompts is None: eval_prompts = prompts[:batch_size] pipeline = get_pipeline(config.train.pipeline)( prompts, max_prompt_length, trainer.tokenizer, add_special_tokens=config.model.model_arch_type == "seq2seq" ) trainer.add_prompt_pipeline(pipeline) if eval_prompts is None: eval_prompts = prompts[:batch_size] # Offline training from the collected samples (e.g. SFT, ILQL) elif samples: if rewards is not None: if len(samples) != len(rewards): raise ValueError(f"Number of samples {len(samples)} should match the number of rewards {len(rewards)}") if eval_prompts is None: eval_prompts = [trainer.tokenizer.bos_token] * batch_size if rewards is not None: trainer.make_experience(samples, rewards, config.train.seq_length) else: trainer.make_experience(samples, config.train.seq_length) else: raise ValueError("Either `samples` or `reward_fn` should be given for training") eval_pipeline = get_pipeline(config.train.pipeline)( eval_prompts, max_prompt_length, trainer.tokenizer, add_special_tokens=config.model.model_arch_type == "seq2seq" ) trainer.add_eval_pipeline(eval_pipeline) if config.train.resume_from_checkpoint and os.path.exists(config.train.resume_from_checkpoint): trainer.load(config.train.resume_from_checkpoint) trainer.learn() return trainer