This is late, but for the benefit of those for whom the previous answers did not work: another method I found is to override the `evaluate` method of the `Trainer` class in the Transformers library. The idea is to run the evaluation loop over the training set as well and add its metrics to the logs. Make sure to combine the eval and train metric dictionaries into one when returning.
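Combining the two dictionaries is a plain dict union; for illustration (the values here are made up):

```python
train_metrics = {"train_loss": 2.09, "train_accuracy": 0.13}
eval_metrics = {"eval_loss": 2.11, "eval_accuracy": 0.13}

combined = train_metrics | eval_metrics       # dict union, Python >= 3.9
combined = {**train_metrics, **eval_metrics}  # equivalent on older Python 3 versions
```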
Extend the `Trainer` class and override `evaluate` as follows:
```python
import math
import time
from typing import Dict, List, Optional

from torch.utils.data import Dataset
from transformers import Trainer, is_torch_tpu_available
from transformers.debug_utils import DebugOption
from transformers.trainer_utils import speed_metrics

if is_torch_tpu_available():
    import torch_xla.core.xla_model as xm
    import torch_xla.debug.metrics as met


class CTCTrainer(Trainer):
    def evaluate(
        self,
        eval_dataset: Optional[Dataset] = None,
        ignore_keys: Optional[List[str]] = None,
        metric_key_prefix: str = "eval",
    ) -> Dict[str, float]:
        """
        Run evaluation and return metrics.

        The calling script will be responsible for providing a method to compute metrics, as they are
        task-dependent (pass it to the init `compute_metrics` argument).

        You can also subclass and override this method to inject custom behavior.

        Args:
            eval_dataset (`Dataset`, *optional*):
                Pass a dataset if you wish to override `self.eval_dataset`. If it is a [`~datasets.Dataset`],
                columns not accepted by the `model.forward()` method are automatically removed. It must
                implement the `__len__` method.
            ignore_keys (`List[str]`, *optional*):
                A list of keys in the output of your model (if it is a dictionary) that should be ignored when
                gathering predictions.
            metric_key_prefix (`str`, *optional*, defaults to `"eval"`):
                An optional prefix to be used as the metrics key prefix. For example the metric "bleu" will be
                named "eval_bleu" if the prefix is "eval" (default).

        Returns:
            A dictionary containing the evaluation loss and the potential metrics computed from the
            predictions. The dictionary also contains the epoch number which comes from the training state.
        """
        # memory metrics - must set up as early as possible
        self._memory_tracker.start()

        eval_dataloader = self.get_eval_dataloader(eval_dataset)
        train_dataloader = self.get_train_dataloader()
        start_time = time.time()

        eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
        eval_output = eval_loop(
            eval_dataloader,
            description="Evaluation",
            # No point gathering the predictions if there are no metrics, otherwise we defer to
            # self.args.prediction_loss_only
            prediction_loss_only=True if self.compute_metrics is None else None,
            ignore_keys=ignore_keys,
            metric_key_prefix=metric_key_prefix,
        )
        # Run the same loop over the training set, prefixing its metrics with "train" instead of "eval".
        train_output = eval_loop(
            train_dataloader,
            description="Training Evaluation",
            prediction_loss_only=True if self.compute_metrics is None else None,
            ignore_keys=ignore_keys,
            metric_key_prefix="train",
        )

        total_batch_size = self.args.eval_batch_size * self.args.world_size
        if f"{metric_key_prefix}_jit_compilation_time" in eval_output.metrics:
            start_time += eval_output.metrics[f"{metric_key_prefix}_jit_compilation_time"]
        eval_output.metrics.update(
            speed_metrics(
                metric_key_prefix,
                start_time,
                num_samples=eval_output.num_samples,
                num_steps=math.ceil(eval_output.num_samples / total_batch_size),
            )
        )
        train_n_samples = len(self.train_dataset)
        train_output.metrics.update(speed_metrics("train", start_time, num_samples=train_n_samples))

        # Log the combined train + eval metrics as a single entry.
        self.log(train_output.metrics | eval_output.metrics)

        if DebugOption.TPU_METRICS_DEBUG in self.args.debug:
            # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
            xm.master_print(met.metrics_report())

        self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, train_output.metrics)
        self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, eval_output.metrics)

        self._memory_tracker.stop_and_update_metrics(eval_output.metrics)
        self._memory_tracker.stop_and_update_metrics(train_output.metrics)

        # dict union (`|`) only works in Python >= 3.9; use {**a, **b} on older versions
        return train_output.metrics | eval_output.metrics
```
Remember to use your custom extended class to train your model: construct `trainer = CTCTrainer(...)` with the usual `Trainer` arguments and then call `trainer.train()`.
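A minimal sketch, assuming you already have a `model`, tokenized `train_dataset`/`eval_dataset`, and a `compute_metrics` function from your own pipeline (the argument values below are placeholders):

```python
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",       # placeholder path
    evaluation_strategy="steps",  # evaluate() runs every eval_steps, logging train + eval metrics
    eval_steps=5,
    num_train_epochs=2,
)

trainer = CTCTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()
```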
The code above will produce entries like the following in your `log_history` (as saved, for example, in `trainer_state.json`):
"log_history": [
{
"epoch": 0.67,
"learning_rate": 6.428571428571429e-05,
"loss": 2.1279,
"step": 5
},
{
"epoch": 0.67,
"eval_accuracy": 0.13333334028720856,
"eval_loss": 2.1077311038970947,
"eval_runtime": 10.683,
"eval_samples_per_second": 5.616,
"eval_steps_per_second": 1.404,
"step": 5,
"train_accuracy": 0.13333334028720856,
"train_loss": 2.086669921875,
"train_runtime": 10.683,
"train_samples_per_second": 5.616
}
}
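If you want to read these entries back programmatically after training, they are kept on the trainer state; a quick sketch (the key filter is just one way to pick out the combined entries):

```python
# trainer.state.log_history holds the logged dictionaries shown above
for entry in trainer.state.log_history:
    if "eval_loss" in entry and "train_loss" in entry:
        print(entry["step"], entry["train_loss"], entry["eval_loss"])
```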