Args:
    input_ids (torch.LongTensor): (batch_size, sequence_length) Indices of input sequence tokens in the vocabulary.
    labels (torch.LongTensor, optional): Labels for computing the language modeling loss; typically the same token indices as input_ids. Defaults to None.
    attention_mask (torch.FloatTensor, optional): (batch_size, sequence_length) Mask to avoid performing attention on padding token indices. Defaults to None.
    output_attentions (bool, optional): Whether or not to return the attentions tensors of all attention layers. Defaults to False.
    output_hidden_states (bool, optional): Whether or not to return the hidden states of all layers. Defaults to False.
    return_dict (bool, optional): Whether or not to return a ModelOutput instead of a plain tuple. Defaults to True.
Returns:
    Union[Tuple[torch.Tensor], CausalLMOutput]:
"""
# Convert attention_mask to shape (batch_size, 1, 1, sequence_length) so it
# broadcasts over heads and query positions, then turn it into an additive
# mask: kept positions become 0, padded positions become a large negative value.
if attention_mask is not None:
    attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
    attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype)
    attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min
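To make the masking concrete, here is a small standalone toy example (purely illustrative, not part of the model code): positions kept by the mask contribute 0 to the attention scores, while padded positions receive a value near the dtype minimum, so softmax drives their attention weights to roughly zero.

import torch

mask = torch.tensor([[1, 1, 1, 0]])                # (batch_size=1, seq_len=4), last token is padding
extended = mask.unsqueeze(1).unsqueeze(2).float()  # (1, 1, 1, 4), broadcastable over heads/queries
additive = (1.0 - extended) * torch.finfo(torch.float32).min
print(additive)  # kept positions -> 0.0, padded position -> ~-3.4e38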
if labels is not None:
    # We shift the sequence here so that tokens < n are used to predict token n.
    shift_logits = lm_logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()
    # Flatten the tokens
    loss_fct = nn.CrossEntropyLoss()
    loss = loss_fct(
        shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
    )
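A tiny worked example (toy numbers, purely illustrative) shows what the shift does: the logits at positions 0..n-2 are scored against the tokens at positions 1..n-1, so each position is trained to predict the next token.

import torch

labels = torch.tensor([[10, 11, 12, 13]])   # (batch_size=1, seq_len=4)
lm_logits = torch.randn(1, 4, 100)          # (batch_size, seq_len, vocab_size=100)
shift_logits = lm_logits[..., :-1, :]       # logits at positions 0, 1, 2
shift_labels = labels[..., 1:]              # target tokens 11, 12, 13
print(shift_logits.shape, shift_labels)     # torch.Size([1, 3, 100]) tensor([[11, 12, 13]])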
if not return_dict:
    # add hidden states and attentions if they are here
    output = (lm_logits,) + transformer_outputs[1:]
    # return (loss, output) if loss is not None else output
    return ((loss,) + output) if loss is not None else output
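For completeness, the return_dict=True branch that the excerpt elides would typically wrap the same tensors in the CausalLMOutput named in the docstring. The field wiring below is a hedged sketch, assuming transformer_outputs exposes hidden_states and attentions attributes; field names follow transformers.modeling_outputs.CausalLMOutput.

from transformers.modeling_outputs import CausalLMOutput

# Hedged sketch of the elided return_dict=True branch.
return CausalLMOutput(
    loss=loss,
    logits=lm_logits,
    hidden_states=transformer_outputs.hidden_states,
    attentions=transformer_outputs.attentions,
)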
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import BertPreTokenizer
from tokenizers.processors import TemplateProcessing, BertProcessing

from transformers import PreTrainedTokenizerFast
from transformers import AutoTokenizer
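These imports suggest what train_tokenizer.py does. Below is a minimal sketch of such a script, assuming a BPE model trained on a plain-text corpus with [UNK]/[PAD]/[BOS]/[EOS] special tokens; the corpus file name, vocabulary size, token names, and save path are illustrative assumptions, not taken from the original.

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = BertPreTokenizer()

trainer = BpeTrainer(
    vocab_size=30_000,  # assumed vocabulary size
    special_tokens=["[UNK]", "[PAD]", "[BOS]", "[EOS]"],
)
tokenizer.train(files=["corpus.txt"], trainer=trainer)  # corpus path is an assumption

# Wrap every sequence with BOS/EOS via the post-processor.
tokenizer.post_processor = TemplateProcessing(
    single="[BOS] $A [EOS]",
    special_tokens=[
        ("[BOS]", tokenizer.token_to_id("[BOS]")),
        ("[EOS]", tokenizer.token_to_id("[EOS]")),
    ],
)

# Wrap the raw tokenizer so AutoTokenizer can load it later.
fast_tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    bos_token="[BOS]",
    eos_token="[EOS]",
)
fast_tokenizer.save_pretrained("my-bpe-tokenizer")  # save path is illustrative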
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    remove_columns="text",
    desc="Running tokenizer on every text in dataset",
)
# Drop the trailing example that is shorter than max_seq_length.
tokenized_datasets = tokenized_datasets.filter(
    lambda example: len(example["input_ids"]) == max_seq_length
)
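tokenize_function itself is not shown here. Since the filter above keeps only examples of exactly max_seq_length tokens, it presumably concatenates tokenized documents and splits the stream into fixed-length chunks, leaving only the final chunk short. A hypothetical version, whose details (EOS separator, single "text" column) are assumptions:

def tokenize_function(examples):
    # Tokenize each document, append an EOS separator, then split the joined
    # stream into contiguous chunks of max_seq_length tokens.
    token_ids = []
    for text in examples["text"]:
        token_ids.extend(tokenizer(text)["input_ids"])
        token_ids.append(tokenizer.eos_token_id)
    chunks = [
        token_ids[i : i + max_seq_length]
        for i in range(0, len(token_ids), max_seq_length)
    ]
    return {"input_ids": chunks}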
if train_args.from_remote:
    tokenized_datasets.push_to_hub(f"{train_args.owner}/{train_args.dataset_name}")
else:
    tokenized_datasets.save_to_disk(train_args.dataset_name)
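Loading the processed dataset back later mirrors this branch; a short sketch using the datasets library (the train_args fields are the same ones used above):

from datasets import load_dataset, load_from_disk

if train_args.from_remote:
    tokenized_datasets = load_dataset(f"{train_args.owner}/{train_args.dataset_name}")
else:
    tokenized_datasets = load_from_disk(train_args.dataset_name)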
for epoch in range(args.epochs):
    model.train()
    p_bar = tqdm(train_dataloader, disable=False)
    for step, batch in enumerate(p_bar):
        batch = {k: v.to(device) for k, v in batch.items()}
        # For causal LM pretraining the inputs double as the labels;
        # the shift happens inside the model's loss computation.
        outputs = model(batch["input_ids"], labels=batch["input_ids"])
        loss = outputs.loss
total_loss += loss.item()
p_bar.set_description(
    f"epoch {epoch + 1:2d} (loss={loss.item():5.3f} | global_steps {global_steps:4d} | lr {scheduler.get_last_lr()[0]:.5f})"
)
if gradient_accumulation_steps > 1:
    loss = loss / gradient_accumulation_steps
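The backward/optimizer part of the step is elided in the excerpt. With gradient accumulation it usually looks like the sketch below, where optimizer, scheduler, and the clipping value are assumptions rather than the original code:

loss.backward()
if (step + 1) % gradient_accumulation_steps == 0:
    # One real update after accumulating gradients over N micro-batches.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # assumed clipping value
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
    global_steps += 1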
if args.use_wandb:
    wandb.log({"epoch": epoch, "eval_loss": eval_loss})
if eval_loss < best_loss:
    best_loss = eval_loss
    logger.info(
        f"Saving model to {args.model_name} with best eval loss {eval_loss:.3f}"
    )
    # save to local disk
    model.save_pretrained(f"{args.model_name}")
torch.cuda.empty_cache()
if early_stopper.step(eval_loss):
    print("Stopping early.")
    break
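early_stopper is used but never defined in these excerpts. A minimal patience-based implementation compatible with the step(eval_loss) call above could look like this (the class name, patience, and min_delta defaults are assumptions):

class EarlyStopper:
    def __init__(self, patience: int = 3, min_delta: float = 0.0):
        self.patience = patience
        self.min_delta = min_delta
        self.best = float("inf")
        self.bad_steps = 0

    def step(self, metric: float) -> bool:
        """Return True once the metric has failed to improve `patience` times in a row."""
        if metric < self.best - self.min_delta:
            self.best = metric
            self.bad_steps = 0
        else:
            self.bad_steps += 1
        return self.bad_steps >= self.patience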
if __name__ == "__main__":
    # run train_tokenizer.py to get tokenizer
    if train_args.from_remote:
        tokenizer = AutoTokenizer.from_pretrained(
            f"{train_args.owner}/{train_args.tokenizer_name}", use_fast=True
        )
    else:
        tokenizer = AutoTokenizer.from_pretrained(
            f"{train_args.tokenizer_name}", use_fast=True
        )