import torch
import transformer_engine.pytorch as te
from transformer_engine.common import recipe

# Set dimensions.
in_features = 768
out_features = 3072
hidden_size = 2048

# Initialize model and inputs.
model = te.Linear(in_features, out_features, bias=True)
inp = torch.randn(hidden_size, in_features, device="cuda")

# Create an FP8 recipe. Note: All input args are optional.
fp8_recipe = recipe.DelayedScaling(margin=0, interval=1, fp8_format=recipe.Format.E4M3)

# Enable autocasting for the forward pass.
with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
    out = model(inp)

loss = out.sum()
loss.backward()
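DelayedScaling computes FP8 scaling factors from a history of absolute-maximum (amax) values recorded in earlier iterations rather than from the current tensor, which keeps the cast cheap. Format.E4M3 stores all FP8 tensors in E4M3, while Format.HYBRID uses E4M3 for the forward pass and the wider-range E5M2 for gradients in the backward pass.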
2) Accelerate: FP8 training with support for DDP and FSDP
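For context, this is roughly how FP8 is requested from the user side. The sketch below is an assumption based on the FP8RecipeKwargs handler and the mixed_precision="fp8" flag in recent Accelerate releases; field names may differ between versions, and model / optimizer / dataloader stand for the user's ordinary PyTorch objects.

from accelerate import Accelerator
from accelerate.utils import FP8RecipeKwargs

# Request the Transformer Engine backend; fp8_format is a string that the
# internals below resolve to te_recipe.Format via getattr.
fp8_handler = FP8RecipeKwargs(backend="TE", fp8_format="HYBRID")
accelerator = Accelerator(mixed_precision="fp8", kwargs_handlers=[fp8_handler])

# prepare() converts supported layers to Transformer Engine modules and wraps
# the model's forward in fp8_autocast (immediately, or after DDP/FSDP wrapping).
model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)

Internally, Accelerate's prepare step then runs the following logic: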
# We prepare fp8 after, allowing for bf16 autocast to happen first
if getattr(self.fp8_recipe_handler, "backend", None) == "TE":
    if not has_transformer_engine_layers(model):
        with torch.no_grad():
            convert_model(model)
        model._converted_to_transformer_engine = True

    kwargs = self.fp8_recipe_handler.to_kwargs() if self.fp8_recipe_handler is not None else {}
    if "fp8_format" in kwargs:
        kwargs["fp8_format"] = getattr(te_recipe.Format, kwargs["fp8_format"])
    fp8_recipe = te_recipe.DelayedScaling(**kwargs)

    # If we are in DDP or FSDP, we delay `autocast` until after FSDP/DDP has been initialized
    # to make use of the process group
    if not self.delayed_fp8_autocast:
        model.forward = fp8_autocast(enabled=True, fp8_recipe=fp8_recipe)(model.forward)
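Two details matter here: convert_model swaps supported torch.nn layers (e.g. Linear, LayerNorm) for their Transformer Engine counterparts so that FP8 kernels can actually be used, and when DDP or FSDP is active (delayed_fp8_autocast), wrapping forward in fp8_autocast is deferred until after the model has been wrapped, so that the recipe's amax/scale synchronization can make use of the data-parallel process group.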
3) Megatron-LM: building the model from Transformer Engine layers

# define TE model
use_te = args.transformer_impl == "transformer_engine"
if use_te:
    transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec()
model = GPTModel(transformer_layer_spec=transformer_layer_spec)
# set autocast context
class TransformerBlock(MegatronModule):
    def forward(self, hidden_states, attention_mask, **kwargs):  # signature abridged
        if self.config.fp8:
            import transformer_engine  # To keep out TE dependency when not training in fp8

            if self.config.fp8 == "e4m3":
                fp8_format = transformer_engine.common.recipe.Format.E4M3
            elif self.config.fp8 == "hybrid":
                fp8_format = transformer_engine.common.recipe.Format.HYBRID
            else:
                raise ValueError("E4M3 and HYBRID are the only supported FP8 formats.")
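After choosing the format, the same forward builds a DelayedScaling recipe from the config and enters fp8_autocast for the whole block. The sketch below paraphrases that step rather than quoting it verbatim: the config attributes (fp8_margin, fp8_interval, fp8_amax_history_len, fp8_amax_compute_algo) and parallel_state (i.e. megatron.core.parallel_state) follow Megatron-LM's naming but may differ between versions.

# Sketch of the rest of the `if self.config.fp8:` branch in TransformerBlock.forward.
fp8_recipe = transformer_engine.common.recipe.DelayedScaling(
    margin=self.config.fp8_margin,
    interval=self.config.fp8_interval,
    fp8_format=fp8_format,
    amax_history_len=self.config.fp8_amax_history_len,
    amax_compute_algo=self.config.fp8_amax_compute_algo,
)

# Amax statistics (and hence the FP8 scales) are reduced across ranks, so the
# amax reduction group is handed to fp8_autocast.
fp8_group = None
if parallel_state.model_parallel_is_initialized():
    fp8_group = parallel_state.get_amax_reduction_group()

fp8_context = transformer_engine.pytorch.fp8_autocast(
    enabled=True, fp8_recipe=fp8_recipe, fp8_group=fp8_group
)

with fp8_context:
    ...  # run the block's transformer layers under FP8 autocast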