>>> tokenizer = AutoTokenizer.from_pretrained(ckpt)
>>> model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16).to("cuda:0")
>>> inputs = tokenizer("Fun fact: The shortest", return_tensors="pt").to(model.device)
>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=23, cache_implementation="offloaded")
>>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
Fun fact: The shortest war in history was between Britain and Zanzibar on August 27, 1896.
>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=23)
>>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
Fun fact: The shortest war in history was between Britain and Zanzibar on August 27, 1896.
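# A hedged sketch (not part of the snippet above): since offloading the KV cache trades speed
# for GPU memory, one option is to try the default cache first and fall back to
# cache_implementation="offloaded" only when generation runs out of GPU memory.
# The helper name `resilient_generate` is illustrative.
def resilient_generate(model, *args, **kwargs):
    try:
        return model.generate(*args, **kwargs)
    except torch.cuda.OutOfMemoryError:
        print("Retrying with cache_implementation='offloaded'")
        torch.cuda.empty_cache()
        kwargs["cache_implementation"] = "offloaded"
        return model.generate(*args, **kwargs)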
# Launch with `deepspeed deepspeed-zero-inference.py`
import torch
import deepspeed
import os
import time
from transformers.deepspeed import HfDeepSpeedConfig
from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM
# Share the DeepSpeed config with HuggingFace so that we can properly load the large model with ZeRO stage 3
hfdsc = HfDeepSpeedConfig(ds_config)
# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype=torch.float16
)
def run_deepspeed_inference():
    # Load the model on meta tensors
    config = AutoConfig.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
    with deepspeed.OnDevice(dtype=torch.float16, device="meta", enabled=True):
        model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.float16)
    # Define the checkpoint dict. You may need to convert *.safetensors to
    # *.bin for this to work. Make sure you get all the *.bin and *.pt files in
    # the checkpoint_files list.
    checkpoint_dir = "/workspace/models/llama-7b-hf"
    checkpoint_files = [
        os.path.join(checkpoint_dir, f"pytorch_model-{i:05d}-of-00033.bin")
        # os.path.join(checkpoint_dir, f"model-{i:05d}-of-00033.safetensors")
        for i in range(1, 34)
    ]
    checkpoint_dict = {
        "type": "DS_MODEL",
        "checkpoints": checkpoint_files,
        "version": 1.0,
    }
    print(checkpoint_dict)
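# A hedged, standalone sketch of the *.safetensors -> *.bin conversion mentioned in the comment
# above (an assumed one-off helper, not part of the original script): each shard is loaded with
# the `safetensors` library and re-saved with torch.save so DeepSpeed can read it.
from safetensors.torch import load_file

checkpoint_dir = "/workspace/models/llama-7b-hf"
for i in range(1, 34):
    src = os.path.join(checkpoint_dir, f"model-{i:05d}-of-00033.safetensors")
    dst = os.path.join(checkpoint_dir, f"pytorch_model-{i:05d}-of-00033.bin")
    state_dict = load_file(src)   # dict of tensors from the safetensors shard
    torch.save(state_dict, dst)   # write a standard PyTorch .bin shard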
# This script demonstrates how to use DeepSpeed ZeRO in an inference mode when one can't fit a model
# into a single GPU
#
# 1. Use 1 GPU with CPU offload
# 2. Or use multiple GPUs instead
#
# First you need to install deepspeed: pip install deepspeed
#
# Here we use a 3B "bigscience/T0_3B" model which needs about 15GB GPU RAM - so 1 largish or 2
# small GPUs can handle it, or 1 small GPU and a lot of CPU memory.
#
# To use a larger model like "bigscience/T0" which needs about 50GB, unless you have an 80GB GPU
# you will need 2-4 gpus. And then you can adapt the script to handle more gpus if you want to
# process multiple inputs at once.
#
# The provided deepspeed config also activates CPU memory offloading, so chances are that if you
# have a lot of available CPU memory and you don't mind a slowdown you should be able to load a
# model that doesn't normally fit into a single GPU. If you have enough GPU memory and don't need
# CPU offload, the program will run faster - so disable that section of the config then.
#
# To deploy on 1 gpu:
#
# deepspeed --num_gpus 1 t0.py
# or:
# python -m torch.distributed.run --nproc_per_node=1 t0.py
#
# To deploy on 2 gpus:
#
# deepspeed --num_gpus 2 deepspeed-inference-zero.py
# Since the GPU IDs are specified explicitly here, there is no need to also pass "--num_nodes" / "--num_gpus":
# deepspeed --include=localhost:6,7 deepspeed-inference-zero.py
# or:
# python -m torch.distributed.run --nproc_per_node=2 t0.py
from transformers import AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM, AutoModelForCausalLM
from transformers.integrations import HfDeepSpeedConfig
import deepspeed
import os
import torch
os.environ["TOKENIZERS_PARALLELISM"] = "false"# To avoid warnings about parallelism in tokenizers
# batch size has to be divisible by world_size, but can be bigger than world_size
train_batch_size = 1 * world_size
# ds_config notes
#
# - enable bf16 if you use Ampere or higher GPU - this will run in mixed precision and will be
#   faster.
#
# - for older GPUs you can enable fp16, but it'll only work for non-bf16 pretrained models - e.g.
#   all official t5 models are bf16-pretrained
#
# - set offload_param.device to "none" or completely remove the `offload_param` section if you
#   don't want CPU offload
#
# - if using `offload_param` you can manually finetune stage3_param_persistence_threshold to control
#   which params should remain on gpus - the larger the value the smaller the offload size
#
# For in-depth info on Deepspeed config see
# https://huggingface.co/docs/transformers/main/main_classes/deepspeed
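# `model_hidden_size`, used in the ds_config below to size the ZeRO-3 buckets, is assumed to come
# from the model config - a sketch using the example model named in the header comment:
model_name = "bigscience/T0_3B"
config = AutoConfig.from_pretrained(model_name)
model_hidden_size = config.d_model  # hidden size of the T5-family model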
# keeping the same format as json for consistency, except it uses lower case for true/false
# fmt: off
ds_config = {
    "fp16": {
        "enabled": False
    },
    "bf16": {
        "enabled": False
    },
    "zero_optimization": {
        "stage": 3,
        "offload_param": {
            "device": "cpu",
            "pin_memory": True
        },
        "overlap_comm": True,
        "contiguous_gradients": True,
        "reduce_bucket_size": model_hidden_size * model_hidden_size,
        "stage3_prefetch_bucket_size": 0.9 * model_hidden_size * model_hidden_size,
        "stage3_param_persistence_threshold": 10 * model_hidden_size
    },
    "steps_per_print": 2000,
    "train_batch_size": train_batch_size,
    "train_micro_batch_size_per_gpu": 1,
    "wall_clock_breakdown": False
}
# fmt: on
# next line instructs transformers to partition the model directly over multiple gpus using
# deepspeed.zero.Init when the model's `from_pretrained` method is called.
#
# **it has to be run before loading the model AutoModelForSeq2SeqLM.from_pretrained(model_name)**
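# A minimal sketch of the assumed ordering (the variable names `dschf` and `ds_engine` are
# illustrative); keep a reference to the HfDeepSpeedConfig object so that zero.Init stays active
# while the weights are being loaded:
dschf = HfDeepSpeedConfig(ds_config)  # must be created *before* from_pretrained

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.eval()

# hand the model to the DeepSpeed engine so ZeRO-3 (with optional CPU offload) manages the params
ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0]
ds_engine.module.eval()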