From c700c8bb23c9f795664fbe00f6a33920f179bfdf Mon Sep 17 00:00:00 2001 From: Lawrence Date: Wed, 25 Dec 2024 11:25:59 +0800 Subject: [PATCH 1/4] init update: support no image input --- .gitignore | 8 +- examples/qwen2vl/data_7b.json | 2 +- examples/qwen2vl/finetune_qwen2vl_7b.sh | 14 +-- examples/qwen2vl/inference_qwen2vl_7b.json | 9 +- examples/qwen2vl/inference_qwen2vl_7b.sh | 6 +- .../llava_instruct_2_mllm_demo_format.py | 12 ++- .../qwen2vl/qwen2vl_convert_to_mm_ckpt.py | 8 +- inference_vlm.py | 97 ++++++++++++++++++- mindspeed_mm/models/qwen2vl_model.py | 39 ++++---- .../inference/pipeline/qwen2vl_pipeline.py | 13 ++- pretrain_qwen2vl.py | 24 +++-- redis_utils.py | 86 ++++++++++++++++ 12 files changed, 261 insertions(+), 57 deletions(-) create mode 100644 redis_utils.py diff --git a/.gitignore b/.gitignore index 7aa4dbb4..03b38668 100644 --- a/.gitignore +++ b/.gitignore @@ -157,4 +157,10 @@ cython_debug/ /tests/st/run_jsons/ /tests/st/run_logs/ -fusion_result.json \ No newline at end of file +fusion_result.json + +MindSpeed/ +ckpt/ +data/ +dependencies/ +save_dir*/ \ No newline at end of file diff --git a/examples/qwen2vl/data_7b.json b/examples/qwen2vl/data_7b.json index 39b7869f..09cbfd5c 100644 --- a/examples/qwen2vl/data_7b.json +++ b/examples/qwen2vl/data_7b.json @@ -25,7 +25,7 @@ }, "attr": { "system": null, - "images": "images", + "images": null, "videos": null, "messages": "messages", "role_tag": "role", diff --git a/examples/qwen2vl/finetune_qwen2vl_7b.sh b/examples/qwen2vl/finetune_qwen2vl_7b.sh index b65e8ea7..a271be79 100644 --- a/examples/qwen2vl/finetune_qwen2vl_7b.sh +++ b/examples/qwen2vl/finetune_qwen2vl_7b.sh @@ -24,8 +24,10 @@ WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) MM_DATA="./examples/qwen2vl/data_7b.json" MM_MODEL="./examples/qwen2vl/model_7b.json" MM_TOOL="./mindspeed_mm/tools/tools.json" -LOAD_PATH="ckpt/Qwen2-VL-7B-Instruct" -SAVE_PATH="save_dir" +LOAD_PATH="./ckpt/mm_path/Qwen2-VL-7B-Instruct" +# timestamp=$(date +"%Y%m%d_%H%M%S") +# SAVE_PATH=$("save_dir/" + $timestamp) +SAVE_PATH="save_dir/" TP=1 PP=4 @@ -68,7 +70,7 @@ GPT_ARGS=" --lr 1.0e-5 \ --lr-decay-style cosine \ --weight-decay 0 \ - --train-iters 10000 \ + --train-iters 100 \ --lr-warmup-fraction 0.1 \ --clip-grad 0.0 \ --adam-beta1 0.9 \ @@ -95,9 +97,9 @@ MM_ARGS=" OUTPUT_ARGS=" --log-interval 1 \ - --save-interval 10000 \ - --eval-interval 10000 \ - --eval-iters 5000 \ + --save-interval 100 \ + --eval-interval 100 \ + --eval-iters 100 \ --save $SAVE_PATH \ " logfile=$(date +%Y%m%d)_$(date +%H%M%S) diff --git a/examples/qwen2vl/inference_qwen2vl_7b.json b/examples/qwen2vl/inference_qwen2vl_7b.json index b357697a..a0120a93 100644 --- a/examples/qwen2vl/inference_qwen2vl_7b.json +++ b/examples/qwen2vl/inference_qwen2vl_7b.json @@ -85,7 +85,7 @@ "tokenizer": { "hub_backend": "hf", "autotokenizer_name": "AutoTokenizer", - "from_pretrained": "ckpt/Qwen2-VL-7B-Instruct", + "from_pretrained": "ckpt/hf_path/Qwen2-VL-7B-Instruct", "local_files_only":false }, "generation_config": { @@ -153,7 +153,8 @@ "max_matching_ngram_size": null, "stop_strings": null }, - "image_processer_path": "ckpt/Qwen2-VL-7B-Instruct/preprocessor_config.json", - "image_path": "examples/qwen2vl/demo.jpeg", - "prompts": "Describe this image and keep it within 100 words." 
+ "image_processer_path": "ckpt/hf_path/Qwen2-VL-7B-Instruct/preprocessor_config.json", + "image_path": "", + "prompts": "Describe this image and keep it within 100 words.", + "temperature": 0 } diff --git a/examples/qwen2vl/inference_qwen2vl_7b.sh b/examples/qwen2vl/inference_qwen2vl_7b.sh index 08096fa9..f5b64867 100644 --- a/examples/qwen2vl/inference_qwen2vl_7b.sh +++ b/examples/qwen2vl/inference_qwen2vl_7b.sh @@ -16,7 +16,7 @@ export HOST_CACHE_CAPACITY=20 export ACLNN_CACHE_LIMIT=100000 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True -NPUS_PER_NODE=1 +NPUS_PER_NODE=4 MASTER_ADDR=localhost MASTER_PORT=6000 NNODES=1 @@ -24,10 +24,10 @@ NODE_RANK=0 WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) MM_MODEL="./examples/qwen2vl/inference_qwen2vl_7b.json" -LOAD_PATH="ckpt/Qwen2-VL-7B-Instruct" +LOAD_PATH="/home/ma-user/work/MindSpeed-MM/save_dir/" TP=1 -PP=1 +PP=4 CP=1 SEQ_LEN=1024 MBS=1 diff --git a/examples/qwen2vl/llava_instruct_2_mllm_demo_format.py b/examples/qwen2vl/llava_instruct_2_mllm_demo_format.py index 90e43835..4bbd5a75 100644 --- a/examples/qwen2vl/llava_instruct_2_mllm_demo_format.py +++ b/examples/qwen2vl/llava_instruct_2_mllm_demo_format.py @@ -2,7 +2,8 @@ import json import os import stat -llava_json_path = "./data/llava_instruct_150k.json" +# llava_json_path = "./data/llava_instruct_150k.json" +llava_json_path = "./data/llava_instruct_150k_wo_img.json" mllm_format_json_path = "./data/mllm_format_llava_instruct_data.json" with open(llava_json_path, "r") as f: @@ -10,12 +11,13 @@ with open(llava_json_path, "r") as f: mllm_format_llava_instruct_data = [] for item in info_json: - img_path = os.path.join("./data/COCO2017/train2017", item["image"]) - print(f"img_path: {img_path}") + # img_path = os.path.join("./data/COCO2017/train2017", item["image"]) + img_path = os.path.join("./data/dummy", item["image"]) if not os.path.exists(img_path): continue new_item = { "images": [img_path], + # "images": "", "messages": [] } @@ -29,6 +31,8 @@ for item in info_json: mllm_format_llava_instruct_data.append(new_item) output_json = json.dumps(mllm_format_llava_instruct_data) +if os.path.exists(mllm_format_json_path): + os.remove(mllm_format_json_path) with os.fdopen(os.open(mllm_format_json_path, os.O_WRONLY | os.O_CREAT | os.O_EXCL, stat.S_IWUSR | stat.S_IRUSR), "w") as f: f.write(output_json) -print("finish converting dataset") +print(f"finish converting dataset into {mllm_format_json_path}") diff --git a/examples/qwen2vl/qwen2vl_convert_to_mm_ckpt.py b/examples/qwen2vl/qwen2vl_convert_to_mm_ckpt.py index 4daf73dd..b85e6c5f 100644 --- a/examples/qwen2vl/qwen2vl_convert_to_mm_ckpt.py +++ b/examples/qwen2vl/qwen2vl_convert_to_mm_ckpt.py @@ -305,9 +305,11 @@ def save_by_pp(_state_dicts, _save_dir, _lastest_checkpointed_iteration='release if __name__ == "__main__": - hg_ckpt_dir = "Qwen2-VL-7B-Instruct" - mm_save_dir = 'ckpt/Qwen2-VL-7B-Instruct' - + # hg_ckpt_dir = "Qwen2-VL-7B-Instruct" + # mm_save_dir = 'ckpt/Qwen2-VL-7B-Instruct' + hg_ckpt_dir = 'ckpt/hf_path/Qwen2-VL-7B-Instruct' # huggingface权重目录 + mm_save_dir = 'ckpt/mm_path/Qwen2-VL-7B-Instruct' # 转换后保存目录 + vit_hidden_size = 1280 vit_attention_heads_num = 16 diff --git a/inference_vlm.py b/inference_vlm.py index d168bfd1..4024f046 100644 --- a/inference_vlm.py +++ b/inference_vlm.py @@ -1,26 +1,113 @@ +import os import torch +from time import time, sleep +from concurrent.futures import ThreadPoolExecutor import mindspeed.megatron_adaptor from megatron.training import get_args from mindspeed_mm.tasks.inference.pipeline import 
vlm_pipeline_dict from mindspeed_mm.configs.config import mm_extra_args_provider +from redis_utils import RedisManager + + +def inference_and_write(batch_requests, pipeline, redis: RedisManager, model_name: str): + # Extract input data + sys_prompts = [req["sys_prompt"] for req in batch_requests] + queries = [req["query"] for req in batch_requests] + main_ids = [req["message_id"] for req in batch_requests] + + full_queries = sys_prompts[0] + queries[0] + + start_time = time() + outputs = pipeline(prompt=full_queries, return_ids=True) + print(f"Inference time: {time() - start_time}") + + if not isinstance(outputs, list): + outputs = [outputs] + print(outputs) + + def write_to_redis(message_id, output): + while True: + try: + redis.write_data_to_result(model_name, message_id, {"output": output}) + break + except Exception as e: + continue + + # Multithreaded writing to Redis + with ThreadPoolExecutor() as executor: + futures = [ + executor.submit(write_to_redis, message_id, output) + for message_id, output in zip(main_ids, outputs) + ] + + # Wait until all futures are completed + for future in futures: + future.result() + def main(): from megatron.training.initialize import initialize_megatron from mindspeed_mm.configs.config import merge_mm_args - # just inference torch.set_grad_enabled(False) initialize_megatron( - extra_args_provider=mm_extra_args_provider, args_defaults={'tokenizer_type': 'GPT2BPETokenizer'} + extra_args_provider=mm_extra_args_provider, + args_defaults={"tokenizer_type": "GPT2BPETokenizer"}, ) args = get_args() merge_mm_args(args) inference_config = args.mm.model - vlm_pipeline_dict[inference_config.pipeline_class](inference_config)() + + pipeline = vlm_pipeline_dict[inference_config.pipeline_class](inference_config) + + # Redis + assert os.environ[ + "MODEL_NAME" + ], "Environment variable MODEL_NAME was not set. Please set it manually." + model_name = os.environ["MODEL_NAME"] + assert os.environ[ + "REDIS_URL" + ], "Environment variable REDIS_URL was not set. Please set it manually." + redis_url = os.environ["REDIS_URL"] + assert os.environ[ + "REDIS_PORT" + ], "Environment variable REDIS_PORT was not set. Please set it manually." + redis_port = os.environ["REDIS_PORT"] + assert os.environ[ + "REDIS_DB" + ], "Environment variable REDIS_DB was not set. Please set it manually." 
+ redis_db = os.environ["REDIS_DB"] + redis = RedisManager( + host=redis_url, port=redis_port, db=redis_db, model_name=model_name + ) + + batch_size = 1 # Define your batch size + + while True: + try: + batch_requests = redis.fetch_batch_of_requests(batch_size) + except Exception as e: + continue + + if batch_requests: + # Padding to batch size + while len(batch_requests) < batch_size: + batch_requests.append( + { + "query": "", + "sys_prompt": "", + "message_id": "dummy", + } + ) + + inference_and_write(batch_requests, pipeline, redis, model_name) + else: + sleep(0.01) -if __name__ == '__main__': - main() \ No newline at end of file +if __name__ == "__main__": + with torch.inference_mode(): + main() diff --git a/mindspeed_mm/models/qwen2vl_model.py b/mindspeed_mm/models/qwen2vl_model.py index 62263777..0828165f 100644 --- a/mindspeed_mm/models/qwen2vl_model.py +++ b/mindspeed_mm/models/qwen2vl_model.py @@ -269,24 +269,26 @@ class Qwen2VLModel(MultiModalModule): causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(padding_mask, min_dtype) return causal_mask < 0 + def forward( - self, - input_ids: torch.Tensor, - pixel_values: torch.Tensor, - image_grid_thw: torch.Tensor, - attention_mask: torch.Tensor, - labels: Optional[torch.Tensor] = None, - inference_params: Optional[InferenceParams] = None, - decoder_input: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - packed_seq_params: Optional[PackedSeqParams] = None, - extra_block_kwargs: Optional[dict] = None, + self, + input_ids: torch.Tensor, + pixel_values: Optional[torch.Tensor] = None, + image_grid_thw: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + inference_params: Optional[InferenceParams] = None, + decoder_input: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + packed_seq_params: Optional[PackedSeqParams] = None, + extra_block_kwargs: Optional[dict] = None, ) -> Union[Dict[str, torch.Tensor], torch.Tensor]: - - if self.add_image_encoder: + + vit_embeds = None + + if self.add_image_encoder and pixel_values is not None: vit_embeds = self.image_encoder(pixel_values, image_grid_thw) vit_embeds = vit_embeds.reshape(-1, 1, vit_embeds.shape[-1]).clone() - output = vit_embeds else: vit_embeds = self.input_tensor @@ -295,9 +297,12 @@ class Qwen2VLModel(MultiModalModule): if self.text_decoder.pre_process: input_embeds = self.text_decoder.embedding(input_ids=input_ids, position_ids=position_ids).clone() input_embeds = input_embeds.transpose(0, 1) - image_mask = torch.eq(input_ids, self.img_context_token_id).unsqueeze(-1).expand_as(input_embeds) - vit_embeds = vit_embeds[:, 0, :] - input_embeds = input_embeds.masked_scatter(image_mask, vit_embeds) + + if vit_embeds is not None: + image_mask = torch.eq(input_ids, self.img_context_token_id).unsqueeze(-1).expand_as(input_embeds) + vit_embeds = vit_embeds[:, 0, :] + input_embeds = input_embeds.masked_scatter(image_mask, vit_embeds) + input_embeds = input_embeds.transpose(0, 1).clone() past_seen_tokens = 0 diff --git a/mindspeed_mm/tasks/inference/pipeline/qwen2vl_pipeline.py b/mindspeed_mm/tasks/inference/pipeline/qwen2vl_pipeline.py index 6cbcb596..57315a0f 100644 --- a/mindspeed_mm/tasks/inference/pipeline/qwen2vl_pipeline.py +++ b/mindspeed_mm/tasks/inference/pipeline/qwen2vl_pipeline.py @@ -40,11 +40,11 @@ class Qwen2VlPipeline(GenerationMixin): inputs = self.prepare_inputs(prompt=prompt, images=image) - if 
return_ids: - streamer = None - else: + # Use the model as a language model if no valid inputs are generated + if inputs is None: + inputs = {'input_ids': self.tokenizer(prompt, return_tensors="pt").input_ids.to(self.infer_config.device)} - streamer = TextStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True) + streamer = None if return_ids else TextStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True) generated_ids = self.generate(**inputs, do_sample=True if self.generation_config.temperature > 0 else False, @@ -53,7 +53,7 @@ class Qwen2VlPipeline(GenerationMixin): streamer=streamer) if return_ids and generated_ids is not None: generated_ids = [ - output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, generated_ids) + output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs['input_ids'], generated_ids) ] out = self.image_processor.tokenizer.batch_decode( generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False @@ -64,6 +64,9 @@ class Qwen2VlPipeline(GenerationMixin): return None def prepare_inputs(self, prompt=None, images=None, messages=None): + if not images and not messages: + return None + if not messages: messages = [[ { diff --git a/pretrain_qwen2vl.py b/pretrain_qwen2vl.py index 98d217b4..4ab73b3a 100644 --- a/pretrain_qwen2vl.py +++ b/pretrain_qwen2vl.py @@ -27,14 +27,18 @@ def model_provider(pre_process=True, post_process=True): vlm_config.pre_process = pre_process vlm_config.post_process = post_process - vlm_config.image_encoder.vision_encoder = get_model_config(vlm_config.image_encoder.vision_encoder) - vlm_config.image_encoder.vision_projector = get_model_config(vlm_config.image_encoder.vision_projector) - vlm_config.text_decoder = get_model_config(vlm_config.text_decoder) + if vlm_config.image_encoder: + vlm_config.image_encoder.vision_encoder = get_model_config(vlm_config.image_encoder.vision_encoder) + vlm_config.image_encoder.vision_projector = get_model_config(vlm_config.image_encoder.vision_projector) + vlm_config.text_decoder = get_model_config(vlm_config.text_decoder) - model = Qwen2VLModel(vlm_config) + model = Qwen2VLModel(vlm_config) - model.freeze(freeze_image_encoder=getattr(vlm_config.image_encoder.vision_encoder, 'freeze', True), \ - freeze_image_projection=getattr(vlm_config.image_encoder.vision_projector, 'freeze', True)) + model.freeze(freeze_image_encoder=getattr(vlm_config.image_encoder.vision_encoder, 'freeze', True), \ + freeze_image_projection=getattr(vlm_config.image_encoder.vision_projector, 'freeze', True)) + else: + vlm_config.text_decoder = get_model_config(vlm_config.text_decoder) + model = Qwen2VLModel(vlm_config) return model @@ -48,8 +52,12 @@ def get_batch(data_iterator): input_ids = batch['input_ids'].to(torch.cuda.current_device()) labels = batch['labels'].to(torch.cuda.current_device()) attention_mask = batch['attention_mask'].to(torch.cuda.current_device()) - pixel_values = batch['pixel_values'].to(torch.cuda.current_device()) - image_grid_thw = batch['image_grid_thw'].to(torch.cuda.current_device()) + if 'pixel_values' in batch and 'image_grid_thw' in batch: + pixel_values = batch['pixel_values'].to(torch.cuda.current_device()) + image_grid_thw = batch['image_grid_thw'].to(torch.cuda.current_device()) + else: + pixel_values = None + image_grid_thw = None batch = { 'input_ids': input_ids, 'labels': labels, diff --git a/redis_utils.py b/redis_utils.py new file mode 100644 index 00000000..3d6eb621 --- /dev/null +++ b/redis_utils.py @@ -0,0 +1,86 @@ +import 
redis +import json +import time + +from concurrent.futures import ThreadPoolExecutor + + +class RedisManager: + def __init__(self, host, port, db, model_name, ping_interval=10): + self.conn_pool = redis.ConnectionPool(host=host, port=int(port), db=db) + self.last_ping_time = time.time() + self.model_name = model_name + self.lock_prefix = 'lock:' + self.ping_interval = ping_interval + + def _ping(self): + current_time = time.time() + if current_time - self.last_ping_time > self.ping_interval: + redis_conn = redis.Redis(connection_pool=self.conn_pool) + redis_conn.ping() + redis_conn.close() + self.last_ping_time = current_time + + def write_data_to_result(self, model_name, message_id, data): + while True: + try: + self._ping() + break + except Exception as _: + continue + r = redis.Redis(connection_pool=self.conn_pool) + if message_id != "dummy": + r.hset(model_name + "_result", message_id, json.dumps(data)) + r.close() + + def fetch_single_request_without_queue(self): + r = redis.Redis(connection_pool=self.conn_pool) + + request_data = None + + # Directly access the fixed hash + message_ids = r.hkeys(self.model_name) + for message_id in message_ids: + lock_key = self.lock_prefix + message_id.decode("utf-8") + # Try to acquire a lock for the field + if r.setnx(lock_key, 1): # If successfully set, we have acquired the lock + r.expire(lock_key, 10) # Set an expiration time for the lock to avoid deadlocks + try: + result = r.hget(self.model_name, message_id) + if result: + request_data = json.loads(result.decode("utf-8")) + request_data["main_key"] = self.model_name + request_data["message_id"] = message_id.decode("utf-8") + # Delete the field after retrieval + r.hdel(self.model_name, message_id) + finally: + # Release the lock + r.delete(lock_key) + break + + r.close() + + return request_data + + def fetch_batch_of_requests(self, batch_size=4): + while True: + try: + self._ping() + break + except Exception as e: + print("Fail to connect Redis, retry") + continue + + batch = [] + + with ThreadPoolExecutor(max_workers=batch_size) as executor: + futures = [ + executor.submit(self.fetch_single_request_without_queue) for _ in range(batch_size) + ] + + for future in futures: + result = future.result() + if result: + batch.append(result) + + return batch -- Gitee From 5370aaf49ff3af106237e238f7559fc5dc4de32d Mon Sep 17 00:00:00 2001 From: Luo Yiyang Date: Thu, 26 Dec 2024 15:25:26 +0800 Subject: [PATCH 2/4] update --- .../llava_instruct_2_mllm_demo_format.py | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/examples/qwen2vl/llava_instruct_2_mllm_demo_format.py b/examples/qwen2vl/llava_instruct_2_mllm_demo_format.py index 4bbd5a75..8c3b2d7b 100644 --- a/examples/qwen2vl/llava_instruct_2_mllm_demo_format.py +++ b/examples/qwen2vl/llava_instruct_2_mllm_demo_format.py @@ -2,8 +2,9 @@ import json import os import stat -# llava_json_path = "./data/llava_instruct_150k.json" -llava_json_path = "./data/llava_instruct_150k_wo_img.json" +# llava_json_path = "./data/llava_instruct_150k_wo_img.json" +# llava_json_path = "./data/ip.json" +llava_json_path = "./data/full_data.json" mllm_format_json_path = "./data/mllm_format_llava_instruct_data.json" with open(llava_json_path, "r") as f: @@ -12,14 +13,20 @@ with open(llava_json_path, "r") as f: mllm_format_llava_instruct_data = [] for item in info_json: # img_path = os.path.join("./data/COCO2017/train2017", item["image"]) - img_path = os.path.join("./data/dummy", item["image"]) + # img_path = os.path.join("./data/dummy", 
item["image"]) + img_path = os.path.join("./data", item["image"]) if not os.path.exists(img_path): continue - new_item = { - "images": [img_path], - # "images": "", - "messages": [] - } + if not img_path.endswith(".jpg") and not img_path.endswith(".png"): + new_item = { + "images": [], + "messages": [] + } + else: + new_item = { + "images": [img_path], + "messages": [] + } for i, trun in enumerate(item["conversations"]): if trun["from"] == "human": -- Gitee From ab86f3e10c2b02894988b292ac1c8c360ae66bff Mon Sep 17 00:00:00 2001 From: Luo Yiyang Date: Mon, 30 Dec 2024 10:27:35 +0800 Subject: [PATCH 3/4] squash commit --- .gitignore | 8 +- OWNERS | 8 +- README.md | 55 ++- examples/cogvideox/README.md | 114 +++-- .../cogvideox/cogvideox_convert_to_mm_ckpt.py | 188 ++++++-- examples/cogvideox/i2v_1.0/data.json | 45 ++ .../{ => i2v_1.0}/inference_cogvideox_i2v.sh | 2 +- .../{ => i2v_1.0}/inference_model_i2v.json | 1 + .../i2v_1.0/model_cogvideox_i2v.json | 0 .../{ => i2v_1.0}/pretrain_cogvideox_i2v.sh | 10 +- examples/cogvideox/i2v_1.5/data.json | 45 ++ .../model_cogvideox_i2v_1.5.json} | 256 +++++------ .../i2v_1.5/pretrain_cogvideox_i2v.sh | 105 +++++ examples/cogvideox/t2v_1.0/data.json | 45 ++ .../inference_cogvideox_t2v.sh} | 2 +- .../inference_model_t2v.json} | 1 + .../model_cogvideox_t2v.json} | 1 + .../{ => t2v_1.0}/pretrain_cogvideox_t2v.sh | 10 +- examples/cogvideox/t2v_1.5/data.json | 45 ++ .../t2v_1.5/model_cogvideox_t2v_1.5.json | 123 ++++++ .../t2v_1.5/pretrain_cogvideox_t2v_1.5.sh | 105 +++++ examples/diffusers/flux/README.md | 26 +- .../flux/infer_flux_text2img_distrib.py | 63 +++ .../flux/infer_flux_text2img_lora_bf16.py | 1 + examples/diffusers/sd3/README.md | 28 +- .../sd3/infer_sd3_text2img_distrib.py | 75 ++++ examples/diffusers/sdxl/README.md | 38 +- .../sdxl/sdxl_text2img_distrib_infer.py | 72 +++ examples/internvl2/README.md | 82 +++- examples/internvl2/dot_product_attention.py | 179 +++----- examples/internvl2/evaluate_internvl2_8B.sh | 1 - examples/internvl2/finetune_internvl2_2B.sh | 7 +- examples/internvl2/finetune_internvl2_76B.sh | 7 +- examples/internvl2/finetune_internvl2_8B.sh | 7 +- .../internvl2/finetune_internvl2_8B_vpp.sh | 7 +- examples/internvl2/inference_internvl.sh | 7 +- ...to_hg.py => internvl2_convert_mm_to_hf.py} | 49 +-- ...kpt.py => internvl2_convert_to_mm_ckpt.py} | 7 +- examples/llava1.5/README.md | 137 ++---- examples/llava1.5/clip_converter.py | 147 +++++++ examples/llava1.5/evaluate_llava1_5.sh | 2 +- examples/llava1.5/inference_llava1_5.sh | 6 +- examples/llava1.5/pretrain_llava1_5.sh | 10 +- examples/llava1.5/vicuna_converter.py | 135 ++++++ examples/qwen2vl/README.md | 153 ++++--- examples/qwen2vl/data_2b.json | 4 +- examples/qwen2vl/data_72b.json | 4 +- examples/qwen2vl/data_7b.json | 6 +- examples/qwen2vl/dot_product_attention.py | 10 +- examples/qwen2vl/evaluate_qwen2vl_7b.sh | 1 - examples/qwen2vl/finetune_qwen2vl_2b.sh | 3 +- examples/qwen2vl/finetune_qwen2vl_72b.sh | 3 +- examples/qwen2vl/finetune_qwen2vl_7b.sh | 15 +- examples/qwen2vl/inference_qwen2vl_2b.json | 159 +++++++ examples/qwen2vl/inference_qwen2vl_2b.sh | 87 ++++ examples/qwen2vl/inference_qwen2vl_72b.json | 158 +++++++ examples/qwen2vl/inference_qwen2vl_72b.sh | 86 ++++ examples/qwen2vl/inference_qwen2vl_7b.json | 5 +- examples/qwen2vl/inference_qwen2vl_7b.sh | 10 +- .../llava_instruct_2_mllm_demo_format.py | 17 +- examples/qwen2vl/model_2b.json | 4 +- examples/qwen2vl/qwen2vl_convert_pp_to_pp.py | 33 ++ examples/qwen2vl/qwen2vl_convert_to_hf.py | 169 +++---- 
.../qwen2vl/qwen2vl_convert_to_mm_ckpt.py | 87 ++-- inference_vlm.py | 97 +---- .../models/ae/contextparallelcausalvae.py | 53 ++- mindspeed_mm/models/common/attention.py | 19 +- .../common/embeddings/patch_embeddings.py | 41 +- .../common/embeddings/pos_embeddings.py | 24 +- mindspeed_mm/models/common/normalize.py | 20 +- mindspeed_mm/models/common/updownsample.py | 46 +- .../models/diffusion/cogvideo_diffusion.py | 43 +- .../models/diffusion/diffusers_scheduler.py | 3 + mindspeed_mm/models/predictor/dits/sat_dit.py | 412 +++++++++++++----- .../models/predictor/predict_model.py | 28 ++ mindspeed_mm/models/qwen2vl_model.py | 22 +- mindspeed_mm/models/sora_model.py | 38 +- .../evaluation/eval_datasets/__init__.py | 4 +- .../tasks/evaluation/eval_prompt/__init__.py | 1 - .../eval_prompt/build_prompt_base.py | 4 + .../eval_prompt/build_prompt_internvl.py | 3 - .../inference/pipeline/cogvideox_pipeline.py | 6 +- mindspeed_mm/tools/README.md | 1 + mindspeed_mm/tools/profiler.py | 16 +- mindspeed_mm/tools/tools.json | 3 +- redis_utils.py | 86 ---- .../st/shell_scripts/finetune_internvl2_8B.sh | 7 +- tests/st/shell_scripts/finetune_qwen2vl_7B.sh | 1 - .../shell_scripts/inference_qwen2vl_7b_pp1.sh | 3 - .../shell_scripts/inference_qwen2vl_7b_pp4.sh | 3 - tests/st/shell_scripts/pretrain_llava1_5.sh | 6 +- .../embeddings/test_cogvideox_pos_emb.py | 10 + .../diffusion/test_cogvideo_diffusion.py | 121 +++++ 93 files changed, 3221 insertions(+), 1176 deletions(-) create mode 100644 examples/cogvideox/i2v_1.0/data.json rename examples/cogvideox/{ => i2v_1.0}/inference_cogvideox_i2v.sh (95%) rename examples/cogvideox/{ => i2v_1.0}/inference_model_i2v.json (99%) create mode 100644 examples/cogvideox/i2v_1.0/model_cogvideox_i2v.json rename examples/cogvideox/{ => i2v_1.0}/pretrain_cogvideox_i2v.sh (88%) create mode 100644 examples/cogvideox/i2v_1.5/data.json rename examples/cogvideox/{model_cogvideox_i2v.json => i2v_1.5/model_cogvideox_i2v_1.5.json} (91%) create mode 100644 examples/cogvideox/i2v_1.5/pretrain_cogvideox_i2v.sh create mode 100644 examples/cogvideox/t2v_1.0/data.json rename examples/cogvideox/{inference_cogvideox.sh => t2v_1.0/inference_cogvideox_t2v.sh} (95%) rename examples/cogvideox/{inference_model.json => t2v_1.0/inference_model_t2v.json} (99%) rename examples/cogvideox/{model_cogvideox.json => t2v_1.0/model_cogvideox_t2v.json} (99%) rename examples/cogvideox/{ => t2v_1.0}/pretrain_cogvideox_t2v.sh (88%) create mode 100644 examples/cogvideox/t2v_1.5/data.json create mode 100644 examples/cogvideox/t2v_1.5/model_cogvideox_t2v_1.5.json create mode 100644 examples/cogvideox/t2v_1.5/pretrain_cogvideox_t2v_1.5.sh create mode 100644 examples/diffusers/flux/infer_flux_text2img_distrib.py create mode 100644 examples/diffusers/sd3/infer_sd3_text2img_distrib.py create mode 100644 examples/diffusers/sdxl/sdxl_text2img_distrib_infer.py rename examples/internvl2/{internvl2_convert_mm_to_hg.py => internvl2_convert_mm_to_hf.py} (92%) rename examples/internvl2/{internvl_convert_to_mm_ckpt.py => internvl2_convert_to_mm_ckpt.py} (99%) create mode 100644 examples/llava1.5/clip_converter.py create mode 100644 examples/llava1.5/vicuna_converter.py create mode 100644 examples/qwen2vl/inference_qwen2vl_2b.json create mode 100644 examples/qwen2vl/inference_qwen2vl_2b.sh create mode 100644 examples/qwen2vl/inference_qwen2vl_72b.json create mode 100644 examples/qwen2vl/inference_qwen2vl_72b.sh create mode 100644 examples/qwen2vl/qwen2vl_convert_pp_to_pp.py delete mode 100644 redis_utils.py create mode 100644 
tests/ut/models/common/embeddings/test_cogvideox_pos_emb.py create mode 100644 tests/ut/models/diffusion/test_cogvideo_diffusion.py diff --git a/.gitignore b/.gitignore index 03b38668..7aa4dbb4 100644 --- a/.gitignore +++ b/.gitignore @@ -157,10 +157,4 @@ cython_debug/ /tests/st/run_jsons/ /tests/st/run_logs/ -fusion_result.json - -MindSpeed/ -ckpt/ -data/ -dependencies/ -save_dir*/ \ No newline at end of file +fusion_result.json \ No newline at end of file diff --git a/OWNERS b/OWNERS index d1332558..c2a207e7 100644 --- a/OWNERS +++ b/OWNERS @@ -40,4 +40,10 @@ reviewers: - ghoshaw - zzztq - vectorwh -- lu-jinfu1999 \ No newline at end of file +- sunnylee219 +- chenhaihui994 +- mr-lin314 +- hemiuhui +- lu-jinfu1999 +- liyx616 +- xiaoqiao12345 diff --git a/README.md b/README.md index edae6ed9..06ed5f60 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ MindSpeed-MM是面向大规模分布式训练的昇腾多模态大模型套件 ## 🔥🔥🔥Latest News +* [Dec. 19, 2024]: 🎉 MindSpeed-MM生成类模型支持分布式推理 * [Dec. 16, 2024]: 🚀 MindSpeed-MM支持Qihoo-T2X模型 * [Dec. 05, 2024]: 🎉 MindSpeed-MM理解类模型支持Lora微调 * [Dec. 03, 2024]: 🚀 MindSpeed-MM支持SD3.5模型 @@ -39,11 +40,11 @@ MindSpeed-MM是面向大规模分布式训练的昇腾多模态大模型套件 | CogVideoX-T2V | ✔ | | | | CP (Ulysses) | ✔ | ✔ | | | CogVideoX-I2V | ✔ | | | | CP (Ulysses) | ✔ | ✔ | | | Opensora1.2 | | | | | DSP | ✔ | ✔ | | -| OpensoraPlan1.3-T2V | ✔ | ✔ | | | CP (Ulysses) | ✔ | ✔ | | -| OpensoraPlan1.3-I2V | ✔ | ✔ | | | CP (Ulysses) | ✔ | ✔ | | -| InternVL2-2B | | | ✔ | ✔ | | ✔ | ✔ | ✔ | -| InternVL2-8B | | | ✔ | ✔ | | ✔ | ✔ | ✔ | -| InternVL2-76B | | | ✔ | ✔ | | ✔ | ✔ | ✔ | +| OpensoraPlan1.3-T2V | ✔ | ✔ | | ✔ | CP (Ulysses) | ✔ | ✔ | | +| OpensoraPlan1.3-I2V | ✔ | ✔ | | ✔ | CP (Ulysses) | ✔ | ✔ | | +| InternVL2-2B | | | ✔ | ✔ | | ✔ | ✔ | | +| InternVL2-8B | | | ✔ | ✔ | | ✔ | ✔ | | +| InternVL2-76B | | | ✔ | ✔ | | ✔ | ✔ | | | Qwen2VL-2B | | | | ✔ | | ✔ | ✔ | ✔ | | Qwen2VL-7B | | | | ✔ | | ✔ | ✔ | ✔ | | Qwen2VL-72B | | | | ✔ | | ✔ | ✔ | ✔ | @@ -60,6 +61,7 @@ MindSpeed-MM是面向大规模分布式训练的昇腾多模态大模型套件 * Distributed Optimizer: [Zero Redundancy Optimizer](https://arxiv.org/abs/1910.02054) (ZeRO) * Recomputation: Reducing Activation [Recomputation](https://arxiv.org/abs/2205.05198) * LoRA: [Low-Rank Adaptation](https://arxiv.org/abs/2106.09685) + --- ## 研发中的特性与模型 @@ -109,6 +111,8 @@ MindSpeed-MM已发布版本维护策略: Samples per Second 为 (SPS); Frames per Second 为 (FPS); Tokens per Second 为 (TPS) +`亲和场景`为调整少量结构或参数,使得模型更加亲和昇腾,性能更优 + @@ -127,7 +131,7 @@ Samples per Second 为 (SPS); Frames per Second 为 (FPS); Tokens per Second 为 - + @@ -178,23 +182,41 @@ Samples per Second 为 (SPS); Frames per Second 为 (FPS); Tokens per Second 为 - - + + - - + + - - + - - + + + + + + + + + + + + + + + + + + + + + @@ -306,7 +328,7 @@ Samples per Second 为 (SPS); Frames per Second 为 (FPS); Tokens per Second 为 - + @@ -548,7 +570,8 @@ MindSpeed-MM 由华为公司的下列部门联合贡献 : * 华为云 MindSpeed-MM 生态贡献方: -* 奇虎360 + +* 360 AI Research 感谢来自社区的每一个PR,欢迎贡献 MindSpeed-MM diff --git a/examples/cogvideox/README.md b/examples/cogvideox/README.md index 4f7b7ad1..8880879c 100644 --- a/examples/cogvideox/README.md +++ b/examples/cogvideox/README.md @@ -82,13 +82,13 @@ #### 仓库拉取 ```shell - git clone https://gitee.com/ascend/MindSpeed-MM.git - git clone https://github.com/NVIDIA/Megatron-LM.git - cd Megatron-LM - git checkout core_r0.6.0 - cp -r megatron ../MindSpeed-MM/ - cd .. - cd MindSpeed-MM +git clone https://gitee.com/ascend/MindSpeed-MM.git +git clone https://github.com/NVIDIA/Megatron-LM.git +cd Megatron-LM +git checkout core_r0.6.0 +cp -r megatron ../MindSpeed-MM/ +cd .. 
+cd MindSpeed-MM ``` #### 环境搭建 @@ -97,31 +97,31 @@ torch npu 与 CANN包参考链接:[安装包参考链接](https://support.huawei.com/enterprise/zh/ascend-computing/cann-pid-251168373/software) ```bash - # python3.10 - conda create -n test python=3.10 - conda activate test - - # 安装 torch 和 torch_npu,注意要选择对应python版本、x86或arm的torch、torch_npu及apex包 - pip install torch-2.1.0-cp310-cp310m-manylinux2014_aarch64.whl - pip install torch_npu-2.1.0*-cp310-cp310m-linux_aarch64.whl - - # apex for Ascend 参考 https://gitee.com/ascend/apex - pip install apex-0.1_ascend*-cp310-cp310m-linux_aarch64.whl - - # 将shell脚本中的环境变量路径修改为真实路径,下面为参考路径 - source /usr/local/Ascend/ascend-toolkit/set_env.sh - - # 安装加速库 - git clone https://gitee.com/ascend/MindSpeed.git - cd MindSpeed - # checkout commit from MindSpeed core_r0.6.0 - git checkout 5dc1e83b - pip install -r requirements.txt - pip3 install -e . - cd .. - - # 安装其余依赖库 - pip install -e . +# python3.10 +conda create -n test python=3.10 +conda activate test + +# 安装 torch 和 torch_npu,注意要选择对应python版本、x86或arm的torch、torch_npu及apex包 +pip install torch-2.1.0-cp310-cp310m-manylinux2014_aarch64.whl +pip install torch_npu-2.1.0*-cp310-cp310m-linux_aarch64.whl + +# apex for Ascend 参考 https://gitee.com/ascend/apex +pip install apex-0.1_ascend*-cp310-cp310m-linux_aarch64.whl + +# 将shell脚本中的环境变量路径修改为真实路径,下面为参考路径 +source /usr/local/Ascend/ascend-toolkit/set_env.sh + +# 安装加速库 +git clone https://gitee.com/ascend/MindSpeed.git +cd MindSpeed +# checkout commit from MindSpeed core_r0.6.0 +git checkout 5dc1e83b +pip install -r requirements.txt +pip install -e . +cd .. + +# 安装其余依赖库 +pip install -e . ``` #### Decord搭建 @@ -220,27 +220,31 @@ data.jsonl文件内容如下示例: #### 配置参数 -需根据实际任务情况修改`model_cogvideox.json`、`model_cogvideox_i2v.json`和`data.json`中的权重和数据集路径,包括`from_pretrained`、`data_path`、`data_folder`字段。 +需根据实际任务情况修改`model_cogvideox_t2v_t2v.json`、`model_cogvideox_i2v.json`和`data.json`中的权重和数据集路径,包括`from_pretrained`、`data_path`、`data_folder`字段。 -在sh启动脚本中可以修改运行卡数: +`model_cogvideox_t2v.json`/`model_cogvideox_i2v.json`文件中的`head_dim`字段原模型默认配置为64。此字段调整为128会更加亲和昇腾。 + +`model_cogvideox_t2v.json`/`model_cogvideox_i2v.json`文件中的`head_dim`字段原模型默认配置为64。此字段调整为128会更加亲和昇腾。 + +在sh启动脚本中可以修改运行卡数(NNODES为节点数,GPUS_PER_NODE为每个节点的卡数,相乘即为总运行卡数): ```shell - GPUS_PER_NODE=8 - MASTER_ADDR=locahost - MASTER_PORT=29501 - NNODES=1 - NODE_RANK=0 - WORLD_SIZE=$(($GPUS_PER_NODE * $NNODES)) +GPUS_PER_NODE=8 +MASTER_ADDR=locahost +MASTER_PORT=29501 +NNODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($GPUS_PER_NODE * $NNODES)) ``` #### 启动预训练 t2v任务启动预训练 ```shell - bash examples/cogvideox/pretrain_cogvideox_t2v.sh +bash examples/cogvideox/t2v_1.0/pretrain_cogvideox_t2v.sh ``` i2v任务启动预训练 ```shell - bash examples/cogvideox/pretrain_cogvideox_i2v.sh +bash examples/cogvideox/i2v_1.0/pretrain_cogvideox_i2v.sh ``` --- @@ -255,12 +259,20 @@ i2v任务启动预训练 #### 配置参数 -检查如下配置是否完成 +检查对应配置是否完成 + +| t2v配置文件 | 修改字段 | 修改说明 | +|---------------------------------------------------|:--------------------------------:|:-----------------------------------:| +| examples/cogvideox/t2v_*/inference_model_t2v.json | from_pretrained | 修改为下载的权重所对应路径 | +| examples/cogvideox/samples_prompts.txt | 文件内容 | 可自定义自己的prompt,一行为一个prompt | + + +| i2v配置文件 | 修改字段 | 修改说明 | +|---------------------------------------------------|:--------------------------------:|:----------------:| +| examples/cogvideox/i2v_*/inference_model_i2v.json | from_pretrained | 修改为下载的权重所对应路径 | +| examples/cogvideox/samples_i2v_images.txt | 文件内容 | 图片路径 | +| examples/cogvideox/samples_i2v_prompts.txt | 文件内容 | 自定义prompt | -| 配置文件 | 修改字段 | 修改说明 
| -|------|:--------------------------------:|:-----------------------------------:| -| examples/cogvideox/inference_model.json | from_pretrained | 修改为下载的权重所对应路径 | -| examples/cogvideox/samples_prompts.txt | 文件内容 | 可自定义自己的prompt,一行为一个prompt | 如果使用训练后保存的权重进行推理,需要使用脚本进行转换,权重转换source_path参数请配置训练时的保存路径 ```bash @@ -269,9 +281,15 @@ python examples/cogvideox/cogvideox_convert_to_mm_ckpt.py --source_path #### 启动推理 +t2v 启动推理脚本 + +```bash +bash examples/cogvideox/t2v_1.0/inference_cogvideox_t2v.sh +``` +i2v 启动推理脚本 ```bash -bash examples/cogvideox/inference_cogvideox.sh +bash examples/cogvideox/i2v_1.0/inference_cogvideox_i2v.sh ``` --- diff --git a/examples/cogvideox/cogvideox_convert_to_mm_ckpt.py b/examples/cogvideox/cogvideox_convert_to_mm_ckpt.py index 82dd673f..28dad428 100644 --- a/examples/cogvideox/cogvideox_convert_to_mm_ckpt.py +++ b/examples/cogvideox/cogvideox_convert_to_mm_ckpt.py @@ -28,6 +28,16 @@ CONVERT_MAPPING = { "mixins.final_layer.adaLN_modulation.1.bias": "adaLN_modulation.1.bias" } +first_pipeline_stage_keys = ["time_embed.time_embed.0.bias", "time_embed.time_embed.0.weight", + "time_embed.time_embed.2.bias", "time_embed.time_embed.2.weight", + "patch_embed.proj.bias", "patch_embed.proj.weight", + "caption_projection.bias", "caption_projection.weight"] + +last_pipeline_stage_keys = ["norm_final.weight", "norm_final.bias", + "norm_out.weight", "norm_out.bias", + "proj_out.weight", "proj_out.bias", + "adaLN_modulation.1.weight", "adaLN_modulation.1.bias"] + def update_state_dict_inplace( state_dict: Dict[str, Any], @@ -108,12 +118,60 @@ def split_by_tp(state_dict: Dict[str, Any], tp_size: int = 2, num_layers: int = wv = torch.chunk(wv, tp_size, dim=0)[tp_rank] weight = torch.cat([wq, wk, wv], dim=0) new_state_dict[split_name] = weight + # adaLN modulation + col_split_names = [ + "adaLN_modulation.1.weight", + "adaLN_modulation.1.bias", + ] + for split_name in col_split_names: + new_state_dict[split_name] = torch.chunk(state_dict[split_name], tp_size, dim=0)[tp_rank] + # adaLN modulation + col_split_names = [ + "adaLN_modulation.1.weight", + "adaLN_modulation.1.bias", + ] + for split_name in col_split_names: + new_state_dict[split_name] = torch.chunk(state_dict[split_name], tp_size, dim=0)[tp_rank] new_state_dicts.append(new_state_dict) return new_state_dicts -def save_by_tp(state_dicts: List[Dict], save_dir: str, latest_checkpointed_iteration='release'): +def split_by_pp(state_dicts: List[Dict[str, Any]], pp_sizes: List, remove_pos_emb: bool = False) -> Dict[tuple, Dict]: + if len(pp_sizes) == 1: + new_state_dicts = {} + for tp_rank, state_dict in enumerate(state_dicts): + new_state_dicts[(0, tp_rank)] = state_dict + return new_state_dicts + + new_state_dicts = {} + for pp_rank, num_layers in enumerate(pp_sizes): + start_layer_index, end_layer_index = sum(pp_sizes[:pp_rank]), sum(pp_sizes[:pp_rank + 1]) + is_pipeline_first_stage = pp_rank == 0 + is_pipeline_last_stage = pp_rank == len(pp_sizes) - 1 + + for tp_rank, state_dict in enumerate(state_dicts): + pp_tp_param = dict() + + for i in range(start_layer_index, end_layer_index): + layer_names = get_layer_mapping(i).values() + pp_layer_names = get_layer_mapping(i - start_layer_index).values() + + for pp_layer_name, layer_name in zip(pp_layer_names, layer_names): + pp_tp_param[pp_layer_name] = state_dict[layer_name] + + if is_pipeline_first_stage: + for layer_name in first_pipeline_stage_keys: + pp_tp_param[layer_name] = state_dict[layer_name] + if is_pipeline_last_stage: + for layer_name in last_pipeline_stage_keys: + 
pp_tp_param[layer_name] = state_dict[layer_name] + new_state_dicts[(pp_rank, tp_rank)] = pp_tp_param + + return new_state_dicts + + +def save_by_tp_pp(state_dicts: Dict[tuple, Dict], save_dir: str, enable_pp: bool, latest_checkpointed_iteration='release'): if not os.path.exists(save_dir): os.makedirs(save_dir) @@ -126,36 +184,24 @@ def save_by_tp(state_dicts: List[Dict], save_dir: str, latest_checkpointed_itera else: directory = 'iter_{:07d}'.format(latest_checkpointed_iteration) - for tp_rank, state_dict in enumerate(state_dicts): - os.makedirs(os.path.join(save_dir, directory, f"mp_rank_{tp_rank:02d}")) - save_path = os.path.join(save_dir, directory, f"mp_rank_{tp_rank:02d}", "model_optim_rng.pt") + for (pp_rank, tp_rank), state_dict in state_dicts.items(): + if enable_pp: + os.makedirs(os.path.join(save_dir, directory, f"mp_rank_{tp_rank:02d}_{pp_rank:03d}")) + save_path = os.path.join(save_dir, directory, f"mp_rank_{tp_rank:02d}_{pp_rank:03d}", "model_optim_rng.pt") + else: + os.makedirs(os.path.join(save_dir, directory, f"mp_rank_{tp_rank:02d}")) + save_path = os.path.join(save_dir, directory, f"mp_rank_{tp_rank:02d}", "model_optim_rng.pt") save_dict = {} save_dict['model'] = state_dict torch.save(save_dict, save_path) -def merge_by_tp(train_save_dir: str, save_path: str, num_layers: int, tp_size: int): - flags = os.O_RDONLY - mode = stat.S_IRUSR - with os.fdopen(os.open(os.path.join(train_save_dir, "latest_checkpointed_iteration.txt"), flags, mode)) as f: - latest_checkpointed_iteration = f.readline() - - if latest_checkpointed_iteration == 'release': - directory = 'release' - else: - directory = 'iter_{:07d}'.format(latest_checkpointed_iteration) - - _state_dicts = [] - for tp_rank in range(tp_size): - state_dict_path = os.path.join(train_save_dir, directory, f"mp_rank_{tp_rank:02d}", "model_optim_rng.pt") - _state_dicts.append(torch.load(state_dict_path)['model']) - +def merge_by_tp(state_dicts: Dict[str, Any], num_layers: int, tp_size: int, is_last_pp_stage: bool): if tp_size == 1: - torch.save(_state_dicts[0], save_path) - return + return state_dicts[0] - merged_state_dict = copy.deepcopy(_state_dicts[0]) - for index in num_layers: + merged_state_dict = copy.deepcopy(state_dicts[0]) + for index in range(range(num_layers)): # ColumnParallelLinear suffixed_0 = [ f"videodit_blocks.{index}.ff.net.0.proj.weight", @@ -174,22 +220,84 @@ def merge_by_tp(train_save_dir: str, save_path: str, num_layers: int, tp_size: i f"videodit_blocks.{index}.self_atten.proj_qkv.bias" ] for name in suffixed_0: - parameters = [_state_dicts[tp_rank][name] for tp_rank in range(tp_size)] + parameters = [state_dicts[tp_rank][name] for tp_rank in range(tp_size)] parameters = torch.cat(parameters, dim=0) merged_state_dict[name] = parameters for name in suffixed_1: - parameters = [_state_dicts[tp_rank][name] for tp_rank in range(tp_size)] + parameters = [state_dicts[tp_rank][name] for tp_rank in range(tp_size)] parameters = torch.cat(parameters, dim=1) merged_state_dict[name] = parameters for name in suffixed_special: - wq = [torch.chunk(_state_dicts[tp_rank][name], 3, dim=0)[0] for tp_rank in range(tp_size)] - wk = [torch.chunk(_state_dicts[tp_rank][name], 3, dim=0)[1] for tp_rank in range(tp_size)] - wv = [torch.chunk(_state_dicts[tp_rank][name], 3, dim=0)[2] for tp_rank in range(tp_size)] + wq = [torch.chunk(state_dicts[tp_rank][name], 3, dim=0)[0] for tp_rank in range(tp_size)] + wk = [torch.chunk(state_dicts[tp_rank][name], 3, dim=0)[1] for tp_rank in range(tp_size)] + wv = 
[torch.chunk(state_dicts[tp_rank][name], 3, dim=0)[2] for tp_rank in range(tp_size)] wq = torch.cat(wq, dim=0) wk = torch.cat(wk, dim=0) wv = torch.cat(wv, dim=0) wqkv = torch.cat([wq, wk, wv], dim=0) merged_state_dict[name] = wqkv + + if is_last_pp_stage: + # adaLN modulation + col_split_names = [ + "adaLN_modulation.1.weight", + "adaLN_modulation.1.bias", + ] + for split_name in col_split_names: + merged_state_dict[split_name] = torch.cat([state_dicts[tp_rank][split_name] for tp_rank in range(tp_size)]) + return merged_state_dict + + +def merge_by_pp(state_dicts: Dict[str, Any], pp_sizes: list): + if len(pp_sizes) == 1: + return state_dicts[0] + + merged_state_dict = {} + for key in first_pipeline_stage_keys: + merged_state_dict[key] = state_dicts[0][key] + for i, pp_size in enumerate(pp_sizes): + for layer_index in range(pp_size): + pp_layer_names = get_layer_mapping(layer_index).values() + layer_names = get_layer_mapping(layer_index + sum(pp_sizes[:i])).values() + for pp_layer_name, layer_name in zip(pp_layer_names, layer_names): + merged_state_dict[layer_name] = state_dicts[i][pp_layer_name] + for key in last_pipeline_stage_keys: + merged_state_dict[key] = state_dicts[-1][key] + return merged_state_dict + + +def merge_by_tp_pp(train_save_dir: str, save_path: str, tp_size: int, pp_sizes: list): + flags = os.O_RDONLY + mode = stat.S_IRUSR + with os.fdopen(os.open(os.path.join(train_save_dir, "latest_checkpointed_iteration.txt"), flags, mode)) as f: + latest_checkpointed_iteration = f.readline() + + if latest_checkpointed_iteration == 'release': + directory = 'release' + else: + directory = 'iter_{:07d}'.format(latest_checkpointed_iteration) + + _pp_state_dicts = [] + for pp_rank, pp_size in enumerate(pp_sizes): + _tp_state_dicts = [] + for tp_rank in range(tp_size): + if len(pp_sizes) > 1: + state_dict_path = os.path.join(train_save_dir, directory, f"mp_rank_{tp_rank:02d}_{pp_rank:03d}", "model_optim_rng.pt") + else: + state_dict_path = os.path.join(train_save_dir, directory, f"mp_rank_{tp_rank:02d}", "model_optim_rng.pt") + _tp_state_dicts.append(torch.load(state_dict_path)['model']) + is_last_pp_stage = pp_rank == len(pp_sizes) - 1 + merged_tp_state_dict = merge_by_tp(_tp_state_dicts, num_layers=pp_sizes[pp_rank], tp_size=tp_size, is_last_pp_stage=is_last_pp_stage) + _pp_state_dicts.append(merged_tp_state_dict) + merged_state_dict = merge_by_pp(_pp_state_dicts, pp_sizes=pp_sizes) + + # adaLN modulation + col_split_names = [ + "adaLN_modulation.1.weight", + "adaLN_modulation.1.bias", + ] + for split_name in col_split_names: + merged_state_dict[split_name] = torch.cat([_state_dicts[tp_rank][split_name] for tp_rank in range(tp_size)]) torch.save(merged_state_dict, save_path) return @@ -197,11 +305,12 @@ def merge_by_tp(train_save_dir: str, save_path: str, num_layers: int, tp_size: i def get_args(): parser = argparse.ArgumentParser() parser.add_argument("--tp_size", type=int, default=2, help="Tensor model parallel world size") + parser.add_argument("--pp_sizes", type=int, nargs='+', help="Pipeline parallel model split sizes") parser.add_argument("--num_layers", type=int, default=42, help="Layer numbers of video_dit") parser.add_argument("--source_path", type=str, default="./transformer/1/mp_rank_00_model_states.pt", help="Source path of checkpoint") parser.add_argument("--target_path", type=str, default="./ckpt/sat_dit/", help="Save path of MM checkpoint") parser.add_argument("--task", type=str, default="t2v", choices=["t2v", "i2v"], help="Task type") - 
parser.add_argument("--remove_pos_emb", type=bool, default=False, help="remove_pos_emb") + parser.add_argument("--remove_pos_emb", action="store_true", help="remove_pos_emb") parser.add_argument("--mode", type=str, default="split", choices=["split", "merge"], help="Split mode is used to split the pretrained weights according to tp_size before training, \ and Merge mode is used to merge weights based on tp_size after training is completed") @@ -228,12 +337,25 @@ if __name__ == "__main__": remove_layers(source_state_dict, remove_keys) if args.remove_pos_emb: - remove_layers(source_state_dict, ["pos_embed.freq_cos", "pos_embed.freq_sin"]) + remove_layers(source_state_dict, ["pos_embed.freqs_cos", "pos_embed.freqs_sin"]) if args.task == "i2v": remove_layers(source_state_dict, ["pos_embed.pos_embedding"]) + else: + first_pipeline_stage_keys.append("pos_embed.freqs_cos") + first_pipeline_stage_keys.append("pos_embed.freqs_sin") + if args.task == "i2v": + first_pipeline_stage_keys.append("pos_embed.pos_embedding") + + if sum(args.pp_sizes) != args.num_layers: + raise ValueError(f"The sum of args.pp_sizes {args.pp_sizes} must be equal to args.num_layers {args.num_layers}") state_dicts = split_by_tp(source_state_dict, tp_size=args.tp_size, num_layers=args.num_layers) - save_by_tp(state_dicts, args.target_path) + state_dicts = split_by_pp(state_dicts, pp_sizes=args.pp_sizes, remove_pos_emb=args.remove_pos_emb) + save_by_tp_pp(state_dicts, args.target_path, enable_pp=len(args.pp_sizes) > 1) elif args.mode == 'merge': - merge_by_tp(args.source_path, args.target_path, args.num_layers, args.tp_size) \ No newline at end of file + first_pipeline_stage_keys.append("pos_embed.freqs_cos") + first_pipeline_stage_keys.append("pos_embed.freqs_sin") + if args.task == "i2v": + first_pipeline_stage_keys.append("pos_embed.pos_embedding") + merge_by_tp_pp(args.source_path, args.target_path, tp_size=args.tp_size, pp_sizes=args.pp_sizes) \ No newline at end of file diff --git a/examples/cogvideox/i2v_1.0/data.json b/examples/cogvideox/i2v_1.0/data.json new file mode 100644 index 00000000..6ecd74aa --- /dev/null +++ b/examples/cogvideox/i2v_1.0/data.json @@ -0,0 +1,45 @@ +{ + "dataset_param": { + "dataset_type": "t2v", + "use_feature_data": false, + "basic_parameters": { + "data_path": "/data_path/data.jsonl", + "data_folder": "/data_path", + "data_storage_mode": "standard" + }, + "preprocess_parameters": { + "data_process_type": "CogvideoX", + "video_reader_type": "decoder", + "fps": 8, + "skip_frame_num": 3, + "num_frames": 25, + "max_height": 480, + "max_width": 720, + "dataloader_num_workers": 8, + "train_pipeline": { + "video": [], + "image": [] + } + }, + "use_text_processer": true, + "enable_text_preprocessing": false, + "model_max_length": 226, + "tokenizer_config": { + "hub_backend": "hf", + "autotokenizer_name": "T5Tokenizer", + "from_pretrained": "5b-cogvideo/tokenizer" + } + }, + "dataloader_param": { + "dataloader_mode": "sampler", + "sampler_type": "SequentialSampler", + "batch_size": 1, + "num_workers": 8, + "shuffle": true, + "drop_last": true, + "pin_memory": true, + "group_frame": false, + "group_resolution": false, + "collate_param": {} + } +} \ No newline at end of file diff --git a/examples/cogvideox/inference_cogvideox_i2v.sh b/examples/cogvideox/i2v_1.0/inference_cogvideox_i2v.sh similarity index 95% rename from examples/cogvideox/inference_cogvideox_i2v.sh rename to examples/cogvideox/i2v_1.0/inference_cogvideox_i2v.sh index 72b98cc6..9f35528e 100644 --- 
a/examples/cogvideox/inference_cogvideox_i2v.sh +++ b/examples/cogvideox/i2v_1.0/inference_cogvideox_i2v.sh @@ -14,7 +14,7 @@ CP=1 MBS=1 GBS=$(($WORLD_SIZE*$MBS/$CP/$TP)) -MM_MODEL="examples/cogvideox/inference_model_i2v.json" +MM_MODEL="examples/cogvideox/i2v_1.0/inference_model_i2v.json" LOAD_PATH="your_converted_dit_ckpt_dir" DISTRIBUTED_ARGS=" diff --git a/examples/cogvideox/inference_model_i2v.json b/examples/cogvideox/i2v_1.0/inference_model_i2v.json similarity index 99% rename from examples/cogvideox/inference_model_i2v.json rename to examples/cogvideox/i2v_1.0/inference_model_i2v.json index 069e10f1..d7370c59 100644 --- a/examples/cogvideox/inference_model_i2v.json +++ b/examples/cogvideox/i2v_1.0/inference_model_i2v.json @@ -85,6 +85,7 @@ "cross_attention_dim": null, "attention_bias": true, "input_size": [13, 60, 90], + "patch_type": "2D", "patch_size": [1, 2, 2], "activation_fn": "gelu-approximate", "num_embeds_ada_norm": 1000, diff --git a/examples/cogvideox/i2v_1.0/model_cogvideox_i2v.json b/examples/cogvideox/i2v_1.0/model_cogvideox_i2v.json new file mode 100644 index 00000000..e69de29b diff --git a/examples/cogvideox/pretrain_cogvideox_i2v.sh b/examples/cogvideox/i2v_1.0/pretrain_cogvideox_i2v.sh similarity index 88% rename from examples/cogvideox/pretrain_cogvideox_i2v.sh rename to examples/cogvideox/i2v_1.0/pretrain_cogvideox_i2v.sh index 6ad9b68e..7ecc7157 100644 --- a/examples/cogvideox/pretrain_cogvideox_i2v.sh +++ b/examples/cogvideox/i2v_1.0/pretrain_cogvideox_i2v.sh @@ -7,7 +7,7 @@ export COMBINED_ENABLE=1 export CPU_AFFINITY_CONF=1 export HCCL_CONNECT_TIMEOUT=1200 export CUDA_DEVICE_MAX_CONNECTIONS=1 - +export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True GPUS_PER_NODE=8 MASTER_ADDR=localhost MASTER_PORT=29505 @@ -21,8 +21,8 @@ CP=1 MBS=1 GBS=$(($WORLD_SIZE*$MBS/$CP)) -MM_DATA="./examples/cogvideox/data.json" -MM_MODEL="./examples/cogvideox/model_cogvideox_i2v.json" +MM_DATA="./examples/cogvideox/i2v_1.0/data.json" +MM_MODEL="./examples/cogvideox/i2v_1.0/model_cogvideox_i2v.json" MM_TOOL="./mindspeed_mm/tools/tools.json" DISTRIBUTED_ARGS=" @@ -100,5 +100,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_sora.py \ chmod 440 logs/train_${logfile}.log STEP_TIME=`grep "elapsed time per iteration" logs/train_${logfile}.log | awk -F ':' '{print$5}' | awk -F '|' '{print$1}' | head -n 200 | tail -n 100 | awk '{sum+=$1} END {if (NR != 0) printf("%.1f",sum/NR)}'` -FPS=`awk 'BEGIN{printf "%.3f\n", '${GBS}'*1000/'${STEP_TIME}'}'` -echo "Elapsed Time Per iteration: $STEP_TIME, Average FPS: $FPS" \ No newline at end of file +SPS=`awk 'BEGIN{printf "%.3f\n", '${GBS}'*1000/'${STEP_TIME}'}'` +echo "Elapsed Time Per iteration: $STEP_TIME, Average Samples per Second: $SPS" \ No newline at end of file diff --git a/examples/cogvideox/i2v_1.5/data.json b/examples/cogvideox/i2v_1.5/data.json new file mode 100644 index 00000000..b15c5316 --- /dev/null +++ b/examples/cogvideox/i2v_1.5/data.json @@ -0,0 +1,45 @@ +{ + "dataset_param": { + "dataset_type": "t2v", + "use_feature_data": false, + "basic_parameters": { + "data_path": "/data_path/data.jsonl", + "data_folder": "/data_path", + "data_storage_mode": "standard" + }, + "preprocess_parameters": { + "data_process_type": "CogvideoX", + "video_reader_type": "decoder", + "fps": 8, + "skip_frame_num": 3, + "num_frames": 21, + "max_height": 768, + "max_width": 1360, + "dataloader_num_workers": 8, + "train_pipeline": { + "video": [], + "image": [] + } + }, + "use_text_processer": true, + "enable_text_preprocessing": false, + "model_max_length": 224, + 
"tokenizer_config": { + "hub_backend": "hf", + "autotokenizer_name": "T5Tokenizer", + "from_pretrained": "5b-cogvideo/tokenizer" + } + }, + "dataloader_param": { + "dataloader_mode": "sampler", + "sampler_type": "SequentialSampler", + "batch_size": 1, + "num_workers": 8, + "shuffle": true, + "drop_last": true, + "pin_memory": true, + "group_frame": false, + "group_resolution": false, + "collate_param": {} + } +} \ No newline at end of file diff --git a/examples/cogvideox/model_cogvideox_i2v.json b/examples/cogvideox/i2v_1.5/model_cogvideox_i2v_1.5.json similarity index 91% rename from examples/cogvideox/model_cogvideox_i2v.json rename to examples/cogvideox/i2v_1.5/model_cogvideox_i2v_1.5.json index 178672f8..df278af3 100644 --- a/examples/cogvideox/model_cogvideox_i2v.json +++ b/examples/cogvideox/i2v_1.5/model_cogvideox_i2v_1.5.json @@ -1,128 +1,130 @@ -{ - "frames": 25, - "resolution": [480, 720], - "allow_tf32": true, - "allow_internal_format":false, - "load_video_features": false, - "load_text_features": false, - "task": "i2v", - "predictor": { - "model_id": "satdit", - "from_pretrained": null, - "dtype": "bf16", - "num_layers": 42, - "num_heads": 48, - "head_dim": 64, - "in_channels": 32, - "out_channels": 16, - "dropout": 0.0, - "cross_attention_dim": null, - "attention_bias": true, - "input_size": [13, 60, 90], - "patch_size": [1, 2, 2], - "activation_fn": "gelu-approximate", - "num_embeds_ada_norm": 1000, - "norm_type": "qk_ln", - "norm_elementwise_affine": true, - "norm_eps": 1e-5, - "caption_channels": null, - "time_embed_dim": 512, - "text_length": 226, - "text_hidden_size": 4096, - "concat_text_embed": true, - "interpolation_scale": [1.0, 1.0, 1.0], - "learnable_pos_embed": true, - "use_rope": true - }, - "diffusion": { - "model_id": "cogvideo_diffusion", - "sigma_sampler_config": { - "uniform_sampling": true, - "num_idx": 1000, - "discretization_config":{ - "shift_scale": 1.0 - } - }, - "denoiser_config": { - "num_idx": 1000, - "quantize_c_noise": false, - "discretization_config":{ - "shift_scale": 1.0 - } - } - }, - "text_encoder": { - "model_id": "T5", - "hub_backend": "hf", - "from_pretrained": "5b-cogvideo", - "dtype": "bf16", - "load_in_8bit": false, - "low_cpu_mem_usage": true, - "ucg_rate": 0.1 - }, - "ae": { - "model_id": "contextparallelcasualvae", - "from_pretrained": "3d-vae.pt", - "cp_size": 1, - "dtype": "bf16", - "z_channels": 16, - "conv_padding": 0, - "num_res_blocks": 3, - "hidden_size_mult": [1,2,2,4], - "encoder_attention": "", - "encoder_nonlinearity": "swish", - "encoder_conv_in": "ContextParallelCausalConv3d", - "encoder_conv_out": "ContextParallelCausalConv3d", - "encoder_mid_resnet": "ContextParallelResnetBlock3D", - "encoder_resnet_blocks": [ - "ContextParallelResnetBlock3D", - "ContextParallelResnetBlock3D", - "ContextParallelResnetBlock3D", - "ContextParallelResnetBlock3D" - ], - "encoder_spatial_downsample": [ - "DownSample3D", - "DownSample3D", - "DownSample3D", - "" - ], - "encoder_temporal_downsample": [ - "", - "", - "", - "" - ], - "decoder_attention": "", - "decoder_nonlinearity": "swish", - "decoder_conv_in": "ContextParallelCausalConv3d", - "decoder_conv_out": "ContextParallelCausalConv3d", - "decoder_mid_resnet": "ContextParallelResnetBlock3D", - "decoder_resnet_blocks": [ - "ContextParallelResnetBlock3D", - "ContextParallelResnetBlock3D", - "ContextParallelResnetBlock3D", - "ContextParallelResnetBlock3D" - ], - "decoder_spatial_upsample": [ - "", - "Upsample3D", - "Upsample3D", - "Upsample3D" - ], - "decoder_temporal_upsample": [ - "", - 
"", - "", - "" - ], - "encoder_gather_norm": true, - "decoder_gather_norm": true, - "use_quant_layer": false, - "i2v_processor": { - "processor_id": "cogvideox_i2v_processor", - "noised_image_all_concat": false, - "noised_image_dropout": 0.05, - "noised_image_input": true - } - } +{ + "frames": 21, + "resolution": [768, 1360], + "allow_tf32": true, + "allow_internal_format":false, + "load_video_features": false, + "load_text_features": false, + "task": "i2v", + "predictor": { + "model_id": "satdit", + "from_pretrained": null, + "dtype": "bf16", + "num_layers": 2, + "num_heads": 48, + "head_dim": 64, + "in_channels": 32, + "out_channels": 16, + "dropout": 0.0, + "cross_attention_dim": null, + "attention_bias": true, + "input_size": [6, 96, 170], + "patch_type": "3D", + "patch_size": [2, 2, 2], + "activation_fn": "gelu-approximate", + "num_embeds_ada_norm": 1000, + "norm_type": "qk_ln", + "norm_elementwise_affine": true, + "norm_eps": 1e-5, + "caption_channels": null, + "time_embed_dim": 512, + "text_length": 224, + "text_hidden_size": 4096, + "concat_text_embed": true, + "interpolation_scale": [1.0, 1.0, 1.0], + "learnable_pos_embed": true, + "use_rope": true + }, + "diffusion": { + "model_id": "cogvideo_diffusion", + "sigma_sampler_config": { + "uniform_sampling": true, + "num_idx": 1000, + "discretization_config":{ + "shift_scale": 1.0 + } + }, + "denoiser_config": { + "num_idx": 1000, + "quantize_c_noise": false, + "discretization_config":{ + "shift_scale": 1.0 + } + } + }, + "text_encoder": { + "model_id": "T5", + "hub_backend": "hf", + "from_pretrained": "5b-cogvideo", + "dtype": "bf16", + "load_in_8bit": false, + "low_cpu_mem_usage": true, + "ucg_rate": 0.1, + "use_attention_mask": false + }, + "ae": { + "model_id": "contextparallelcasualvae", + "from_pretrained": "3d-vae.pt", + "cp_size": 1, + "dtype": "bf16", + "z_channels": 16, + "conv_padding": 0, + "num_res_blocks": 3, + "hidden_size_mult": [1,2,2,4], + "encoder_attention": "", + "encoder_nonlinearity": "swish", + "encoder_conv_in": "ContextParallelCausalConv3d", + "encoder_conv_out": "ContextParallelCausalConv3d", + "encoder_mid_resnet": "ContextParallelResnetBlock3D", + "encoder_resnet_blocks": [ + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D" + ], + "encoder_spatial_downsample": [ + "DownSample3D", + "DownSample3D", + "DownSample3D", + "" + ], + "encoder_temporal_downsample": [ + "", + "", + "", + "" + ], + "decoder_attention": "", + "decoder_nonlinearity": "swish", + "decoder_conv_in": "ContextParallelCausalConv3d", + "decoder_conv_out": "ContextParallelCausalConv3d", + "decoder_mid_resnet": "ContextParallelResnetBlock3D", + "decoder_resnet_blocks": [ + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D" + ], + "decoder_spatial_upsample": [ + "", + "Upsample3D", + "Upsample3D", + "Upsample3D" + ], + "decoder_temporal_upsample": [ + "", + "", + "", + "" + ], + "encoder_gather_norm": true, + "decoder_gather_norm": true, + "use_quant_layer": false, + "i2v_processor": { + "processor_id": "cogvideox_i2v_processor", + "noised_image_all_concat": false, + "noised_image_dropout": 0.05, + "noised_image_input": true + } + } } \ No newline at end of file diff --git a/examples/cogvideox/i2v_1.5/pretrain_cogvideox_i2v.sh b/examples/cogvideox/i2v_1.5/pretrain_cogvideox_i2v.sh new file mode 100644 index 00000000..820299fa --- /dev/null +++ 
b/examples/cogvideox/i2v_1.5/pretrain_cogvideox_i2v.sh @@ -0,0 +1,105 @@ +#!/bin/bash +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export ASCEND_SLOG_PRINT_TO_STDOUT=0 +export ASCEND_GLOBAL_LOG_LEVEL=3 +export TASK_QUEUE_ENABLE=1 +export COMBINED_ENABLE=1 +export CPU_AFFINITY_CONF=1 +export HCCL_CONNECT_TIMEOUT=1200 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True + +GPUS_PER_NODE=8 +MASTER_ADDR=localhost +MASTER_PORT=29505 +NNODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +TP=1 +PP=1 +CP=1 +MBS=1 +GBS=$(($WORLD_SIZE*$MBS/$CP)) + +MM_DATA="./examples/cogvideox/i2v_1.5/data.json" +MM_MODEL="./examples/cogvideox/i2v_1.5/model_cogvideox_i2v_1.5.json" +MM_TOOL="./mindspeed_mm/tools/tools.json" + +DISTRIBUTED_ARGS=" + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT +" + +GPT_ARGS=" + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --context-parallel-size ${CP} \ + --micro-batch-size ${MBS} \ + --global-batch-size ${GBS} \ + --num-layers 1 \ + --hidden-size 3072 \ + --num-attention-heads 48 \ + --seq-length 24 \ + --max-position-embeddings 24 \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --tokenizer-type NullTokenizer \ + --vocab-size 0 \ + --position-embedding-type rope \ + --rotary-base 500000 \ + --swiglu \ + --no-masked-softmax-fusion \ + --lr 1e-5 \ + --min-lr 1e-5 \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr-decay-style constant \ + --weight-decay 1e-4 \ + --lr-warmup-init 1e-4 \ + --lr-warmup-iters 0 \ + --clip-grad 1.0 \ + --train-iters 5000 \ + --no-gradient-accumulation-fusion \ + --no-load-optim \ + --no-load-rng \ + --no-save-optim \ + --no-save-rng \ + --bf16 \ + --recompute-granularity full \ + --recompute-method block \ + --recompute-num-layers 42 \ + --use-distributed-optimizer \ + --overlap-grad-reduce \ + --overlap-param-gather +" + +MM_ARGS=" + --mm-data $MM_DATA \ + --mm-model $MM_MODEL \ + --mm-tool $MM_TOOL +" + +OUTPUT_ARGS=" + --log-interval 1 \ + --save-interval 10000 \ + --eval-interval 10000 \ + --eval-iters 10 \ +" + +logfile=$(date +%Y%m%d)_$(date +%H%M%S) +mkdir -p logs +torchrun $DISTRIBUTED_ARGS pretrain_sora.py \ + $GPT_ARGS \ + $MM_ARGS \ + $OUTPUT_ARGS \ + --distributed-backend nccl >> logs/train_${logfile}.log 2>&1 + +chmod 440 logs/train_${logfile}.log +STEP_TIME=`grep "elapsed time per iteration" logs/train_${logfile}.log | awk -F ':' '{print$5}' | awk -F '|' '{print$1}' | head -n 200 | tail -n 100 | awk '{sum+=$1} END {if (NR != 0) printf("%.1f",sum/NR)}'` +SPS=`awk 'BEGIN{printf "%.3f\n", '${GBS}'*1000/'${STEP_TIME}'}'` +echo "Elapsed Time Per iteration: $STEP_TIME, Average Samples per Second: $SPS" \ No newline at end of file diff --git a/examples/cogvideox/t2v_1.0/data.json b/examples/cogvideox/t2v_1.0/data.json new file mode 100644 index 00000000..6ecd74aa --- /dev/null +++ b/examples/cogvideox/t2v_1.0/data.json @@ -0,0 +1,45 @@ +{ + "dataset_param": { + "dataset_type": "t2v", + "use_feature_data": false, + "basic_parameters": { + "data_path": "/data_path/data.jsonl", + "data_folder": "/data_path", + "data_storage_mode": "standard" + }, + "preprocess_parameters": { + "data_process_type": "CogvideoX", + "video_reader_type": "decoder", + "fps": 8, + "skip_frame_num": 3, + "num_frames": 25, + "max_height": 480, + "max_width": 720, + "dataloader_num_workers": 8, + "train_pipeline": { + "video": [], + "image": [] + } + }, + 
"use_text_processer": true, + "enable_text_preprocessing": false, + "model_max_length": 226, + "tokenizer_config": { + "hub_backend": "hf", + "autotokenizer_name": "T5Tokenizer", + "from_pretrained": "5b-cogvideo/tokenizer" + } + }, + "dataloader_param": { + "dataloader_mode": "sampler", + "sampler_type": "SequentialSampler", + "batch_size": 1, + "num_workers": 8, + "shuffle": true, + "drop_last": true, + "pin_memory": true, + "group_frame": false, + "group_resolution": false, + "collate_param": {} + } +} \ No newline at end of file diff --git a/examples/cogvideox/inference_cogvideox.sh b/examples/cogvideox/t2v_1.0/inference_cogvideox_t2v.sh similarity index 95% rename from examples/cogvideox/inference_cogvideox.sh rename to examples/cogvideox/t2v_1.0/inference_cogvideox_t2v.sh index 01a3cbbc..5204ddbf 100644 --- a/examples/cogvideox/inference_cogvideox.sh +++ b/examples/cogvideox/t2v_1.0/inference_cogvideox_t2v.sh @@ -14,7 +14,7 @@ CP=1 MBS=1 GBS=$(($WORLD_SIZE*$MBS/$CP/$TP)) -MM_MODEL="examples/cogvideox/inference_model.json" +MM_MODEL="examples/cogvideox/t2v_1.0/inference_model_t2v.json" LOAD_PATH="your_converted_dit_ckpt_dir" DISTRIBUTED_ARGS=" diff --git a/examples/cogvideox/inference_model.json b/examples/cogvideox/t2v_1.0/inference_model_t2v.json similarity index 99% rename from examples/cogvideox/inference_model.json rename to examples/cogvideox/t2v_1.0/inference_model_t2v.json index 1927114c..8dfdb4e4 100644 --- a/examples/cogvideox/inference_model.json +++ b/examples/cogvideox/t2v_1.0/inference_model_t2v.json @@ -85,6 +85,7 @@ "cross_attention_dim": null, "attention_bias": true, "input_size": [13, 60, 90], + "patch_type": "2D", "patch_size": [1, 2, 2], "activation_fn": "gelu-approximate", "num_embeds_ada_norm": 1000, diff --git a/examples/cogvideox/model_cogvideox.json b/examples/cogvideox/t2v_1.0/model_cogvideox_t2v.json similarity index 99% rename from examples/cogvideox/model_cogvideox.json rename to examples/cogvideox/t2v_1.0/model_cogvideox_t2v.json index 367f3f06..cfe12471 100644 --- a/examples/cogvideox/model_cogvideox.json +++ b/examples/cogvideox/t2v_1.0/model_cogvideox_t2v.json @@ -37,6 +37,7 @@ "model_id": "cogvideo_diffusion", "sigma_sampler_config": { "uniform_sampling": true, + "group_num": 8, "num_idx": 1000, "discretization_config":{ "shift_scale": 1.0 diff --git a/examples/cogvideox/pretrain_cogvideox_t2v.sh b/examples/cogvideox/t2v_1.0/pretrain_cogvideox_t2v.sh similarity index 88% rename from examples/cogvideox/pretrain_cogvideox_t2v.sh rename to examples/cogvideox/t2v_1.0/pretrain_cogvideox_t2v.sh index 588aecb7..5226611a 100644 --- a/examples/cogvideox/pretrain_cogvideox_t2v.sh +++ b/examples/cogvideox/t2v_1.0/pretrain_cogvideox_t2v.sh @@ -7,6 +7,7 @@ export COMBINED_ENABLE=1 export CPU_AFFINITY_CONF=1 export HCCL_CONNECT_TIMEOUT=1200 export CUDA_DEVICE_MAX_CONNECTIONS=1 +export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True GPUS_PER_NODE=8 MASTER_ADDR=localhost @@ -21,8 +22,8 @@ CP=1 MBS=1 GBS=$(($WORLD_SIZE*$MBS/$CP)) -MM_DATA="./examples/cogvideox/data.json" -MM_MODEL="./examples/cogvideox/model_cogvideox.json" +MM_DATA="./examples/cogvideox/t2v_1.0/data.json" +MM_MODEL="./examples/cogvideox/t2v_1.0/model_cogvideox_t2v.json" MM_TOOL="./mindspeed_mm/tools/tools.json" DISTRIBUTED_ARGS=" @@ -69,6 +70,7 @@ GPT_ARGS=" --no-save-optim \ --no-save-rng \ --bf16 \ + --qk-layernorm \ --recompute-granularity full \ --recompute-method block \ --recompute-num-layers 42 \ @@ -100,5 +102,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_sora.py \ chmod 440 
logs/train_${logfile}.log STEP_TIME=`grep "elapsed time per iteration" logs/train_${logfile}.log | awk -F ':' '{print$5}' | awk -F '|' '{print$1}' | head -n 200 | tail -n 100 | awk '{sum+=$1} END {if (NR != 0) printf("%.1f",sum/NR)}'` -FPS=`awk 'BEGIN{printf "%.3f\n", '${GBS}'*1000/'${STEP_TIME}'}'` -echo "Elapsed Time Per iteration: $STEP_TIME, Average FPS: $FPS" \ No newline at end of file +SPS=`awk 'BEGIN{printf "%.3f\n", '${GBS}'*1000/'${STEP_TIME}'}'` +echo "Elapsed Time Per iteration: $STEP_TIME, Average Samples per Second: $SPS" \ No newline at end of file diff --git a/examples/cogvideox/t2v_1.5/data.json b/examples/cogvideox/t2v_1.5/data.json new file mode 100644 index 00000000..b15c5316 --- /dev/null +++ b/examples/cogvideox/t2v_1.5/data.json @@ -0,0 +1,45 @@ +{ + "dataset_param": { + "dataset_type": "t2v", + "use_feature_data": false, + "basic_parameters": { + "data_path": "/data_path/data.jsonl", + "data_folder": "/data_path", + "data_storage_mode": "standard" + }, + "preprocess_parameters": { + "data_process_type": "CogvideoX", + "video_reader_type": "decoder", + "fps": 8, + "skip_frame_num": 3, + "num_frames": 21, + "max_height": 768, + "max_width": 1360, + "dataloader_num_workers": 8, + "train_pipeline": { + "video": [], + "image": [] + } + }, + "use_text_processer": true, + "enable_text_preprocessing": false, + "model_max_length": 224, + "tokenizer_config": { + "hub_backend": "hf", + "autotokenizer_name": "T5Tokenizer", + "from_pretrained": "5b-cogvideo/tokenizer" + } + }, + "dataloader_param": { + "dataloader_mode": "sampler", + "sampler_type": "SequentialSampler", + "batch_size": 1, + "num_workers": 8, + "shuffle": true, + "drop_last": true, + "pin_memory": true, + "group_frame": false, + "group_resolution": false, + "collate_param": {} + } +} \ No newline at end of file diff --git a/examples/cogvideox/t2v_1.5/model_cogvideox_t2v_1.5.json b/examples/cogvideox/t2v_1.5/model_cogvideox_t2v_1.5.json new file mode 100644 index 00000000..33e6059c --- /dev/null +++ b/examples/cogvideox/t2v_1.5/model_cogvideox_t2v_1.5.json @@ -0,0 +1,123 @@ +{ + "frames": 21, + "resolution": [768, 1360], + "allow_tf32": true, + "allow_internal_format":false, + "load_video_features": false, + "load_text_features": false, + "task": "t2v", + "predictor": { + "model_id": "satdit", + "from_pretrained": "mmdit.pt", + "dtype": "bf16", + "num_layers": 2, + "num_heads": 48, + "head_dim": 64, + "in_channels": 16, + "out_channels": 16, + "dropout": 0.0, + "cross_attention_dim": null, + "attention_bias": true, + "input_size": [6, 96, 170], + "patch_type": "3D", + "patch_size": [2, 2, 2], + "activation_fn": "gelu-approximate", + "num_embeds_ada_norm": 1000, + "norm_type": "qk_ln", + "norm_elementwise_affine": true, + "norm_eps": 1e-5, + "caption_channels": null, + "time_embed_dim": 512, + "text_length": 224, + "text_hidden_size": 4096, + "concat_text_embed": true, + "interpolation_scale": [1.0, 1.0, 1.0], + "use_rope": true + }, + "diffusion": { + "model_id": "cogvideo_diffusion", + "sigma_sampler_config": { + "uniform_sampling": true, + "num_idx": 1000, + "discretization_config":{ + "shift_scale": 1.0 + } + }, + "denoiser_config": { + "num_idx": 1000, + "quantize_c_noise": false, + "discretization_config":{ + "shift_scale": 1.0 + } + } + }, + "text_encoder": { + "model_id": "T5", + "hub_backend": "hf", + "from_pretrained": "5b-cogvideo", + "dtype": "bf16", + "load_in_8bit": false, + "low_cpu_mem_usage": true, + "ucg_rate": 0.1, + "use_attention_mask": false + }, + "ae": { + "model_id": 
"contextparallelcasualvae", + "from_pretrained": "3d-vae.pt", + "cp_size": 1, + "dtype": "bf16", + "z_channels": 16, + "conv_padding": 0, + "num_res_blocks": 3, + "hidden_size_mult": [1,2,2,4], + "encoder_attention": "", + "encoder_nonlinearity": "swish", + "encoder_conv_in": "ContextParallelCausalConv3d", + "encoder_conv_out": "ContextParallelCausalConv3d", + "encoder_mid_resnet": "ContextParallelResnetBlock3D", + "encoder_resnet_blocks": [ + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D" + ], + "encoder_spatial_downsample": [ + "DownSample3D", + "DownSample3D", + "DownSample3D", + "" + ], + "encoder_temporal_downsample": [ + "", + "", + "", + "" + ], + "decoder_attention": "", + "decoder_nonlinearity": "swish", + "decoder_conv_in": "ContextParallelCausalConv3d", + "decoder_conv_out": "ContextParallelCausalConv3d", + "decoder_mid_resnet": "ContextParallelResnetBlock3D", + "decoder_resnet_blocks": [ + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D" + ], + "decoder_spatial_upsample": [ + "", + "Upsample3D", + "Upsample3D", + "Upsample3D" + ], + "decoder_temporal_upsample": [ + "", + "", + "", + "" + ], + "encoder_gather_norm": true, + "decoder_gather_norm": true, + "use_quant_layer": false + } +} diff --git a/examples/cogvideox/t2v_1.5/pretrain_cogvideox_t2v_1.5.sh b/examples/cogvideox/t2v_1.5/pretrain_cogvideox_t2v_1.5.sh new file mode 100644 index 00000000..3aa1c375 --- /dev/null +++ b/examples/cogvideox/t2v_1.5/pretrain_cogvideox_t2v_1.5.sh @@ -0,0 +1,105 @@ +#!/bin/bash +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export ASCEND_SLOG_PRINT_TO_STDOUT=0 +export ASCEND_GLOBAL_LOG_LEVEL=3 +export TASK_QUEUE_ENABLE=1 +export COMBINED_ENABLE=1 +export CPU_AFFINITY_CONF=1 +export HCCL_CONNECT_TIMEOUT=1200 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True + +GPUS_PER_NODE=8 +MASTER_ADDR=localhost +MASTER_PORT=29505 +NNODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +TP=1 +PP=1 +CP=1 +MBS=1 +GBS=$(($WORLD_SIZE*$MBS/$CP)) + +MM_DATA="./examples/cogvideox/t2v_1.5/data.json" +MM_MODEL="./examples/cogvideox/t2v_1.5/model_cogvideox_t2v_1.5.json" +MM_TOOL="./mindspeed_mm/tools/tools.json" + +DISTRIBUTED_ARGS=" + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT +" + +GPT_ARGS=" + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --context-parallel-size ${CP} \ + --micro-batch-size ${MBS} \ + --global-batch-size ${GBS} \ + --num-layers 1 \ + --hidden-size 3072 \ + --num-attention-heads 48 \ + --seq-length 24 \ + --max-position-embeddings 24 \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --tokenizer-type NullTokenizer \ + --vocab-size 0 \ + --position-embedding-type rope \ + --rotary-base 500000 \ + --swiglu \ + --no-masked-softmax-fusion \ + --lr 1e-4 \ + --min-lr 1e-4 \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr-decay-style constant \ + --weight-decay 1e-2 \ + --lr-warmup-init 1e-4 \ + --lr-warmup-iters 500 \ + --clip-grad 1.0 \ + --train-iters 5000 \ + --no-gradient-accumulation-fusion \ + --no-load-optim \ + --no-load-rng \ + --no-save-optim \ + --no-save-rng \ + --bf16 \ + --recompute-granularity full \ + --recompute-method block \ + --recompute-num-layers 42 \ + --use-distributed-optimizer \ + --overlap-grad-reduce \ + 
--overlap-param-gather +" + +MM_ARGS=" + --mm-data $MM_DATA \ + --mm-model $MM_MODEL \ + --mm-tool $MM_TOOL +" + +OUTPUT_ARGS=" + --log-interval 1 \ + --save-interval 10000 \ + --eval-interval 10000 \ + --eval-iters 10 \ +" + +logfile=$(date +%Y%m%d)_$(date +%H%M%S) +mkdir -p logs +torchrun $DISTRIBUTED_ARGS pretrain_sora.py \ + $GPT_ARGS \ + $MM_ARGS \ + $OUTPUT_ARGS \ + --distributed-backend nccl >> logs/train_${logfile}.log 2>&1 + +chmod 440 logs/train_${logfile}.log +STEP_TIME=`grep "elapsed time per iteration" logs/train_${logfile}.log | awk -F ':' '{print$5}' | awk -F '|' '{print$1}' | head -n 200 | tail -n 100 | awk '{sum+=$1} END {if (NR != 0) printf("%.1f",sum/NR)}'` +SPS=`awk 'BEGIN{printf "%.3f\n", '${GBS}'*1000/'${STEP_TIME}'}'` +echo "Elapsed Time Per iteration: $STEP_TIME, Average Samples per Second: $SPS" \ No newline at end of file diff --git a/examples/diffusers/flux/README.md b/examples/diffusers/flux/README.md index c33281ac..d6efbc65 100644 --- a/examples/diffusers/flux/README.md +++ b/examples/diffusers/flux/README.md @@ -264,7 +264,7 @@ ``` - 在文件上方的import栏增加`DistributedType`在`from accelerate import Acceleratore`后 (30行附近) - - 在`if accelerator.is_main_process`后增加 `or accelerator.distributed_type == DistributedType.DEEPSPEED` (1669/1788行附近) + - 在`if accelerator.is_main_process`后增加 `or accelerator.distributed_type == DistributedType.DEEPSPEED` (1669/1788行附近),并在`if args.checkpoints_total_limit is not None`后增加`and accelerator.is_main_process` ```python from accelerate import Accelerator, DistributedType @@ -272,6 +272,8 @@ if accelerator.is_main_process or accelerator.distributed_type == DistributedType.DEEPSPEED: # if accelerator.is_main_process: # 原代码 + if global_step % args.checkpointing_steps == 0: # 原代码 不进行修改 + if args.checkpoints_total_limit is not None and accelerator.is_main_process: # 添加 ``` Lora任务需调用patch任务进行权重保存: @@ -391,6 +393,28 @@ vim infer_flux_text2img_bf16.py # 进入运行推理的Python文件 ```shell python infer_flux_text2img_lora_bf16.py ``` + + 【分布式推理】 + + ```shell + vim infer_flux_text2img_distrib.py + ``` + +- 修改模型权重路径 model_path为模型权重路径或微调后的权重路径 +- 如lora微调 可将lora_weights修改为Lora权重路径 + + ```python + model_path = "/black-forest-labs/FLUX.1-dev" # 模型权重/微调权重路径 + lora_weights = "/pytorch_lora_weights.safetensors" # Lora权重路径 + ``` + +- 启动分布式推理脚本 + + - 因使用accelerate进行分布式推理,config可设置:`--num_processes=卡数`,`num_machines=机器数`等 + + ```shell + accelerate launch --num_processes=4 infer_flux_text2img_distrib.py # 单机四卡进行分布式推理 + ``` diff --git a/examples/diffusers/flux/infer_flux_text2img_distrib.py b/examples/diffusers/flux/infer_flux_text2img_distrib.py new file mode 100644 index 00000000..680aadd1 --- /dev/null +++ b/examples/diffusers/flux/infer_flux_text2img_distrib.py @@ -0,0 +1,63 @@ +import os + +import torch +from accelerate import PartialState +from diffusers import FluxPipeline + +output_path = "./flux_lora_NPU" +os.makedirs(output_path, exist_ok=True) + +MODEL_PATH = "/black-forest-labs/FLUX.1-dev" # FLUX模型路径 +LORA_WEIGHTS = "./output/pytorch_lora_weights.safetensors" # LoRA权重路径 +pipe = FluxPipeline.from_pretrained( + MODEL_PATH, torch_dtype=torch.bfloat16, local_files_only=True +) + +if os.path.exists(LORA_WEIGHTS): + print(f"Loading LoRA weights from {LORA_WEIGHTS}") + pipe.load_lora_weights(LORA_WEIGHTS) +else: + print("LoRA weights not found. 
Using the base model") + +distributed_state = PartialState() +pipe.to(distributed_state.device) + +PROMPTS = [ + "masterpiece, best quality, Cute dragon creature, pokemon style, night, moonlight, dim lighting", + "masterpiece, best quality, Pikachu walking in beijing city, pokemon style, night, moonlight, dim lighting", + "masterpiece, best quality, red panda , pokemon style, evening light, sunset, rim lighting", + "masterpiece, best quality, Photo of (Lion:1.2) on a couch, flower in vase, dof, film grain, crystal clear, pokemon style, dark studio", + "masterpiece, best quality, siberian cat pokemon on river, pokemon style, evening light, sunset, rim lighting, depth of field", + "masterpiece, best quality, pig, Exquisite City, (sky:1.3), (Miniature tree:1.3), Miniature object, many flowers, glowing mushrooms, (creek:1.3), lots of fruits, cute colorful animal protagonist, Firefly, meteor, Colorful cloud, pokemon style, Complicated background, rainbow,", + "masterpiece, best quality, (pokemon), a cute pikachu, girl with glasses, (masterpiece, top quality, best quality, official art, beautiful and aesthetic:1.2),", + "masterpiece, best quality, sugimori ken \(style\), (pokemon \(creature\)), pokemon electric type, grey and yellow skin, mechanical arms, cyberpunk city background, night, neon light", +] +# 设置随机数种子 +seed_list = [8, 23, 42, 1334] + +for i in seed_list: + generator = torch.Generator(device="npu").manual_seed(i) + + with distributed_state.split_between_processes(PROMPTS) as prompts: + for prompt in prompts: + image = pipe( + prompt=prompt, + generator=generator, + num_inference_steps=28, + height=1024, + width=1024, + guidance_scale=1.0, + ).images + + # Create name for the image + prompt_words = prompt.replace("masterpiece, best quality, ", "").split()[:3] + prompt_abbr = "_".join(prompt_words) + + filename = ( + f"{prompt_abbr}_seed{i}_rank{distributed_state.process_index}.png" + ) + filename = "".join( + c for c in filename if c.isalnum() or c in "._-" + ) # remove special chars + + image[0].save(f"{output_path}/{filename}") diff --git a/examples/diffusers/flux/infer_flux_text2img_lora_bf16.py b/examples/diffusers/flux/infer_flux_text2img_lora_bf16.py index eb4d78fd..0336e14a 100644 --- a/examples/diffusers/flux/infer_flux_text2img_lora_bf16.py +++ b/examples/diffusers/flux/infer_flux_text2img_lora_bf16.py @@ -1,4 +1,5 @@ import os + import torch from diffusers import AutoPipelineForText2Image diff --git a/examples/diffusers/sd3/README.md b/examples/diffusers/sd3/README.md index e32829e1..e4c43273 100644 --- a/examples/diffusers/sd3/README.md +++ b/examples/diffusers/sd3/README.md @@ -237,7 +237,7 @@ torch npu 与 CANN包参考链接:[安装包参考链接](https://support.huaw ``` - 在文件上方的import栏增加`DistributedType`在`from accelerate import Acceleratore`后 (30行附近),并增加patch引用`from patch_sd3 import create_save_model_hook` - - 在`if accelerator.is_main_process`后增加 `or accelerator.distributed_type == DistributedType.DEEPSPEED`(dreambooth在1681行附近,lora在1833行附近) + - 在`if accelerator.is_main_process`后增加 `or accelerator.distributed_type == DistributedType.DEEPSPEED`(dreambooth在1681行附近,lora在1833行附近),并在`if args.checkpoints_total_limit is not None`后增加`and accelerator.is_main_process` ```python from accelerate import Accelerator, DistributedType @@ -247,6 +247,8 @@ torch npu 与 CANN包参考链接:[安装包参考链接](https://support.huaw if accelerator.is_main_process or accelerator.distributed_type == DistributedType.DEEPSPEED: # if accelerator.is_main_process: # 原代码 1681/1833行附近 + if global_step % args.checkpointing_steps == 0: # 原代码 
不进行修改 + if args.checkpoints_total_limit is not None and accelerator.is_main_process: # 添加 ``` Lora任务需调用patch任务进行权重保存: @@ -360,7 +362,7 @@ vim infer_sd3_img2img.py # 进入运行I2I推理的Python文件 python infer_sd3_img2img.py # 单卡推理,图生图 ``` - 【lora微调FLUX模型推理】 + 【lora微调SD3模型推理】 ```shell vim infer_sd3_text2img_lora.py @@ -379,6 +381,28 @@ vim infer_sd3_img2img.py # 进入运行I2I推理的Python文件 python infer_sd3_text2img_lora.py ``` + 【分布式推理】 + + ```shell + vim infer_sd3_text2img_distrib.py + ``` + +- 修改模型权重路径 model_path为模型权重路径或微调后的权重路径 +- 如lora微调 可将lora_weights修改为Lora权重路径 + + ```python + model_path = "stabilityai/stable-diffusion-3.5-large" # 模型权重/微调权重路径 + lora_weights = "/pytorch_lora_weights.safetensors" # Lora权重路径 + ``` + +- 启动分布式推理脚本 + + - 因使用accelerate进行分布式推理,config可设置:`--num_processes=卡数`,`num_machines=机器数`等 + + ```shell + accelerate launch --num_processes=4 infer_sd3_text2img_distrib.py # 单机四卡进行分布式推理 + ``` + ## 使用基线数据集进行评估 ## 引用 diff --git a/examples/diffusers/sd3/infer_sd3_text2img_distrib.py b/examples/diffusers/sd3/infer_sd3_text2img_distrib.py new file mode 100644 index 00000000..102480e7 --- /dev/null +++ b/examples/diffusers/sd3/infer_sd3_text2img_distrib.py @@ -0,0 +1,75 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# Copyright 2024 Stability AI, The HuggingFace Team and The InstantX Team. All rights reserved. +# Copyright 2024 Stability AI and The HuggingFace Team + +import os + +import torch +from accelerate import PartialState +from diffusers import StableDiffusion3Pipeline + +output_path = "./infer_result_lora" +os.makedirs(output_path, exist_ok=True) + +MODEL_PATH = "stabilityai/stable-diffusion-3.5-large" # 模型路径 +LORA_WEIGHTS = "./output/pytorch_lora_weights.safetensors" # LoRA权重路径 +DTYPE = torch.float16 # 混精模式 + +pipe = StableDiffusion3Pipeline.from_pretrained( + MODEL_PATH, + torch_dtype=DTYPE, + local_files_only=True, +) + +if os.path.exists(LORA_WEIGHTS): + print(f"Loading LoRA weights from {LORA_WEIGHTS}") + pipe.load_lora_weights(LORA_WEIGHTS) +else: + print("LoRA weights not found. 
Using the base model") + +distributed_state = PartialState() +pipe.to(distributed_state.device) + +prompts = dict() +prompts["masterpiece, best quality, Cute dragon creature, pokemon style, night, moonlight, dim lighting"] = "deformed, disfigured, underexposed, overexposed, rugged, (low quality), (normal quality)," +prompts["masterpiece, best quality, Pikachu walking in beijing city, pokemon style, night, moonlight, dim lighting"] = "deformed, disfigured, underexposed, overexposed, (low quality), (normal quality)," +prompts["masterpiece, best quality, red panda , pokemon style, evening light, sunset, rim lighting"] = "deformed, disfigured, underexposed, overexposed, (low quality), (normal quality)," +prompts["masterpiece, best quality, Photo of (Lion:1.2) on a couch, flower in vase, dof, film grain, crystal clear, pokemon style, dark studio"] = "deformed, disfigured, underexposed, overexposed, " +prompts["masterpiece, best quality, siberian cat pokemon on river, pokemon style, evening light, sunset, rim lighting, depth of field"] = "deformed, disfigured, underexposed, overexposed, " +prompts["masterpiece, best quality, pig, Exquisite City, (sky:1.3), (Miniature tree:1.3), Miniature object, many flowers, glowing mushrooms, (creek:1.3), lots of fruits, cute colorful animal protagonist, Firefly, meteor, Colorful cloud, pokemon style, Complicated background, rainbow,"] = "Void background,black background,deformed, disfigured, underexposed, overexposed, " +prompts["masterpiece, best quality, (pokemon), a cute pikachu, girl with glasses, (masterpiece, top quality, best quality, official art, beautiful and aesthetic:1.2),"] = "(low quality), (normal quality), (monochrome), lowres, extra fingers, fewer fingers, (watermark), " +prompts["masterpiece, best quality, sugimori ken \(style\), (pokemon \(creature\)), pokemon electric type, grey and yellow skin, mechanical arms, cyberpunk city background, night, neon light"] = "(worst quality, low quality:1.4), watermark, signature, deformed, disfigured, underexposed, overexposed, " + +# 设置随机数种子 +seed_list = [8, 23, 42, 1334] + +# 输出图片 +for i in seed_list: + generator = torch.Generator(device="npu").manual_seed(i) + + # Convert dictionary to list + prompt_list = list(prompts.keys()) + negative_prompt_list = list(prompts.values()) + + with distributed_state.split_between_processes( + list(zip(prompt_list, negative_prompt_list)) + ) as distributed_pairs: + for prompt, negative_prompt in distributed_pairs: + image = pipe( + prompt=prompt, + negative_prompt=negative_prompt, + generator=generator, + num_inference_steps=28, + height=1024, + width=1024, + guidance_scale=1.0, + ).images + + # Create name for the image + prompt_words = prompt.replace("masterpiece, best quality, ", "").split()[:3] + prompt_abbr = "_".join(prompt_words) + + filename = f"{prompt_abbr}_seed{i}_rank{distributed_state.process_index}.png" + filename = "".join(c for c in filename if c.isalnum() or c in "._-") # remove special chars + + image[0].save(f"{output_path}/{filename}") diff --git a/examples/diffusers/sdxl/README.md b/examples/diffusers/sdxl/README.md index cf48c13a..3d6de2b4 100644 --- a/examples/diffusers/sdxl/README.md +++ b/examples/diffusers/sdxl/README.md @@ -454,20 +454,44 @@ SDXL 在 **昇腾芯片** 和 **参考芯片** 上的性能对比: python sdxl/sdxl_img2img_infer.py # 混精fp16 图生图微调任务推理 ``` +【分布式推理】 + +- 对`sdxl/sdxl_text2img_distrib_infer.py`文件进行修改 + + ```shell + vim sdxl/sdxl_text2img_distrib_infer.py + ``` + +- 修改模型权重路径 model_path为模型权重路径或微调后的权重路径 +- 如lora微调 可将lora_weights修改为Lora权重路径 + + ```python 
+ model_path = "/stabilityai/stable-diffusion-xl-base-1.0" # 模型权重/微调权重路径 + lora_weights = "/pytorch_lora_weights.safetensors" # Lora权重路径 + ``` + +- 启动分布式推理脚本 + + - 因使用accelerate进行分布式推理,config可设置:`--num_processes=卡数`,`num_machines=机器数`等 + + ```shell + accelerate launch --num_processes=4 sdxl/sdxl_text2img_distrib_infer.py # 单机四卡进行分布式推理 + ``` + ### 性能 | 芯片 | 卡数 | 任务 | E2E(it/s) | AMP_Type | Torch_Version | deepspeed | |:---:|:---:|:----------:|:-----:|:---:|:---:|:---:| -| 竞品A | 8p | 文生图lora | 1.45 | fp16 | 2.1 | ✔ | +| 竞品A | 1p | 文生图lora | 1.45 | fp16 | 2.1 | ✔ | | Atlas 900 A2 PODc |8p | 文生图lora | 2.61 | fp16 | 2.1 | ✔ | -| 竞品A | 8p | 文生图controlnet | 1.41 | fp16 | 2.1 | ✔ | -| Atlas 900 A2 PODc |8p | 文生图controlnet | 2.97 | fp16 | 2.1 | ✔ | -| 竞品A | 8p | 文生图全参 | 1.55 | fp16 | 2.1 | ✔ | -| Atlas 900 A2 PODc |8p | 文生图全参 | 3.02 | fp16 | 2.1 | ✔ | -| 竞品A | 8p | 图生图 | 3.56 | fp16 | 2.1 | ✔ | -| Atlas 900 A2 PODc |8p | 图生图 | 3.94 | fp16 | 2.1 | ✔ | +| 竞品A | 1p | 文生图controlnet | 1.41 | fp16 | 2.1 | ✔ | +| Atlas 900 A2 PODc |1p | 文生图controlnet | 2.97 | fp16 | 2.1 | ✔ | +| 竞品A | 1p | 文生图全参 | 1.55 | fp16 | 2.1 | ✔ | +| Atlas 900 A2 PODc |1p | 文生图全参 | 3.02 | fp16 | 2.1 | ✔ | +| 竞品A | 1p | 图生图 | 3.56 | fp16 | 2.1 | ✔ | +| Atlas 900 A2 PODc |1p | 图生图 | 3.94 | fp16 | 2.1 | ✔ | ## 引用 diff --git a/examples/diffusers/sdxl/sdxl_text2img_distrib_infer.py b/examples/diffusers/sdxl/sdxl_text2img_distrib_infer.py new file mode 100644 index 00000000..0c99c38c --- /dev/null +++ b/examples/diffusers/sdxl/sdxl_text2img_distrib_infer.py @@ -0,0 +1,72 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# Copyright 2023 The HuggingFace Team. All rights reserved. + + +import random +import os +from diffusers import DiffusionPipeline +import torch +import torch_npu +from accelerate import PartialState +from torch_npu.contrib import transfer_to_npu +import numpy as np + +output_path = "./sdxl_lora_NPU" +os.makedirs(output_path, exist_ok=True) + +model_path = "/stabilityai/stable-diffusion-xl-base-1.0" # Path for base model +lora_weights = "/pytorch_lora_weights.safetensors" # Path for LoRA weights + +pipe = DiffusionPipeline.from_pretrained(model_path, torch_dtype=torch.float32, local_files_only=True) + +if os.path.exists(lora_weights): + print(f"Loading LoRA weights from {lora_weights}") + pipe.load_lora_weights(lora_weights) +else: + print("LoRA weights not found. 
Using the base model") + +distributed_state = PartialState() +pipe.to(distributed_state.device) + +prompts = dict() +prompts["masterpiece, best quality, Cute dragon creature, pokemon style, night, moonlight, dim lighting"] = "deformed, disfigured, underexposed, overexposed, rugged, (low quality), (normal quality)," +prompts["masterpiece, best quality, Pikachu walking in beijing city, pokemon style, night, moonlight, dim lighting"] = "deformed, disfigured, underexposed, overexposed, (low quality), (normal quality)," +prompts["masterpiece, best quality, red panda , pokemon style, evening light, sunset, rim lighting"] = "deformed, disfigured, underexposed, overexposed, (low quality), (normal quality)," +prompts["masterpiece, best quality, Photo of (Lion:1.2) on a couch, flower in vase, dof, film grain, crystal clear, pokemon style, dark studio"] = "deformed, disfigured, underexposed, overexposed, " +prompts["masterpiece, best quality, siberian cat pokemon on river, pokemon style, evening light, sunset, rim lighting, depth of field"] = "deformed, disfigured, underexposed, overexposed, " +prompts["masterpiece, best quality, pig, Exquisite City, (sky:1.3), (Miniature tree:1.3), Miniature object, many flowers, glowing mushrooms, (creek:1.3), lots of fruits, cute colorful animal protagonist, Firefly, meteor, Colorful cloud, pokemon style, Complicated background, rainbow,"] = "Void background,black background,deformed, disfigured, underexposed, overexposed, " +prompts["masterpiece, best quality, (pokemon), a cute pikachu, girl with glasses, (masterpiece, top quality, best quality, official art, beautiful and aesthetic:1.2),"] = "(low quality), (normal quality), (monochrome), lowres, extra fingers, fewer fingers, (watermark), " +prompts["masterpiece, best quality, sugimori ken \(style\), (pokemon \(creature\)), pokemon electric type, grey and yellow skin, mechanical arms, cyberpunk city background, night, neon light"] = "(worst quality, low quality:1.4), watermark, signature, deformed, disfigured, underexposed, overexposed, " +#设置随机数种子 +seed_list = [8, 23, 42, 1334] + +# 输出图片 +for i in seed_list: + generator = torch.Generator(device="npu").manual_seed(i) + + # Convert dictionary to list + prompt_list = list(prompts.keys()) + negative_prompt_list = list(prompts.values()) + + with distributed_state.split_between_processes( + list(zip(prompt_list, negative_prompt_list)) + ) as distributed_pairs: + for prompt, negative_prompt in distributed_pairs: + image = pipe( + prompt=prompt, + negative_prompt=negative_prompt, + generator=generator, + num_inference_steps=28, + height=1024, + width=1024, + guidance_scale=1.0, + ).images + + # Create name for the image + prompt_words = prompt.replace("masterpiece, best quality, ", "").split()[:3] + prompt_abbr = "_".join(prompt_words) + + filename = f"{prompt_abbr}_seed{i}_rank{distributed_state.process_index}.png" + filename = "".join(c for c in filename if c.isalnum() or c in "._-") # remove special chars + + image[0].save(f"{output_path}/{filename}") diff --git a/examples/internvl2/README.md b/examples/internvl2/README.md index db89a79b..b8bc5af1 100644 --- a/examples/internvl2/README.md +++ b/examples/internvl2/README.md @@ -20,7 +20,10 @@ - [准备工作](#jump5.1) - [配置参数](#jump5.2) - [启动推理](#jump5.3) - +- [评测](#jump6) + - [数据集准备](#jump6.1) + - [配置参数](#jump6.2) + - [启动评测](#jump6.3) --- @@ -102,7 +105,7 @@ torch npu 与 CANN包参考链接:[安装包参考链接](https://support.huaw git clone https://gitee.com/ascend/MindSpeed.git cd MindSpeed # checkout commit from MindSpeed core_r0.6.0 - git 
checkout 4c6847e6fda0a458914fd2ea664f6d09a8be300e + git checkout ab39de78be23e88e2c8b0d25edf6135940990c02 pip install -r requirements.txt pip3 install -e . cd .. @@ -131,15 +134,15 @@ torch npu 与 CANN包参考链接:[安装包参考链接](https://support.huaw #### 2. 权重转换 -MindSpeeed-MM修改了部分原始网络的结构名称,使用`examples/internvl2/internvl_convert_to_mm_ckpt.py`脚本对原始预训练权重进行转换。该脚本实现了从huggingface权重到MindSpeed-MM权重的转换以及PP(Pipeline Parallel)权重的切分。 +MindSpeeed-MM修改了部分原始网络的结构名称,使用`examples/internvl2/internvl2_convert_to_mm_ckpt.py`脚本对原始预训练权重进行转换。该脚本实现了从huggingface权重到MindSpeed-MM权重的转换以及PP(Pipeline Parallel)权重的切分。 -以InternVL2-8B为例,`inernvl_convert_to_mm_ckpt.py`的入参`model-size`、`load-dir`、`save-dir`、`trust-remote-code`等如下: +以InternVL2-8B为例,`internvl2_convert_to_mm_ckpt.py`的入参`model-size`、`load-dir`、`save-dir`、`trust-remote-code`等如下: 启动脚本 ```shell # 根据实际情况修改 ascend-toolkit 路径 source /usr/local/Ascend/ascend-toolkit/set_env.sh - python examples/internvl2/internvl_convert_to_mm_ckpt.py \ + python examples/internvl2/internvl2_convert_to_mm_ckpt.py \ --model-size 8B \ --load-dir raw_ckpt/InternVL2-8B \ # huggingface权重目录 --save-dir pretrained/InternVL2-8B \ # 转换后的权重保存目录 @@ -260,12 +263,12 @@ $save_dir ```shell # 根据实际情况修改 ascend-toolkit 路径 source /usr/local/Ascend/ascend-toolkit/set_env.sh - GPUS_PER_NODE=8 + NPUS_PER_NODE=8 MASTER_ADDR=locahost MASTER_PORT=6000 NNODES=1 NODE_RANK=0 - WORLD_SIZE=$(($GPUS_PER_NODE * $NNODES)) + WORLD_SIZE=$(($NPUS_PER_NODE * $NNODES)) ``` @@ -289,7 +292,7 @@ $save_dir ```shell # 根据实际情况修改 ascend-toolkit 路径 source /usr/local/Ascend/ascend-toolkit/set_env.sh - python examples/internvl2/internvl_convert_to_mm_ckpt.py \ + python examples/internvl2/internvl2_convert_to_mm_ckpt.py \ --model-size 8B \ --load-dir raw_ckpt/InternVL2-8B \ # huggingface权重目录 --save-dir pretrained/InternVL2-8B \ # 转换后的权重保存目录 @@ -331,3 +334,66 @@ $save_dir ```shell bash examples/internvl2/inference_internvl.sh ``` + + +## 评测 + +### 数据集准备 + +当前模型支持AI2D(test)、ChartQA(test)、Docvqa(val)、MMMU(val)四种数据集的评测。 +数据集参考下载链接: + +- [MMMU_DEV_VAL](https://opencompass.openxlab.space/utils/VLMEval/MMMU_DEV_VAL.tsv) +- [DocVQA_VAL](https://opencompass.openxlab.space/utils/VLMEval/DocVQA_VAL.tsv) +- [AI2D_TEST](https://opencompass.openxlab.space/utils/VLMEval/AI2D_TEST.tsv) +- [ChartQA_TEST](https://opencompass.openxlab.space/utils/VLMEval/ChartQA_TEST.tsv) + +### 参数配置 +如果要进行评测需要将要评测的数据集名称和路径传到examples/internvl2/evaluate_internvl2_8B.json +需要更改的字段有 + +- `from_pretrained` 需要改为模型的权重文件的路径,如果使用的是huggingface的权重则需要进行权重转换(参考前面的权重转换的章节),如果使用MindSpeed-MM训练出的则不需要进行权重转换。 +- `dataset_path` 需要填入上面下载的数据集文件路径。 +- `evaluation_dataset` 为评测数据集的名称可选的名称有(`ai2d_test`、`mmmu_dev_val`、`docvqa_val`、`chartqa_test`), **注意**:需要与上面的数据集路径相对应。 +- `result_output_path` 为评测结果的输出路径,**注意**:每次评测前需要将之前保存在该路径下评测文件删除。 +- `tokenizer`下面的`from_pretrained`为huggingface下载的InternVL2-8B权重路径。 + +```json + "model_id": "InternVLPipeline", + "from_pretrained": "./internvl8b_mm/release/mp_rank_00/model_optim_rng.pt", + "dataset_path": "./AI2D_TEST.tsv", + "evaluation_dataset":"ai2d_test", + "evaluation_model":"internvl2_8b", + "result_output_path":"./evaluation_outputs/", + + "tokenizer":{ + "hub_backend": "hf", + "autotokenizer_name": "AutoTokenizer", + "from_pretrained": "./InternVL2-8B", + "model_max_length": 4096, + "add_eos_token": false, + "trust_remote_code": true, + "use_fast": false + } + +``` + +examples/internvl2/evaluate_internvl2_8B.json改完后,需要将json文件的路径传入到examples/internvl2/evaluate_internvl2_8B.sh MM_MODEL字段中 + +```shell +MM_MODEL=examples/internvl2/evaluate_internvl2_8B.json +``` 
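
The bullet list above notes that files left under `result_output_path` from a previous run must be removed before each evaluation. A minimal cleanup sketch, assuming `result_output_path` is `./evaluation_outputs/` as in the sample JSON above (adjust the path to your actual configuration):

```shell
# Hypothetical pre-run cleanup: remove stale evaluation results, then recreate the output directory.
# The path is an example and must match result_output_path in evaluate_internvl2_8B.json.
rm -rf ./evaluation_outputs/
mkdir -p ./evaluation_outputs/
```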
+评测支持多卡DP推理需要更改的配置,为NPU卡数量 + +```shell +NPUS_PER_NODE=1 +``` + +### 启动评测 +启动shell开始推理 +```shell +bash examples/internvl2/evaluate_internvl2_8B.sh +``` +评测结果会输出到`result_output_path`路径中,会输出结果文件: +- *.xlsx文件,这个文件会输出每道题的预测结果和答案等详细信息。 +- *.csv文件,这个文件会输出统计准确率等数据。 \ No newline at end of file diff --git a/examples/internvl2/dot_product_attention.py b/examples/internvl2/dot_product_attention.py index 4ace9800..e38e5596 100644 --- a/examples/internvl2/dot_product_attention.py +++ b/examples/internvl2/dot_product_attention.py @@ -103,132 +103,59 @@ def dot_product_attention_forward( ): args = get_args() - if not torch.any(attention_mask): - if self.num_attention_heads_per_partition // self.num_query_groups_per_partition > 1: - key = key.repeat_interleave( - self.num_attention_heads_per_partition // self.num_query_groups_per_partition, dim=2 - ) - value = value.repeat_interleave( - self.num_attention_heads_per_partition // self.num_query_groups_per_partition, dim=2 - ) - - seq_length, batch_size, n_head, head_dim = query.shape[0], query.shape[1], query.shape[2], query.shape[3] - - query, key, value = [rearrange(x, 's b h d -> (s b) h d') for x in [query, key, value]] - - scale = 1.0 / math.sqrt( - self.hidden_size_per_attention_head) if self.scale_mask_softmax.scale is None else self.softmax_scale - - if args.context_parallel_size > 1 and args.context_parallel_algo in ['megatron_cp_algo', 'hybrid_cp_algo']: - in_hybrid_mode = False - if get_context_parallel_group_for_hybrid_ring(check_initialized=False) is not None: - in_hybrid_mode = True - - if not in_hybrid_mode: - cp_group = mpu.get_context_parallel_group() - cp_size = mpu.get_context_parallel_world_size() - rank = mpu.get_context_parallel_rank() - cp_global_ranks = mpu.get_context_parallel_global_ranks() - else: - cp_group = get_context_parallel_group_for_hybrid_ring() - cp_size = get_context_parallel_for_hybrid_ring_world_size() - rank = get_context_parallel_for_hybrid_ring_rank() - cp_global_ranks = get_context_parallel_for_hybrid_ring_global_ranks() - - cp_para = dict() - cp_para['causal'] = args.cp_attention_mask_type == 'causal' - cp_para['cp_group'] = cp_group - cp_para['cp_size'] = cp_size - cp_para['rank'] = rank - cp_para['cp_global_ranks'] = cp_global_ranks - cp_para['cp_group_for_send_recv_overlap'] = mpu.get_context_parallel_group_for_send_recv_overlap() \ - if args.use_cp_send_recv_overlap else None - cp_para['pse'] = self.pse - cp_para['pse_type'] = self.pse_type - output = ringattn_context_parallel(query, key, value, n_head, cp_para, scale, attention_mask, - self.attention_dropout.p) + seq_length, batch_size, n_head, head_dim = query.shape[0], query.shape[1], query.shape[2], query.shape[3] + + query, key, value = [x.transpose(0, 1) for x in [query, key, value]] + + scale = 1.0 / math.sqrt(self.hidden_size_per_attention_head) if self.scale_mask_softmax.scale is None else self.softmax_scale + + if args.context_parallel_size > 1 and args.context_parallel_algo in ['megatron_cp_algo', 'hybrid_cp_algo']: + in_hybrid_mode = False + if get_context_parallel_group_for_hybrid_ring(check_initialized=False) is not None: + in_hybrid_mode = True + + if not in_hybrid_mode: + cp_group = mpu.get_context_parallel_group() + cp_size = mpu.get_context_parallel_world_size() + rank = mpu.get_context_parallel_rank() + cp_global_ranks = mpu.get_context_parallel_global_ranks() else: - if args.use_fusion_attn_v2: - output = npu_fusion_attention( - query, key, value, n_head, 'SBH', - pse=self.pse, - padding_mask=None, - atten_mask=attention_mask, - 
scale=scale, - pse_type=self.pse_type, - pre_tokens=args.pre_tockens, - next_tokens=args.next_tockens, - keep_prob=1 - self.dropout_p, - inner_precise=0, - sparse_mode=args.sparse_mode - )[0] - else: - cu_seqlens = tuple( - torch.arange(seq_length, (batch_size + 1) * seq_length, step=seq_length, dtype=torch.int32).numpy().tolist()) - output = torch_npu.npu_fusion_attention( - query, key, value, head_num=n_head, input_layout="TND", - keep_prob=1. - self.attention_dropout.p, - actual_seq_qlen=cu_seqlens, actual_seq_kvlen=cu_seqlens, - scale=scale, - )[0] - output = output.reshape(batch_size, seq_length, n_head, head_dim).contiguous() - output = output.reshape(batch_size, seq_length, -1).transpose(0, 1) - - return output + cp_group = get_context_parallel_group_for_hybrid_ring() + cp_size = get_context_parallel_for_hybrid_ring_world_size() + rank = get_context_parallel_for_hybrid_ring_rank() + cp_global_ranks = get_context_parallel_for_hybrid_ring_global_ranks() + + cp_para = dict() + cp_para['causal'] = args.cp_attention_mask_type == 'causal' + cp_para['cp_group'] = cp_group + cp_para['cp_size'] = cp_size + cp_para['rank'] = rank + cp_para['cp_global_ranks'] = cp_global_ranks + cp_para['cp_group_for_send_recv_overlap'] = mpu.get_context_parallel_group_for_send_recv_overlap() \ + if args.use_cp_send_recv_overlap else None + cp_para['pse'] = self.pse + cp_para['pse_type'] = self.pse_type + output = ringattn_context_parallel(query, key, value, n_head, cp_para, scale, attention_mask, self.attention_dropout.p) else: - seq_length, batch_size, n_head, head_dim = query.shape[0], query.shape[1], query.shape[2], query.shape[3] - - query, key, value = [x.transpose(0, 1) for x in [query, key, value]] - - scale = 1.0 / math.sqrt(self.hidden_size_per_attention_head) if self.scale_mask_softmax.scale is None else self.softmax_scale - - if args.context_parallel_size > 1 and args.context_parallel_algo in ['megatron_cp_algo', 'hybrid_cp_algo']: - in_hybrid_mode = False - if get_context_parallel_group_for_hybrid_ring(check_initialized=False) is not None: - in_hybrid_mode = True - - if not in_hybrid_mode: - cp_group = mpu.get_context_parallel_group() - cp_size = mpu.get_context_parallel_world_size() - rank = mpu.get_context_parallel_rank() - cp_global_ranks = mpu.get_context_parallel_global_ranks() - else: - cp_group = get_context_parallel_group_for_hybrid_ring() - cp_size = get_context_parallel_for_hybrid_ring_world_size() - rank = get_context_parallel_for_hybrid_ring_rank() - cp_global_ranks = get_context_parallel_for_hybrid_ring_global_ranks() - - cp_para = dict() - cp_para['causal'] = args.cp_attention_mask_type == 'causal' - cp_para['cp_group'] = cp_group - cp_para['cp_size'] = cp_size - cp_para['rank'] = rank - cp_para['cp_global_ranks'] = cp_global_ranks - cp_para['cp_group_for_send_recv_overlap'] = mpu.get_context_parallel_group_for_send_recv_overlap() \ - if args.use_cp_send_recv_overlap else None - cp_para['pse'] = self.pse - cp_para['pse_type'] = self.pse_type - output = ringattn_context_parallel(query, key, value, n_head, cp_para, scale, attention_mask, self.attention_dropout.p) + if args.use_fusion_attn_v2: + output = npu_fusion_attention( + query, key, value, n_head, 'SBH', + pse=self.pse, + padding_mask=None, + atten_mask=attention_mask, + scale=scale, + pse_type=self.pse_type, + pre_tokens=args.pre_tockens, + next_tokens=args.next_tockens, + keep_prob=1 - self.dropout_p, + inner_precise=0, + sparse_mode=args.sparse_mode + )[0] else: - if args.use_fusion_attn_v2: - output = npu_fusion_attention( - 
query, key, value, n_head, 'SBH', - pse=self.pse, - padding_mask=None, - atten_mask=attention_mask, - scale=scale, - pse_type=self.pse_type, - pre_tokens=args.pre_tockens, - next_tokens=args.next_tockens, - keep_prob=1 - self.dropout_p, - inner_precise=0, - sparse_mode=args.sparse_mode - )[0] - else: - output = torch_npu.npu_fusion_attention(query, key, value, n_head, "BSND", - keep_prob=1. - self.attention_dropout.p, - scale=scale, - atten_mask=attention_mask, )[0] - output = output.transpose(0, 1).reshape(seq_length, batch_size, -1) - - return output + output = torch_npu.npu_fusion_attention(query, key, value, n_head, "BSND", + keep_prob=1. - self.attention_dropout.p, + scale=scale, + atten_mask=attention_mask, )[0] + output = output.transpose(0, 1).reshape(seq_length, batch_size, -1) + + return output diff --git a/examples/internvl2/evaluate_internvl2_8B.sh b/examples/internvl2/evaluate_internvl2_8B.sh index 277b6c2b..847b4388 100644 --- a/examples/internvl2/evaluate_internvl2_8B.sh +++ b/examples/internvl2/evaluate_internvl2_8B.sh @@ -8,7 +8,6 @@ export CPU_AFFINITY_CONF=1 export HCCL_CONNECT_TIMEOUT=7200 export HCCL_EXEC_TIMEOUT=7200 export CUDA_DEVICE_MAX_CONNECTIONS=1 -export HOST_CACHE_CAPACITY=20 export ACLNN_CACHE_LIMIT=100000 export TOKENIZERS_PARALLELISM=false export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True diff --git a/examples/internvl2/finetune_internvl2_2B.sh b/examples/internvl2/finetune_internvl2_2B.sh index bab2cff6..40b1a704 100644 --- a/examples/internvl2/finetune_internvl2_2B.sh +++ b/examples/internvl2/finetune_internvl2_2B.sh @@ -7,15 +7,14 @@ export COMBINED_ENABLE=1 export CPU_AFFINITY_CONF=1 export HCCL_CONNECT_TIMEOUT=1200 export CUDA_DEVICE_MAX_CONNECTIONS=1 -export HOST_CACHE_CAPACITY=20 export ACLNN_CACHE_LIMIT=100000 -GPUS_PER_NODE=8 +NPUS_PER_NODE=8 MASTER_ADDR=localhost MASTER_PORT=6000 NNODES=1 NODE_RANK=0 -WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) MBS=1 GRAD_ACC_STEP=16 @@ -38,7 +37,7 @@ MM_ARGS=" " DISTRIBUTED_ARGS=" - --nproc_per_node $GPUS_PER_NODE \ + --nproc_per_node $NPUS_PER_NODE \ --nnodes $NNODES \ --node_rank $NODE_RANK \ --master_addr $MASTER_ADDR \ diff --git a/examples/internvl2/finetune_internvl2_76B.sh b/examples/internvl2/finetune_internvl2_76B.sh index 0634e8ba..ffee38ad 100644 --- a/examples/internvl2/finetune_internvl2_76B.sh +++ b/examples/internvl2/finetune_internvl2_76B.sh @@ -7,10 +7,9 @@ export COMBINED_ENABLE=1 export CPU_AFFINITY_CONF=1 export HCCL_CONNECT_TIMEOUT=1200 export CUDA_DEVICE_MAX_CONNECTIONS=1 -export HOST_CACHE_CAPACITY=20 export ACLNN_CACHE_LIMIT=100000 -GPUS_PER_NODE=16 +NPUS_PER_NODE=16 MASTER_PORT=6000 HOSTFILE='./hostfile' NODEADDR=$(hostname -I | awk -F " " '{print$1}') @@ -18,7 +17,7 @@ NODE_RANK=$(awk '{ranks[$1]=(FNR-1);}END{print ranks["'$NODEADDR'"];}' $HOSTFILE NNODES=$(wc -l $HOSTFILE) MASTER_ADDR=$(head -n 1 $HOSTFILE | awk '{print $1;}') -WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) MBS=1 GRAD_ACC_STEP=128 @@ -41,7 +40,7 @@ MM_ARGS=" " DISTRIBUTED_ARGS=" - --nproc_per_node $GPUS_PER_NODE \ + --nproc_per_node $NPUS_PER_NODE \ --nnodes $NNODES \ --node_rank $NODE_RANK \ --master_addr $MASTER_ADDR \ diff --git a/examples/internvl2/finetune_internvl2_8B.sh b/examples/internvl2/finetune_internvl2_8B.sh index f39f06de..071877ce 100644 --- a/examples/internvl2/finetune_internvl2_8B.sh +++ b/examples/internvl2/finetune_internvl2_8B.sh @@ -7,15 +7,14 @@ export COMBINED_ENABLE=1 export CPU_AFFINITY_CONF=1 export 
HCCL_CONNECT_TIMEOUT=1200 export CUDA_DEVICE_MAX_CONNECTIONS=1 -export HOST_CACHE_CAPACITY=20 export ACLNN_CACHE_LIMIT=100000 -GPUS_PER_NODE=8 +NPUS_PER_NODE=8 MASTER_ADDR=localhost MASTER_PORT=6000 NNODES=1 NODE_RANK=0 -WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) MBS=1 GRAD_ACC_STEP=64 @@ -38,7 +37,7 @@ MM_ARGS=" " DISTRIBUTED_ARGS=" - --nproc_per_node $GPUS_PER_NODE \ + --nproc_per_node $NPUS_PER_NODE \ --nnodes $NNODES \ --node_rank $NODE_RANK \ --master_addr $MASTER_ADDR \ diff --git a/examples/internvl2/finetune_internvl2_8B_vpp.sh b/examples/internvl2/finetune_internvl2_8B_vpp.sh index 21c3e28f..5dc5f301 100644 --- a/examples/internvl2/finetune_internvl2_8B_vpp.sh +++ b/examples/internvl2/finetune_internvl2_8B_vpp.sh @@ -7,15 +7,14 @@ export COMBINED_ENABLE=1 export CPU_AFFINITY_CONF=1 export HCCL_CONNECT_TIMEOUT=1200 export CUDA_DEVICE_MAX_CONNECTIONS=1 -export HOST_CACHE_CAPACITY=20 export ACLNN_CACHE_LIMIT=100000 -GPUS_PER_NODE=8 +NPUS_PER_NODE=8 MASTER_ADDR=localhost MASTER_PORT=6000 NNODES=1 NODE_RANK=0 -WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) MBS=1 GRAD_ACC_STEP=64 @@ -39,7 +38,7 @@ MM_ARGS=" " DISTRIBUTED_ARGS=" - --nproc_per_node $GPUS_PER_NODE \ + --nproc_per_node $NPUS_PER_NODE \ --nnodes $NNODES \ --node_rank $NODE_RANK \ --master_addr $MASTER_ADDR \ diff --git a/examples/internvl2/inference_internvl.sh b/examples/internvl2/inference_internvl.sh index 958b6975..bd132002 100644 --- a/examples/internvl2/inference_internvl.sh +++ b/examples/internvl2/inference_internvl.sh @@ -7,15 +7,14 @@ export COMBINED_ENABLE=1 export CPU_AFFINITY_CONF=1 export HCCL_CONNECT_TIMEOUT=1200 export CUDA_DEVICE_MAX_CONNECTIONS=1 -export HOST_CACHE_CAPACITY=20 export ACLNN_CACHE_LIMIT=100000 -GPUS_PER_NODE=1 +NPUS_PER_NODE=1 MASTER_ADDR=localhost MASTER_PORT=6000 NNODES=1 NODE_RANK=0 -WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) MBS=1 GRAD_ACC_STEP=1 @@ -32,7 +31,7 @@ MM_ARGS=" " DISTRIBUTED_ARGS=" - --nproc_per_node $GPUS_PER_NODE \ + --nproc_per_node $NPUS_PER_NODE \ --nnodes $NNODES \ --node_rank $NODE_RANK \ --master_addr $MASTER_ADDR \ diff --git a/examples/internvl2/internvl2_convert_mm_to_hg.py b/examples/internvl2/internvl2_convert_mm_to_hf.py similarity index 92% rename from examples/internvl2/internvl2_convert_mm_to_hg.py rename to examples/internvl2/internvl2_convert_mm_to_hf.py index 3f6c1991..152c6763 100644 --- a/examples/internvl2/internvl2_convert_mm_to_hg.py +++ b/examples/internvl2/internvl2_convert_mm_to_hf.py @@ -1,10 +1,11 @@ import argparse -import os import json +import os from pathlib import Path +from dataclasses import dataclass + import torch from safetensors.torch import save_file -from dataclasses import dataclass @dataclass @@ -68,11 +69,9 @@ def check_pp_config(_model_config_dict=None): def merge_by_pp(pp_ckpt_file, pp_rank: int, _model_config_dict=None): - # vit: [24, 0, 0, 0] - # llm: [6, 9, 9, 8] _vit_pipeline_num_layers = _model_config_dict.vit_pipeline_num_layers _llm_pipeline_num_layers = _model_config_dict.llm_pipeline_num_layers - + vit_pp_start_index = 0 llm_pp_start_index = 0 if pp_rank > 0: @@ -132,8 +131,8 @@ def split_qkv(wqkv, hn=64, ng=8): return wq, wk, wv -def convert_mm_to_hg(_mm_state_dict, _model_config_dict=None): - _hg_state_dict = {} +def convert_mm_to_hf(_mm_state_dict, _model_config_dict=None): + _hf_state_dict = {} # check LlamaForCausalLM or InternLM2ForCausalLM architectures_key = 
"text_decoder.decoder.layers.0.self_attention.linear_qkv.weight" is_llama_for_causa_llm = True @@ -189,7 +188,7 @@ def convert_mm_to_hg(_mm_state_dict, _model_config_dict=None): new_key = new_key.replace('linear_fc2', '3') print(f'mapping {key} to {new_key}') - _hg_state_dict[new_key] = value + _hf_state_dict[new_key] = value if is_llama_for_causa_llm: for i in range(_model_config_dict.llm_num_layers): @@ -198,17 +197,17 @@ def convert_mm_to_hg(_mm_state_dict, _model_config_dict=None): v_name = f'language_model.model.layers.{i}.self_attention.wv.weight' qkv_name = f'language_model.model.layers.{i}.attention.wqkv.weight' - if qkv_name in _hg_state_dict.keys(): - wqkv = _hg_state_dict[qkv_name] + if qkv_name in _hf_state_dict.keys(): + wqkv = _hf_state_dict[qkv_name] else: raise AssertionError(f'Missing key {qkv_name}') wq, wk, wv = split_qkv(wqkv) if not (wq and wk and wv): raise ValueError("llama_for_causa_llm split qkv weight error, maybe not support right now.") - _hg_state_dict[q_name] = wq - _hg_state_dict[k_name] = wk - _hg_state_dict[v_name] = wv - _hg_state_dict.pop(qkv_name) + _hf_state_dict[q_name] = wq + _hf_state_dict[k_name] = wk + _hf_state_dict[v_name] = wv + _hf_state_dict.pop(qkv_name) print(f'merge {q_name}, {k_name}, {v_name} to {qkv_name}') # split w1 and w3 weight @@ -217,16 +216,16 @@ def convert_mm_to_hg(_mm_state_dict, _model_config_dict=None): gate_name = f'language_model.model.layers.{i}.feed_forward.w1.weight' up_name = f'language_model.model.layers.{i}.feed_forward.w3.weight' # split w1 和 w3 - if gate_and_up_name in _hg_state_dict.keys(): - gate_and_up_weight = _hg_state_dict[gate_and_up_name] + if gate_and_up_name in _hf_state_dict.keys(): + gate_and_up_weight = _hf_state_dict[gate_and_up_name] # refer to: torch.cat([gate_proj_weight, up_proj_weight], dim=0) gate_weight, up_weight = torch.split(gate_and_up_weight, gate_and_up_weight.size(0) // 2, dim=0) - _hg_state_dict[gate_name] = gate_weight - _hg_state_dict[up_name] = up_weight + _hf_state_dict[gate_name] = gate_weight + _hf_state_dict[up_name] = up_weight # remove useless weight - _hg_state_dict.pop(gate_and_up_name) + _hf_state_dict.pop(gate_and_up_name) print(f'split {gate_and_up_name} to {gate_name} and {up_name}') - return _hg_state_dict + return _hf_state_dict def split_by_index_json(_state_dict, _index_json_path): @@ -251,7 +250,7 @@ def save_by_index_json(_state_dicts, _save_dir): if __name__ == "__main__": - parser = argparse.ArgumentParser(description='mm2hg tools checkpoint utility arguments', + parser = argparse.ArgumentParser(description='mm2hf tools checkpoint utility arguments', allow_abbrev=False, conflict_handler='resolve') parser.add_argument('--model-size', type=str, required=True, @@ -260,7 +259,7 @@ if __name__ == "__main__": help='MindSpeed-MM checkpoint path for loading') parser.add_argument('--save-dir', type=str, required=True, help='HuggingFace checkpoint path for saving') - parser.add_argument('--raw-hg-dir', type=str, required=True, + parser.add_argument('--raw-hf-dir', type=str, required=True, help='original raw huggingface checkpoint path for loading') parser.add_argument('--trust-remote-code', type=str, required=True, default=False, help='Whether not to allow HuggingFace API to execute code') @@ -268,14 +267,14 @@ if __name__ == "__main__": if unrecognized_args: ValueError(f"please check unrecognized args: {unrecognized_args}") - index_json_path = os.path.join(args.raw_hg_dir, "model.safetensors.index.json") + index_json_path = os.path.join(args.raw_hf_dir, 
"model.safetensors.index.json") if not os.path.exists(index_json_path): raise ValueError(f"safetensors.index.json not in {index_json_path}") model_config_ = get_model_config(args.model_size) check_pp_config(model_config_) merge_state_dict = load_from_mm(args.load_dir, model_config_) - hg_state_dict = convert_mm_to_hg(merge_state_dict, model_config_) - state_dicts = split_by_index_json(hg_state_dict, index_json_path) + hf_state_dict = convert_mm_to_hf(merge_state_dict, model_config_) + state_dicts = split_by_index_json(hf_state_dict, index_json_path) save_by_index_json(state_dicts, args.save_dir) diff --git a/examples/internvl2/internvl_convert_to_mm_ckpt.py b/examples/internvl2/internvl2_convert_to_mm_ckpt.py similarity index 99% rename from examples/internvl2/internvl_convert_to_mm_ckpt.py rename to examples/internvl2/internvl2_convert_to_mm_ckpt.py index 40395cc3..33cdc6ea 100644 --- a/examples/internvl2/internvl_convert_to_mm_ckpt.py +++ b/examples/internvl2/internvl2_convert_to_mm_ckpt.py @@ -8,6 +8,9 @@ import torch from transformers import AutoModelForCausalLM, AutoConfig +llm_arch = '' + + def load_from_hf(load_dir, trust_remote_code): # Load Huggingface model. hf_model = AutoModelForCausalLM.from_pretrained(load_dir, device_map='cpu', trust_remote_code=trust_remote_code, @@ -128,7 +131,7 @@ def merge_qkv(wq, wk, wv, hn=64, ng=8): return qkv -def convert_hg_to_mm(_state_dict, _num_layers): +def convert_hf_to_mm(_state_dict, _num_layers): new_dict = {} for key, value in _state_dict.items(): new_key = None @@ -369,7 +372,7 @@ if __name__ == '__main__': for key, value in state_dict.items(): print(key, value.shape) print(50 * '*') - state_dict = convert_hg_to_mm(state_dict, llm_num_layers) + state_dict = convert_hf_to_mm(state_dict, llm_num_layers) pipeline_state_dicts, remains = split_model_by_pipeline(state_dict, pp_split) if len(remains) > 0: print(remains) diff --git a/examples/llava1.5/README.md b/examples/llava1.5/README.md index 71dcf184..f2ff992a 100644 --- a/examples/llava1.5/README.md +++ b/examples/llava1.5/README.md @@ -99,16 +99,13 @@ torch npu 与 CANN包参考链接:[安装包参考链接](https://support.huaw pip install torch_npu-2.1.0*-cp310-cp310m-linux_aarch64.whl # apex for Ascend 参考 https://gitee.com/ascend/apex - pip install apex-0.1_ascend*-cp310-cp310m-linux_aarch64.whl - - # 将shell脚本中的环境变量路径修改为真实路径,下面为参考路径 - source /usr/local/Ascend/ascend-toolkit/set_env.sh + pip install apex-0.1_ascend*-cp310-cp310m-linux_aarch64.whl # 安装加速库 git clone https://gitee.com/ascend/MindSpeed.git cd MindSpeed # checkout commit from MindSpeed core_r0.6.0 - git checkout 3da17d56 + git checkout ab39de78be23e88e2c8b0d25edf6135940990c02 pip install -r requirements.txt pip3 install -e . cd .. @@ -117,24 +114,6 @@ torch npu 与 CANN包参考链接:[安装包参考链接](https://support.huaw pip install -e . ``` -**注意事项:** - - 需要修改 mindspeed/core/transformer/dot_product_attention.py的65行,修改如下: - -```python -def dot_product_attention_forward_wrapper(fn): - @wraps(fn) - def wrapper(self, query, key, value, attention_mask, attn_mask_type, packed_seq_params): - # 注释下一行 - # attention_mask = get_attention_mask() - if get_args().use_flash_attn: - return dot_product_attention_forward(self, query, key, value, attention_mask, attn_mask_type, packed_seq_params) - return fn(self, query, key, value, attention_mask, attn_mask_type, packed_seq_params) - - return wrapper -``` - ---- @@ -152,90 +131,36 @@ def dot_product_attention_forward_wrapper(fn): -#### 2. 权重转换 +#### 2. 
权重转换(当前依赖openai-clip库,正在规划重构) MindSpeeed-MM修改了部分原始网络的结构名称,因此需要使用如下脚本代码对下载的预训练权重进行转换。 当前训练只使用了ViT-L-14-336px和lmsys/vicuna-7b-v1.5两个模型,以下介绍这两个模型从开源仓转换成MindSpeeed-MM所需权重的方法: - ViT-L-14-336px权重转换 - 参考 NVIDIA/Megatron-LM中[Vision model](https://github.com/NVIDIA/Megatron-LM/blob/main/examples/multimodal/README.md#vision-model) , - 执行如下命令: - - ``` - python examples/multimodal/clip_converter.py --download-root /some/download/folder --output /some/output/folder --tensor-parallel-size 1 --use-te - ``` - - 如果执行环境连接不到外网下载ViT-L-14-336px模型,建议手动下载,再在clip_converter.py中将ViT-L-14-336px路径修改成本地路径 + 脚本参考 NVIDIA/Megatron-LM中[Vision model](https://github.com/NVIDIA/Megatron-LM/blob/core_r0.8.0/examples/multimodal/README.md#vision-model) ,将[ViT-L-14-336px](https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt)权重下载到本地后, +  执行如下命令: + ```bash + # 安装依赖(加载原始权重需要依赖openai-clip库) +   pip install git+https://github.com/openai/CLIP.git - ``` - model, _ = clip.load("{dir_to_model}/ViT-L-14-336px.pt", device=device, download_root="") +   python examples/llava1.5/clip_converter.py \ + --download-root {dir_to_model}/ViT-L-14-336px.pt \ + --output {target_dir} ``` - 其中{dir_to_model}为模型所在的路径。 - 转换的结果在: /some/output/folder/iter_0000001/mp_rank_00/model_optim_rng.pt - - 对于转换后的结果,需要再执行如下转换,其中{target_dir}为最终的权重文件保存路径: - - ```python - before = torch.load("/some/output/folder/iter_0000001/mp_rank_00/model_optim_rng.pt")["model"] - torch.save(before, "{target_dir}/converted_clip.pt") - ``` + 其中{dir_to_model}为下载模型权重所在的路径,转换后权重将保存在{target_dir}/converted_clip.pt。 - lmsys/vicuna-7b-v1.5权重转换 - 参考[ModelLink](https://gitee.com/ascend/ModelLink/blob/master/examples/README.md#21-huggingface%E6%9D%83%E9%87%8D%E8%BD%AC%E6%8D%A2%E5%88%B0megatron-lm%E6%A0%BC%E5%BC%8F)中语言模型权重转换的脚本: - + 下载权重后执行如下命令: ```shell - source {cann_dir}/ascend-toolkit/set_env.sh - HF_FORMAT_DIR="{dir_to_model}/vicuna-7b-v1.5" - MEGATRON_FORMAT_DIR="{target_dir}" - TOKENIZER_MODEL="{dir_to_model}/vicuna-7b-v1.5/tokenizer.model" - python tools/checkpoint/convert_ckpt.py \ - --model-type GPT \ - --loader llama2_hf \ - --saver megatron \ - --target-tensor-parallel-size 1 \ - --target-pipeline-parallel-size 1 \ - --load-dir ${HF_FORMAT_DIR} \ - --save-dir ${MEGATRON_FORMAT_DIR} \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --params-dtype bf16 + python examples/llava1.5/vicuna_converter.py \ + --load-dir {dir_to_model}/vicuna-7b-v1.5 \ + --save-dir {target_dir} \ + --trust-remote-code True # 为保证代码安全,配置trust_remote_code默认为False,用户需要设置为True,并且确保自己下载的模型和数据的安全性 ``` - - 其中: {dir_to_model}为vicuna-7b-v1.5所在路径,{target_dir}为转换结果文件路径, {cann_dir}为cann包安装路径。转换的结果在:{target_dir}/iter_0000001/mp_rank_00/model_optim_rng.pt。 - -由于MindSpeed-MM中模型变量名称跟转换结果有差异,需要再做一次适配: - -- 在megatron同级目录,创建convert.py脚本,将如下代码复制到convert.py中, -- 修改{target_dir}为上一步model_optim_rng.pt所在路径, -- 修改{dir_to_save_file}为结果文件所在路径, -- 执行命令:python convert.py - - ```python - import torch - def convert_param(): - ckp = torch.load("{target_dir}/model_optim_rng.pt")["model"]["language_model"] - target_ckp = {} - target_ckp["embedding.word_embeddings.weight"] = ckp["embedding"]["word_embeddings"]["weight"] - target_ckp["output_layer.weight"] = ckp["output_layer"]["weight"] - for encode_key in ckp["encoder"].keys(): - if ckp["encoder"][encode_key] is not None: - targetkey = encode_key.replace("input_norm", "input_layernorm") - targetkey = targetkey.replace(".dense.", ".linear_proj.") - targetkey = targetkey.replace("query_key_value", "linear_qkv") - targetkey = 
targetkey.replace("post_attention_norm", "pre_mlp_layernorm") - targetkey = targetkey.replace("dense_h_to_4h", "linear_fc1") - targetkey = targetkey.replace("dense_4h_to_h", "linear_fc2") - targetkey = targetkey.replace("final_norm", "final_layernorm") - targetkey = "decoder." + targetkey - target_ckp[targetkey] = ckp["encoder"][encode_key] - torch.save(target_ckp, "{dir_to_save_file}/converted_vicuna.pt") - - if __name__ == "__main__": - convert_param() - ``` - ---- + 其中{dir_to_model}为下载模型权重所在的路径,转换后权重将保存在{target_dir}/converted_vicuna.pt。 + @@ -299,6 +224,26 @@ MindSpeeed-MM修改了部分原始网络的结构名称,因此需要使用如 ... } ``` +根据实际情况修改`model.json`中的权重路径为转换后权重,无需预训练权重则传入null。 +```json +{ +    ... +    "text_decoder": { +      ... +      "ckpt_path": "//converted_vicuna.pt" +    }, +    "image_encoder": { +      "vision_encoder":{ +        ... +        "ckpt_path": "//converted_clip.pt" +      }, +      "vision_projector":{ +        ... +        "ckpt_path": null +      } +    } +} +``` 【模型保存加载配置】 @@ -350,12 +295,12 @@ $save_dir ```shell # 根据实际情况修改 ascend-toolkit 路径 source /usr/local/Ascend/ascend-toolkit/set_env.sh - GPUS_PER_NODE=8 + NPUS_PER_NODE=8 MASTER_ADDR=locahost MASTER_PORT=29501 NNODES=1 NODE_RANK=0 - WORLD_SIZE=$(($GPUS_PER_NODE * $NNODES)) + WORLD_SIZE=$(($NPUS_PER_NODE * $NNODES)) ``` diff --git a/examples/llava1.5/clip_converter.py b/examples/llava1.5/clip_converter.py new file mode 100644 index 00000000..db5caaf8 --- /dev/null +++ b/examples/llava1.5/clip_converter.py @@ -0,0 +1,147 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import argparse +import os + +import clip +import torch + + +def convert(download_root, output_path, tensor_parallel_size, use_te_layernorm_linear): + device = "cpu" + + model, _ = clip.load(download_root, device=device) + + state_dict = model.state_dict() + new_state_dicts = [{"model": dict()} for _ in range(tensor_parallel_size)] + + # Indices from mapping pytorch multihead attention to megatron. + kv_channels = 64 + hidden_dim = 1024 + num_heads = 16 + indices = [] + for i in range(num_heads): + lb = i * kv_channels + ub = (i + 1) * kv_channels + indices.append(torch.arange(lb, ub, dtype=torch.int)) + indices.append(torch.arange(hidden_dim + lb, hidden_dim + ub, dtype=torch.int)) + indices.append(torch.arange(2 * hidden_dim + lb, 2 * hidden_dim + ub, dtype=torch.int)) + + indices = torch.cat(indices) + + for name, tensor in state_dict.items(): + # Skip text model. + if "visual" not in name: + continue + + # Skip final layers not used in our model. + if name == "visual.proj" or "ln_post" in name: + continue + + # Map parameter names to ones used in megatron. + new_name = "" + new_tensor = tensor + if new_tensor.dtype == torch.float16: + new_tensor = new_tensor.to(torch.float32) + + # This is used for chunking some tensors to target tensor parallel size. + chunk_dim = None + + if "class_embedding" in name: + new_name = "class_token" + # Our model uses class token that is expanded to input dimensions already. 
+ new_tensor = new_tensor.expand(1, 1, -1) + elif "positional_embedding" in name: + new_name = "position_embeddings.weight" + elif "conv1" in name: + new_name = "conv1.weight" + elif "ln_pre.weight" in name: + new_name = "ln_pre.weight" + elif "ln_pre.bias" in name: + new_name = "ln_pre.bias" + elif "transformer.resblocks" in name: + layer_idx = name.split(".")[3] + base = f"decoder.layers.{layer_idx}" + + if "attn.in_proj_weight" in name: + new_name = f"{base}.self_attention.linear_qkv.weight" + new_tensor = new_tensor[indices] + chunk_dim = 0 + elif "attn.in_proj_bias" in name: + new_name = f"{base}.self_attention.linear_qkv.bias" + new_tensor = new_tensor[indices] + chunk_dim = 0 + elif "attn.out_proj.weight" in name: + new_name = f"{base}.self_attention.linear_proj.weight" + chunk_dim = 1 + elif "attn.out_proj.bias" in name: + new_name = f"{base}.self_attention.linear_proj.bias" + elif "ln_1.weight" in name: + new_name = f"{base}.input_layernorm.weight" + if use_te_layernorm_linear: + new_name = f"{base}.self_attention.linear_qkv.layer_norm_weight" + elif "ln_1.bias" in name: + new_name = f"{base}.input_layernorm.bias" + if use_te_layernorm_linear: + new_name = f"{base}.self_attention.linear_qkv.layer_norm_bias" + elif "mlp.c_fc.weight" in name: + new_name = f"{base}.mlp.linear_fc1.weight" + chunk_dim = 0 + elif "mlp.c_fc.bias" in name: + new_name = f"{base}.mlp.linear_fc1.bias" + chunk_dim = 0 + elif "mlp.c_proj.weight" in name: + new_name = f"{base}.mlp.linear_fc2.weight" + chunk_dim = 1 + elif "mlp.c_proj.bias" in name: + new_name = f"{base}.mlp.linear_fc2.bias" + elif "ln_2.weight" in name: + new_name = f"{base}.pre_mlp_layernorm.weight" + if use_te_layernorm_linear: + new_name = f"{base}.mlp.linear_fc1.layer_norm_weight" + elif "ln_2.bias" in name: + new_name = f"{base}.pre_mlp_layernorm.bias" + if use_te_layernorm_linear: + new_name = f"{base}.mlp.linear_fc1.layer_norm_bias" + + if new_name == "": + raise AssertionError(f"unexpected layer name {name}") + + if chunk_dim is None: + new_tensors = [new_tensor for _ in range(tensor_parallel_size)] + else: + new_tensors = torch.chunk(new_tensor, tensor_parallel_size, dim=chunk_dim) + + for i in range(tensor_parallel_size): + # chunk() creates a view of a bigger tensor. clone() is used here to avoid excessive storage. + new_state_dicts[i]["model"][new_name] = new_tensors[i].clone() + + output_path_pt = os.path.join(output_path, "converted_clip.pt") + torch.save(new_state_dicts[0]["model"], output_path_pt) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description=""" +Convert OpenAI CLIP VIT weights to megatron format. 
+ + +Example usage: +python examples/llava1.5/clip_converter.py \ + --download-root {dir_to_model}/ViT-L-14-336px.pt \ + --output {target_dir} +""", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + + parser.add_argument( + "--download-root", type=str, required=True, help="Download folder for OpenAI CLIP weights", + ) + parser.add_argument( + "--output", type=str, required=True, help="output directory for megatron state dict file(s)" + ) + + args = parser.parse_args() + + convert(args.download_root, args.output, 1, False) + + print("all weights have been converted.") \ No newline at end of file diff --git a/examples/llava1.5/evaluate_llava1_5.sh b/examples/llava1.5/evaluate_llava1_5.sh index c85479d7..aa5ea277 100644 --- a/examples/llava1.5/evaluate_llava1_5.sh +++ b/examples/llava1.5/evaluate_llava1_5.sh @@ -14,7 +14,7 @@ MASTER_ADDR=localhost MASTER_PORT=29501 NNODES=1 NODE_RANK=0 -WORLD_SIZE=$((NPUS_PER_NODE*$NNODES)) +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) TP=1 PP=1 diff --git a/examples/llava1.5/inference_llava1_5.sh b/examples/llava1.5/inference_llava1_5.sh index e225e4a5..e8d183d9 100644 --- a/examples/llava1.5/inference_llava1_5.sh +++ b/examples/llava1.5/inference_llava1_5.sh @@ -8,12 +8,12 @@ export COMBINED_ENABLE=1 export CPU_AFFINITY_CONF=1 export HCCL_CONNECT_TIMEOUT=1200 -GPUS_PER_NODE=1 +NPUS_PER_NODE=1 MASTER_ADDR=localhost MASTER_PORT=29501 NNODES=1 NODE_RANK=0 -WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) TP=1 PP=1 @@ -30,7 +30,7 @@ MM_MODEL="examples/llava1.5/inference_llava.json" DISTRIBUTED_ARGS=" - --nproc_per_node $GPUS_PER_NODE \ + --nproc_per_node $NPUS_PER_NODE \ --nnodes $NNODES \ --node_rank $NODE_RANK \ --master_addr $MASTER_ADDR \ diff --git a/examples/llava1.5/pretrain_llava1_5.sh b/examples/llava1.5/pretrain_llava1_5.sh index fa5679f0..58276e71 100644 --- a/examples/llava1.5/pretrain_llava1_5.sh +++ b/examples/llava1.5/pretrain_llava1_5.sh @@ -8,17 +8,17 @@ export COMBINED_ENABLE=1 export CPU_AFFINITY_CONF=1 export HCCL_CONNECT_TIMEOUT=1200 -GPUS_PER_NODE=8 +NPUS_PER_NODE=8 MASTER_ADDR=localhost MASTER_PORT=29501 NNODES=1 NODE_RANK=0 -WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) TP=1 PP=1 CP=1 -MBS=1 +MBS=8 GBS=$(($WORLD_SIZE*$MBS/$CP)) MM_DATA="./examples/llava1.5/data.json" @@ -28,7 +28,7 @@ LOAD_PATH="save_dir" SAVE_PATH="save_dir" DISTRIBUTED_ARGS=" - --nproc_per_node $GPUS_PER_NODE \ + --nproc_per_node $NPUS_PER_NODE \ --nnodes $NNODES \ --node_rank $NODE_RANK \ --master_addr $MASTER_ADDR \ @@ -91,5 +91,5 @@ torchrun $DISTRIBUTED_ARGS \ $GPT_ARGS \ $MM_ARGS \ $OUTPUT_ARGS \ - --distributed-backend nccl >> logs/train_${logfile}.log 2>&1 + --distributed-backend nccl | tee logs/train_${logfile}.log 2>&1 chmod 440 logs/train_${logfile}.log \ No newline at end of file diff --git a/examples/llava1.5/vicuna_converter.py b/examples/llava1.5/vicuna_converter.py new file mode 100644 index 00000000..ab95b13e --- /dev/null +++ b/examples/llava1.5/vicuna_converter.py @@ -0,0 +1,135 @@ +import argparse +import os + +import torch +from transformers import AutoModelForCausalLM, AutoConfig + + +def load_from_hf(load_dir, trust_remote_code): + # Load Huggingface model. 
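# A quick round-trip sketch (toy shapes, illustrative only) of the gate/up
# handling in this patch: convert_hg_to_mm further down in this file stacks the
# HuggingFace gate_proj and up_proj into one Megatron-style linear_fc1, and the
# mm-to-hf tooling earlier in the patch splits that fused weight back in half.
import torch

ffn_hidden, hidden = 6, 4  # toy sizes, not vicuna-7b dimensions
gate_proj = torch.randn(ffn_hidden, hidden)
up_proj = torch.randn(ffn_hidden, hidden)

linear_fc1 = torch.cat([gate_proj, up_proj], dim=0)                           # HF -> MindSpeed-MM
gate_back, up_back = torch.split(linear_fc1, linear_fc1.size(0) // 2, dim=0)  # MindSpeed-MM -> HF

assert torch.equal(gate_back, gate_proj) and torch.equal(up_back, up_proj)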
+ hf_model = AutoModelForCausalLM.from_pretrained(load_dir, device_map='cpu', trust_remote_code=trust_remote_code, + torch_dtype=torch.bfloat16, local_files_only=True) + print(hf_model) + config = AutoConfig.from_pretrained(load_dir, trust_remote_code=trust_remote_code) + + return hf_model, config + + +def merge_qkv(wq, wk, wv, ng=32): + hq, h = wq.shape + hkv = wk.shape[0] + dq = hq // ng + dkv = hkv // ng + d = dq + 2 * dkv + qkv = torch.zeros([hq + hkv * 2, h], dtype=wq.dtype) + for j in range(ng): + qkv[j * d : j * d + dq, :] = wq[j * dq : (j + 1) * dq, :] + qkv[j * d + dq : j * d + dq + dkv, :] = wk[j * dkv : (j + 1) * dkv, :] + qkv[j * d + dq + dkv : j * d + dq + dkv * 2, :] = wv[j * dkv : (j + 1) * dkv, :] + + return qkv + + +def convert_hg_to_mm(_state_dict, _model_config): + _num_layers = _model_config.num_hidden_layers + _num_query_groups = _model_config.num_key_value_heads + new_dict = {} + for key, value in _state_dict.items(): + new_key = key + # 权重映射 + new_key = new_key.replace('model.embed_tokens', 'embedding.word_embeddings') + new_key = new_key.replace('model.layers', 'decoder.layers') + new_key = new_key.replace('self_attn.q_proj', 'self_attention.wq') + new_key = new_key.replace('self_attn.k_proj', 'self_attention.wk') + new_key = new_key.replace('self_attn.v_proj', 'self_attention.wv') + new_key = new_key.replace('self_attn.o_proj', 'self_attention.linear_proj') + new_key = new_key.replace('gate_proj', 'linear_fc1_gate') + new_key = new_key.replace('up_proj', 'linear_fc1_up') + new_key = new_key.replace('down_proj', 'linear_fc2') + new_key = new_key.replace('post_attention_layernorm', 'pre_mlp_layernorm') + new_key = new_key.replace('model.norm', 'decoder.final_layernorm') + new_key = new_key.replace('lm_head', 'output_layer') + + # 打印映射过程 + print(f'mapping {key} to {new_key}') + new_dict[new_key] = value + + # qkv权重交织合并 + for i in range(_num_layers): + q_name = f'decoder.layers.{i}.self_attention.wq.weight' + k_name = f'decoder.layers.{i}.self_attention.wk.weight' + v_name = f'decoder.layers.{i}.self_attention.wv.weight' + qkv_name = f'decoder.layers.{i}.self_attention.linear_qkv.weight' + + if q_name in new_dict.keys(): + wq = new_dict[q_name] + else: + raise AssertionError(f'Missing key {q_name}') + if k_name in new_dict.keys(): + wk = new_dict[k_name] + else: + raise AssertionError(f'Missing key {k_name}') + if v_name in new_dict.keys(): + wv = new_dict[v_name] + else: + raise AssertionError(f'Missing key {v_name}') + wqkv = merge_qkv(wq, wk, wv, _num_query_groups) + new_dict[qkv_name] = wqkv + new_dict.pop(q_name) + new_dict.pop(k_name) + new_dict.pop(v_name) + + print(f'merge {q_name}, {k_name}, {v_name} to {qkv_name}') + + # 合并mlp的gate和up权重 + for i in range(_num_layers): + gate_name = f'decoder.layers.{i}.mlp.linear_fc1_gate.weight' + up_name = f'decoder.layers.{i}.mlp.linear_fc1_up.weight' + fc1_name = f'decoder.layers.{i}.mlp.linear_fc1.weight' + + # 合并 w1 和 w3 + if gate_name in new_dict.keys(): + gate_proj_weight = new_dict[gate_name] + if up_name in new_dict.keys(): + up_proj_weight = new_dict[up_name] + linear_fc1 = torch.cat([gate_proj_weight, up_proj_weight], dim=0) + new_dict[fc1_name] = linear_fc1 + + # 移除合并前的权重 + new_dict.pop(gate_name) + new_dict.pop(up_name) + + print(f'merge {gate_name} and {up_name} to {fc1_name}') + + return new_dict + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Megatron Checkpoint Utility Arguments', + allow_abbrev=False, + conflict_handler='resolve') + parser.add_argument('--load-dir', type=str, 
required=True, + help='HuggingFace weight path for loading') + parser.add_argument('--save-dir', type=str, required=True, + help='MindSpeed-MM weight path for saving') + parser.add_argument('--trust-remote-code', type=str, required=True, default=False, + help='Whether or not to allow HuggingFace API to execute code') + args, unrecognized_args = parser.parse_known_args() + if unrecognized_args: + print(f"Unrecognized Args: {unrecognized_args}") + + hf_model, model_config = load_from_hf(args.load_dir, args.trust_remote_code) + state_dict = hf_model.state_dict() + print(50 * '*') + print('origin state_dict:') + for key, value in state_dict.items(): + print(key, value.shape) + print(50 * '*') + new_state_dict = convert_hg_to_mm(state_dict, model_config) + print('new state_dict:') + for key, value in new_state_dict.items(): + print(key, value.shape) + print(50 * '*') + output_path = os.path.join(args.save_dir, 'converted_vicuna.pt') + torch.save(new_state_dict, output_path) + print('all weights have been converted.') \ No newline at end of file diff --git a/examples/qwen2vl/README.md b/examples/qwen2vl/README.md index 954ee17b..0994bb20 100644 --- a/examples/qwen2vl/README.md +++ b/examples/qwen2vl/README.md @@ -49,7 +49,7 @@ #### 1. 仓库拉取 ```shell -git clone https://gitee.com/ascend/MindSpeed-MM.git +git clone https://gitee.com/ascend/MindSpeed-MM.git git clone https://github.com/NVIDIA/Megatron-LM.git cd Megatron-LM git checkout core_r0.6.0 @@ -61,8 +61,6 @@ mkdir data mkdir ckpt ``` - - #### 2. 环境搭建 torch npu 与 CANN包参考链接:[安装包参考链接](https://support.huawei.com/enterprise/zh/ascend-computing/cann-pid-251168373/software) @@ -74,7 +72,7 @@ conda activate test # 安装 torch 和 torch_npu,注意要选择对应python版本、x86或arm的torch、torch_npu及apex包 # 下载路径参考 https://www.hiascend.com/document/detail/zh/Pytorch/60RC3/configandinstg/instg/insg_0001.html -pip install torch-2.1.0-cp310-cp310m-manylinux2014_aarch64.whl +pip install torch-2.1.0-cp310-cp310m-manylinux2014_aarch64.whl pip install torch_npu-2.1.0*-cp310-cp310m-linux_aarch64.whl # apex for Ascend 参考 https://gitee.com/ascend/apex @@ -85,7 +83,7 @@ git clone https://gitee.com/ascend/MindSpeed.git cd MindSpeed # checkout commit from MindSpeed core_r0.6.0 git checkout ab39de78be23e88e2c8b0d25edf6135940990c02 -pip install -r requirements.txt +pip install -r requirements.txt pip3 install -e . cd .. 
# 替换MindSpeed中的文件 @@ -119,53 +117,49 @@ MindSpeed-MM修改了部分原始网络的结构名称,使用examples/qwen2vl/ 修改qwen2vl_convert_to_mm_ckpt.py中的如下内容,与实际保持一致: ```python -hg_ckpt_dir = 'ckpt/hf_path/Qwen2-VL-72B-Instruct' # huggingface权重目录 +hf_ckpt_dir = 'ckpt/hf_path/Qwen2-VL-72B-Instruct' # huggingface权重目录 mm_save_dir = 'ckpt/mm_path/Qwen2-VL-72B-Instruct' # 转换后保存目录 -pp_size = 16 # 切分的PPstage数量,注意要和finetune脚本中配置的PP一致 +model_size = "72B" # 根据需要转换的模型,指定配置( 2B 7B 72B ) +#model parameters +model_config = MODEL_CONFIG_DICT[model_size] -llm_num_layers = 80 #LLM的总层数 +#PP parameters: 72B +pp_size = 16 # 切分的PPstage数量,注意要和finetune脚本中配置的PP一致 llm_pipeline_num_layers = [4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 4] # LLM在每个卡上切分的层数,和为 llm_num_layers,注意要和model.json中配置的pipeline_num_layers一致 - -vit_num_layers = 32 # vit的总层数 vit_pipeline_num_layers = [32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] # vit在每个卡上切分的层数,和为 vit_num_layers,注意要和model.json中配置的pipeline_num_layers一致 -vit_hidden_size = 1280 # vit的隐藏层size -vit_attention_heads_num = 16 # vit的注意力heads数 ``` 以Qwen2VL-7B为例 修改qwen2vl_convert_to_mm_ckpt.py中的如下内容,与实际保持一致: ```python -hg_ckpt_dir = 'ckpt/hf_path/Qwen2-VL-7B-Instruct' # huggingface权重目录 -mm_save_dir = 'ckpt/mm_path/Qwen2-VL-7B-Instruct' # 转换后保存目录 -pp_size = 4 # 切分的PPstage数量 - -llm_num_layers = 28 #LLM的总层数 -llm_pipeline_num_layers = [1, 6, 11, 10] # LLM在每个卡上切分的层数,和为 llm_num_layers +hf_ckpt_dir = "ckpt/hf_path/Qwen2-VL-7B-Instruct" #hf原始的权重保存路径 +mm_save_dir = 'ckpt/mm_path/Qwen2-VL-7B-Instruct' #转换后的权重保存路径 +model_size = "7B" # 根据需要转换的模型,指定配置( 2B 7B 72B ) +#model parameters +model_config = MODEL_CONFIG_DICT[model_size] -vit_num_layers = 32 # vit的总层数 -vit_pipeline_num_layers = [32, 0, 0, 0] # vit在每个卡上切分的层数,和为 vit_num_layers - -vit_hidden_size = 1280 # vit的隐藏层size -vit_attention_heads_num = 16 # vit的注意力heads数 +#PP parameters: 7B +pp_size = 4 +vit_pipeline_num_layers = [32, 0, 0, 0] # LLM在每个卡上切分的层数,和为llm_num_layers,注意要和model.json中配置的pipeline_num_layers一致 +llm_pipeline_num_layers = [1, 6, 11, 10] # vit在每个卡上切分的层数,和为vit_num_layers,注意要和model.json中配置的pipeline_num_layers一致 ``` 以Qwen2VL-2B为例 修改qwen2vl_convert_to_mm_ckpt.py中的如下内容,与实际保持一致: ```python -hg_ckpt_dir = 'ckpt/hf_path/Qwen2-VL-2B-Instruct' # huggingface权重目录 +hf_ckpt_dir = 'ckpt/hf_path/Qwen2-VL-2B-Instruct' # huggingface权重目录 mm_save_dir = 'ckpt/mm_path/Qwen2-VL-2B-Instruct' # 转换后保存目录 -pp_size = 1 # 2B不需要切分PP - -llm_num_layers = 28 #LLM的总层数 -llm_pipeline_num_layers = [28] # LLM在每个卡上切分的层数,和为llm_num_layers +model_size = "2B" # 根据需要转换的模型,指定配置( 2B 7B 72B ) +#model parameters +model_config = MODEL_CONFIG_DICT[model_size] -vit_num_layers = 32 # vit的总层数 -vit_pipeline_num_layers = [32] # vit在每个卡上切分的层数,和为vit_num_layers +#PP parameters: 2B +pp_size = 1 # 2B不需要切分PP +llm_pipeline_num_layers = [28] # LLM在每个卡上切分的层数,和为llm_num_layers,注意要和model.json中配置的pipeline_num_layers一致 +vit_pipeline_num_layers = [32] # vit在每个卡上切分的层数,和为vit_num_layers,注意要和model.json中配置的pipeline_num_layers一致 -vit_hidden_size = 1280 # vit的隐藏层size -vit_attention_heads_num = 16 # vit的注意力heads数 ``` 启动脚本 @@ -199,7 +193,7 @@ LOAD_PATH="ckpt/Qwen2-VL-7B-Instruct" ├── data ├── COCO2017 ├── train2017 - + ├── llava_instruct_150k.json ├── mllm_format_llava_instruct_data.json ... 
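The pp_size, vit_pipeline_num_layers and llm_pipeline_num_layers settings shown above must line up with the pipeline configuration in model.json: one entry per pipeline stage, with the entries summing to the full ViT and LLM depth. A minimal sketch of that bookkeeping, assuming the 7B split from this README, mirrors what check_pp_config and merge_pp_index enforce:

```python
# Sketch of the per-stage layer plan for the assumed 7B split.
vit_num_layers, llm_num_layers, pp_size = 32, 28, 4
vit_pipeline_num_layers = [32, 0, 0, 0]   # ViT layers held by each pipeline stage
llm_pipeline_num_layers = [1, 6, 11, 10]  # LLM layers held by each pipeline stage

assert len(vit_pipeline_num_layers) == pp_size and len(llm_pipeline_num_layers) == pp_size
assert sum(vit_pipeline_num_layers) == vit_num_layers
assert sum(llm_pipeline_num_layers) == llm_num_layers

for rank, (vit_n, llm_n) in enumerate(zip(vit_pipeline_num_layers, llm_pipeline_num_layers)):
    print(f"pp rank {rank}: {vit_n} ViT layers + {llm_n} LLM layers")
```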
@@ -212,6 +206,32 @@ dataset_param->basic_parameters->dataset 同时注意`data.json`中`dataset_param->basic_parameters->max_samples`的配置,会限制数据只读`max_samples`条,这样可以快速验证功能。如果正式训练时,可以把该参数去掉则读取全部的数据。 +#### 2.纯文本或有图无图混合训练数据(以LLaVA-Instruct-150K为例): +现在本框架已经支持纯文本/混合数据(有图像和无图像数据混合训练)。 + +在数据构造时,对于包含图片的数据,需要保留`image`这个键值。 +```python +{ + "id": your_id, + "image": your_image_path, + "conversations": [ + {"from": "human", "value": your_query}, + {"from": "gpt", "value": your_response}, + ], +} +``` + +在数据构造时,对于纯文本数据,可以去除`image`这个键值。 +```python +{ + "id": your_id, + "conversations": [ + {"from": "human", "value": your_query}, + {"from": "gpt", "value": your_response}, + ], +} +``` + ## 微调 @@ -262,7 +282,7 @@ dataset_param->basic_parameters->dataset ```shell ... # 加载路径 -LOAD_PATH="ckpt/Qwen2-VL-7B-Instruct" +LOAD_PATH="ckpt/mm_path/Qwen2-VL-7B-Instruct" # 保存路径 SAVE_PATH="save_dir" ... @@ -297,13 +317,13 @@ $save_dir ```shell # 根据实际情况修改 ascend-toolkit 路径 -source /usr/local/Ascend/ascend-toolkit/set_env.sh -GPUS_PER_NODE=8 +source /usr/local/Ascend/ascend-toolkit/set_env.sh +NPUS_PER_NODE=8 MASTER_ADDR=locahost MASTER_PORT=29501 NNODES=1 NODE_RANK=0 -WORLD_SIZE=$(($GPUS_PER_NODE * $NNODES)) +WORLD_SIZE=$(($NPUS_PER_NODE * $NNODES)) ``` @@ -313,20 +333,10 @@ WORLD_SIZE=$(($GPUS_PER_NODE * $NNODES)) 以Qwen2VL-7B为例,启动微调训练任务。 ```shell -bash examples/qawen2vl/finetune_qwen2vl_7b.sh +bash examples/qwen2vl/finetune_qwen2vl_7b.sh ``` - -## LoRA - -LoRA为框架通用能力,如需在基线脚本上增加LoRA能力请参考LoRA特性文档[docs/features/lora_finetune.md](https://gitee.com/ascend/MindSpeed-MM/blob/master/docs/features/lora_finetune.md) - - - - - - ## 推理 #### 1、准备工作(以微调环境为基础,包括环境安装、权重下载及转换-目前支持PP切分的推理) @@ -369,19 +379,54 @@ model_path = "Qwen2-VL-7B-Instruct" # hf原仓目录 修改qwen2vl_convert_to_hf.py中的如下内容,与qwen2vl_convert_to_mm_ckpt.py保持一致: ```python pp_size = 4 -vit_num_layers = 32 vit_pipeline_num_layers = [32, 0, 0, 0] -llm_num_layers = 28 llm_pipeline_num_layers = [1, 6, 11, 10] +``` +在qwen2vl_convert_to_hf.py中根据模型选择模型配置 +```python +model_size = "7B" # 根据需要转换的模型,指定配置( 2B 7B 72B ) +#model parameters +model_config = MODEL_CONFIG_DICT[model_size] +``` -vit_hidden_size = 1280 -vit_attention_heads_num = 16 +#### 3.执行转换脚本 +```bash +python examples/qwen2vl/qwen2vl_convert_to_hf.py ``` +## 训练后重新切分权重(pp切分) + +权重下载及转换部分会把权重进行pp切分,在微调后,如果需要对权重重新进行pp切分,可使用examples/qwen2vl/qwen2vl_convert_pp_to_pp.py脚本对微调后的权重进行切分 + +#### 1.修改路径 +修改qwen2vl_convert_pp_to_pp.py中的如下内容,与实际保持一致: +```python +mm_save_dir = "save_dir" # 微调后保存的权重目录 +new_save_dir = "new_pp_save_dir" # 希望重新pp切分后保存的目录 +``` + +#### 2.修改配置 +修改qwen2vl_convert_to_hf.py中的如下内容,与qwen2vl_convert_to_mm_ckpt.py保持一致: +```python +vit_num_layers = 32 +llm_num_layers = 28 +``` + +```python +old_pp_size = 4 +old_vit_pipeline_num_layers = [32, 0, 0, 0] +old_llm_pipeline_num_layers = [1, 6, 11, 10] +``` +修改qwen2vl_convert_to_hf.py中的如下内容,使之与期望的切分配置一致 +```python +new_pp_size = 2 +new_vit_pipeline_num_layers = [32, 0] +new_llm_pipeline_num_layers = [14, 14] +``` #### 3.执行转换脚本 ```bash -python examples/qwen2vl/qwen2vl_convert_to_hf.py +python examples/qwen2vl/qwen2vl_convert_pp_to_pp.py ``` @@ -446,4 +491,4 @@ bash examples/qwen2vl/evaluate_qwen2vl_7b.sh ## 注意事项 1. 在使用流水线并行策略进行多机训练可能会出现卡住现象,可参考[此处](https://gitee.com/ascend/MindSpeed/pulls/1627/files)修改。 2. 在 `finetune_xx.sh`里,与模型结构相关的参数并不生效,以`examples/qwen2vl/model_xb.json`里同名参数配置为准,非模型结构的训练相关参数在 `finetune_xx.sh`修改。 - +3. 
LoRA为框架通用能力,当前功能已支持,可参考[LoRA特性文档](https://gitee.com/ascend/MindSpeed-MM/blob/master/docs/features/lora_finetune.md)。 diff --git a/examples/qwen2vl/data_2b.json b/examples/qwen2vl/data_2b.json index cb0efe1f..0bd5a7ec 100644 --- a/examples/qwen2vl/data_2b.json +++ b/examples/qwen2vl/data_2b.json @@ -43,6 +43,8 @@ "collate_param": { "model_name": "qwen2vl", "ignore_pad_token_for_loss": true - } + }, + "pin_memory": true, + "num_workers": 8 } } \ No newline at end of file diff --git a/examples/qwen2vl/data_72b.json b/examples/qwen2vl/data_72b.json index 23020cb8..5be260f1 100644 --- a/examples/qwen2vl/data_72b.json +++ b/examples/qwen2vl/data_72b.json @@ -43,6 +43,8 @@ "collate_param": { "model_name": "qwen2vl", "ignore_pad_token_for_loss": true - } + }, + "pin_memory": true, + "num_workers": 8 } } \ No newline at end of file diff --git a/examples/qwen2vl/data_7b.json b/examples/qwen2vl/data_7b.json index 09cbfd5c..bdf383f6 100644 --- a/examples/qwen2vl/data_7b.json +++ b/examples/qwen2vl/data_7b.json @@ -25,7 +25,7 @@ }, "attr": { "system": null, - "images": null, + "images": "images", "videos": null, "messages": "messages", "role_tag": "role", @@ -43,6 +43,8 @@ "collate_param": { "model_name": "qwen2vl", "ignore_pad_token_for_loss": true - } + }, + "pin_memory": true, + "num_workers": 8 } } \ No newline at end of file diff --git a/examples/qwen2vl/dot_product_attention.py b/examples/qwen2vl/dot_product_attention.py index e6dc939b..b6e9d335 100644 --- a/examples/qwen2vl/dot_product_attention.py +++ b/examples/qwen2vl/dot_product_attention.py @@ -129,8 +129,8 @@ def dot_product_attention_forward( query, key, value, indices_q, cu_seq_lens, max_seq_lens = _unpad_input( query, key, value, attention_mask, seq_length ) - attention_mask_npu = torch.from_numpy( - np.triu(np.ones([max_seq_lens, max_seq_lens]), k=1)).bool().to(torch.cuda.current_device()) + attention_mask_npu = torch.triu( + torch.ones([max_seq_lens, max_seq_lens], dtype=torch.bool, device=query.device), diagonal=1) attn_output_unpad = torch_npu.npu_fusion_attention( query, key, value, n_head, pse=None, @@ -149,8 +149,8 @@ def dot_product_attention_forward( query = query.transpose(0, 1).contiguous() key = key.transpose(0, 1).contiguous() value = value.transpose(0, 1).contiguous() - attention_mask_npu = torch.from_numpy( - np.triu(np.ones([query.shape[1], key.shape[1]]), k=1)).bool().to(torch.cuda.current_device()) + attention_mask_npu = torch.triu( + torch.ones([query.shape[1], key.shape[1]], dtype=torch.bool, device=query.device), diagonal=1) attn_output = torch_npu.npu_fusion_attention( query, key, value, n_head, 'BSND', keep_prob=1.0, @@ -158,7 +158,7 @@ def dot_product_attention_forward( atten_mask=attention_mask_npu)[0] attn_output = rearrange(attn_output, 'b s h d -> s b (h d)', s=seq_length, b=bsz) return attn_output - + def _unpad_input( query_state: torch.Tensor, diff --git a/examples/qwen2vl/evaluate_qwen2vl_7b.sh b/examples/qwen2vl/evaluate_qwen2vl_7b.sh index fde5aa1c..d05ad470 100644 --- a/examples/qwen2vl/evaluate_qwen2vl_7b.sh +++ b/examples/qwen2vl/evaluate_qwen2vl_7b.sh @@ -13,7 +13,6 @@ export HCCL_CONNECT_TIMEOUT=7200 export HCCL_EXEC_TIMEOUT=7200 export NPU_ASD_ENABLE=0 export ASCEND_LAUNCH_BLOCKING=0 -export HOST_CACHE_CAPACITY=20 export ACLNN_CACHE_LIMIT=100000 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True export TOKENIZERS_PARALLELISM=false diff --git a/examples/qwen2vl/finetune_qwen2vl_2b.sh b/examples/qwen2vl/finetune_qwen2vl_2b.sh index 0b561960..dc425f85 100644 --- 
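The dot_product_attention.py hunk above replaces the NumPy-built upper-triangular mask with torch.triu created directly on the query's device, avoiding a host-side detour. A small CPU-only check (assuming only torch and numpy are available) that the two constructions yield the same boolean causal mask:

```python
import numpy as np
import torch

seq_len = 8
old_mask = torch.from_numpy(np.triu(np.ones([seq_len, seq_len]), k=1)).bool()
new_mask = torch.triu(torch.ones([seq_len, seq_len], dtype=torch.bool), diagonal=1)
assert torch.equal(old_mask, new_mask)
```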
a/examples/qwen2vl/finetune_qwen2vl_2b.sh +++ b/examples/qwen2vl/finetune_qwen2vl_2b.sh @@ -10,7 +10,6 @@ export CPU_AFFINITY_CONF=1 export HCCL_CONNECT_TIMEOUT=1200 export NPU_ASD_ENABLE=0 export ASCEND_LAUNCH_BLOCKING=0 -export HOST_CACHE_CAPACITY=20 export ACLNN_CACHE_LIMIT=100000 NPUS_PER_NODE=8 @@ -24,7 +23,7 @@ WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) MM_DATA="./examples/qwen2vl/data_2b.json" MM_MODEL="./examples/qwen2vl/model_2b.json" MM_TOOL="./mindspeed_mm/tools/tools.json" -LOAD_PATH="ckpt/Qwen2-VL-7B-Instruct" +LOAD_PATH="ckpt/mm_path/Qwen2-VL-2B-Instruct" SAVE_PATH="save_dir" TP=1 diff --git a/examples/qwen2vl/finetune_qwen2vl_72b.sh b/examples/qwen2vl/finetune_qwen2vl_72b.sh index 99f7c759..ecbf1fe4 100644 --- a/examples/qwen2vl/finetune_qwen2vl_72b.sh +++ b/examples/qwen2vl/finetune_qwen2vl_72b.sh @@ -10,7 +10,6 @@ export CPU_AFFINITY_CONF=2 export HCCL_CONNECT_TIMEOUT=1200 export NPU_ASD_ENABLE=0 export ASCEND_LAUNCH_BLOCKING=0 -export HOST_CACHE_CAPACITY=20 export ACLNN_CACHE_LIMIT=100000 export MULTI_STREAM_MEMORY_REUSE=2 export PYTORCH_NPU_ALLOC_CONF="expandable_segments:True" @@ -34,7 +33,7 @@ MM_DATA="./examples/qwen2vl/data_72b.json" MM_MODEL="./examples/qwen2vl/model_72b.json" MM_TOOL="./mindspeed_mm/tools/tools.json" # 需要先根据readme把huggingface格式模型转换为mm格式 -LOAD_PATH="ckpt/Qwen2-VL-72B-Instruct" +LOAD_PATH="ckpt/mm_path/Qwen2-VL-72B-Instruct" SAVE_PATH="save_dir" TP=1 diff --git a/examples/qwen2vl/finetune_qwen2vl_7b.sh b/examples/qwen2vl/finetune_qwen2vl_7b.sh index a271be79..5435a79e 100644 --- a/examples/qwen2vl/finetune_qwen2vl_7b.sh +++ b/examples/qwen2vl/finetune_qwen2vl_7b.sh @@ -10,7 +10,6 @@ export CPU_AFFINITY_CONF=1 export HCCL_CONNECT_TIMEOUT=1200 export NPU_ASD_ENABLE=0 export ASCEND_LAUNCH_BLOCKING=0 -export HOST_CACHE_CAPACITY=20 export ACLNN_CACHE_LIMIT=100000 NPUS_PER_NODE=8 @@ -24,10 +23,8 @@ WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) MM_DATA="./examples/qwen2vl/data_7b.json" MM_MODEL="./examples/qwen2vl/model_7b.json" MM_TOOL="./mindspeed_mm/tools/tools.json" -LOAD_PATH="./ckpt/mm_path/Qwen2-VL-7B-Instruct" -# timestamp=$(date +"%Y%m%d_%H%M%S") -# SAVE_PATH=$("save_dir/" + $timestamp) -SAVE_PATH="save_dir/" +LOAD_PATH="ckpt/mm_path/Qwen2-VL-7B-Instruct" +SAVE_PATH="save_dir" TP=1 PP=4 @@ -70,7 +67,7 @@ GPT_ARGS=" --lr 1.0e-5 \ --lr-decay-style cosine \ --weight-decay 0 \ - --train-iters 100 \ + --train-iters 10000 \ --lr-warmup-fraction 0.1 \ --clip-grad 0.0 \ --adam-beta1 0.9 \ @@ -97,9 +94,9 @@ MM_ARGS=" OUTPUT_ARGS=" --log-interval 1 \ - --save-interval 100 \ - --eval-interval 100 \ - --eval-iters 100 \ + --save-interval 10000 \ + --eval-interval 10000 \ + --eval-iters 5000 \ --save $SAVE_PATH \ " logfile=$(date +%Y%m%d)_$(date +%H%M%S) diff --git a/examples/qwen2vl/inference_qwen2vl_2b.json b/examples/qwen2vl/inference_qwen2vl_2b.json new file mode 100644 index 00000000..2c239680 --- /dev/null +++ b/examples/qwen2vl/inference_qwen2vl_2b.json @@ -0,0 +1,159 @@ +{ + "pipeline_class": "Qwen2VlPipeline", + "img_context_token_id": 151655, + "image_encoder": { + "vision_encoder": { + "model_id": "qwen2vit", + "num_layers": 32, + "hidden_size": 1280, + "ffn_hidden_size": 5120, + "llm_hidden_size": 1536, + "num_attention_heads": 16, + "hidden_dropout": 0.0, + "attention_dropout": 0.0, + "in_channels": 3, + "patch_size": 14, + "spatial_merge_size": 2, + "temporal_patch_size": 2, + "layernorm_epsilon": 1e-06, + "normalization": "LayerNorm", + "fp16": false, + "bf16": true, + "params_dtype": "bf16", + "activation_func": "quick_gelu", + "freeze": true, + 
"use_fused_rotary_pos_emb": true, + "post_layer_norm": false, + "pipeline_num_layers": [32] + }, + "vision_projector": { + "model_id": "lnmlp", + "num_layers": 1, + "num_attention_heads": 1, + "gated_linear_unit": false, + "bias_activation_fusion": false, + "add_bias_linear": true, + "input_size": 1280, + "hidden_size": 1536, + "ffn_hidden_size": 5120, + "activation_func": "gelu", + "bf16": true, + "params_dtype": "bf16", + "freeze": true + } + }, + "text_decoder": { + "model_id": "qwen2lm", + "num_layers": 28, + "pipeline_num_layers": [28], + "hidden_size": 1536, + "ffn_hidden_size": 8960, + "num_attention_heads": 12, + "seq_length": 1024, + "max_position_embeddings": 32768, + "vocab_size": 151936, + "rope_theta": 1000000.0, + "untie_embeddings_and_output_weights": false, + "disable_bias_linear": true, + "attention_dropout": 0.0, + "init_method_std": 0.01, + "hidden_dropout": 0.0, + "position_embedding_type": "mrope", + "normalization": "RMSNorm", + "activation_func": "silu", + "use_fused_rotary_pos_emb": true, + "attention_softmax_in_fp32": true, + "params_dtype": "bf16", + "bf16": true, + "parallel_output": true, + "group_query_attention": true, + "num_query_groups": 2, + "mrope_section": [16, 24, 24], + "rope_scaling": null, + "gated_linear_unit": true, + "layernorm_epsilon": 1e-06, + "add_bias_linear":false, + "add_qkv_bias": true, + "sequence_parallel": false, + "tokenizer_type": "PretrainedFromHF", + "is_encoder_decoder": false + }, + "text_encoder": null, + "video_encoder": null, + "dtype": "bf16", + "device": "npu", + "tokenizer": { + "hub_backend": "hf", + "autotokenizer_name": "AutoTokenizer", + "from_pretrained": "ckpt/hf_path/Qwen2-VL-2B-Instruct", + "local_files_only":false + }, + "generation_config": { + "bos_token_id": 151643, + "do_sample": true, + "output_attentions": false, + "output_hidden_states": false, + "max_length": 20, + "min_length": 0, + "min_new_tokens": null, + "constraints": null, + "prompt_lookup_num_tokens": null, + "guidance_scale": null, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "diversity_penalty": 0.0, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "encoder_repetition_penalty": 1.0, + "epsilon_cutoff": 0.0, + "eta_cutoff": 0.0, + "exponential_decay_length_penalty": null, + "forced_bos_token_id": null, + "forced_decoder_ids": null, + "forced_eos_token_id": null, + "length_penalty": 1.0, + "low_memory": null, + "max_time": null, + "no_repeat_ngram_size": 0, + "num_assistant_tokens": 5, + "num_assistant_tokens_schedule": "heuristic", + "num_beam_groups": 1, + "num_return_groups": 1, + "num_return_sequences": 1, + "output_scores": false, + "output_logits": null, + "penalty_alpha": null, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict_in_generate": false, + "sequence_bias": null, + "suppress_tokens": null, + "typical_p": 1.0, + "force_words_ids": null, + "num_beams": 1, + "renormalize_logits": false, + "use_cache": true, + "eos_token_id": [ + 151645, + 151643 + ], + "max_new_tokens": 256, + "pad_token_id": 151643, + "temperature": 0.01, + "top_k": 1, + "top_p": 0.001, + "dola_layers": null, + "cache_implementation": null, + "cache_config": null, + "return_legacy_cache": null, + "min_p": null, + "token_healing": false, + "watermarking_config": null, + "decoder_start_token_id": null, + "max_matching_ngram_size": null, + "stop_strings": null + }, + "image_processer_path": "ckpt/hf_path/Qwen2-VL-2B-Instruct/preprocessor_config.json", + "image_path": "examples/qwen2vl/demo.jpeg", + "prompts": 
"Describe this image and keep it within 100 words." +} diff --git a/examples/qwen2vl/inference_qwen2vl_2b.sh b/examples/qwen2vl/inference_qwen2vl_2b.sh new file mode 100644 index 00000000..31b90a02 --- /dev/null +++ b/examples/qwen2vl/inference_qwen2vl_2b.sh @@ -0,0 +1,87 @@ +#!/bin/bash +source /usr/local/Ascend/ascend-toolkit/set_env.sh + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export ASCEND_SLOG_PRINT_TO_STDOUT=0 +export ASCEND_GLOBAL_LOG_LEVEL=3 +export TASK_QUEUE_ENABLE=2 +export COMBINED_ENABLE=1 +export CPU_AFFINITY_CONF=1 +export HCCL_CONNECT_TIMEOUT=1200 +export NPU_ASD_ENABLE=0 +export ACLNN_CACHE_LIMIT=100000 +export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True + +NPUS_PER_NODE=1 +MASTER_ADDR=localhost +MASTER_PORT=6000 +NNODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) + +MM_MODEL="./examples/qwen2vl/inference_qwen2vl_2b.json" +LOAD_PATH="ckpt/mm_path/Qwen2-VL-2B-Instruct" + +TP=1 +PP=1 +CP=1 +SEQ_LEN=1024 +MBS=1 +GRAD_ACC_STEP=1 +DP=$(($WORLD_SIZE/$TP/$PP/$CP)) +GBS=$(($MBS*$GRAD_ACC_STEP*$DP)) + +DISTRIBUTED_ARGS=" + --nproc_per_node $NPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT +" + +GPT_ARGS=" + --use-mcore-models \ + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --micro-batch-size ${MBS} \ + --global-batch-size ${GBS} \ + --num-layers 1 \ + --hidden-size 1 \ + --ffn-hidden-size 1 \ + --num-attention-heads 1 \ + --tokenizer-type NullTokenizer \ + --vocab-size 1 \ + --seq-length 1 \ + --max-position-embeddings 1 \ + --make-vocab-size-divisible-by 1 \ + --init-method-std 0.01 \ + --normalization RMSNorm \ + --use-fused-rmsnorm \ + --swiglu \ + --use-fused-swiglu \ + --seed 42 \ + --bf16 \ + --load $LOAD_PATH \ + --variable-seq-lengths \ + --enable-one-logger \ + --use-flash-attn \ + --no-load-optim \ + --no-load-rng +" + +MM_ARGS=" + --mm-model $MM_MODEL +" + +OUTPUT_ARGS=" + --log-interval 1 \ + --save-interval 10000 \ + --eval-interval 10000 \ + --eval-iters 5000 \ +" + +torchrun $DISTRIBUTED_ARGS inference_vlm.py \ + $GPT_ARGS \ + $MM_ARGS \ + $OUTPUT_ARGS \ + --distributed-backend nccl diff --git a/examples/qwen2vl/inference_qwen2vl_72b.json b/examples/qwen2vl/inference_qwen2vl_72b.json new file mode 100644 index 00000000..66400587 --- /dev/null +++ b/examples/qwen2vl/inference_qwen2vl_72b.json @@ -0,0 +1,158 @@ +{ + "pipeline_class": "Qwen2VlPipeline", + "img_context_token_id": 151655, + "image_encoder": { + "vision_encoder": { + "model_id": "qwen2vit", + "num_layers": 32, + "hidden_size": 1280, + "ffn_hidden_size": 5120, + "llm_hidden_size": 8192, + "num_attention_heads": 16, + "hidden_dropout": 0.0, + "attention_dropout": 0.0, + "in_channels": 3, + "patch_size": 14, + "spatial_merge_size": 2, + "temporal_patch_size": 2, + "layernorm_epsilon": 1e-06, + "normalization": "LayerNorm", + "fp16": false, + "bf16": true, + "params_dtype": "bf16", + "activation_func": "quick_gelu", + "freeze": true, + "use_fused_rotary_pos_emb": true, + "post_layer_norm": false, + "pipeline_num_layers": [32, 0, 0, 0, 0, 0, 0, 0] + }, + "vision_projector": { + "model_id": "lnmlp", + "num_layers": 1, + "num_attention_heads": 1, + "gated_linear_unit": false, + "bias_activation_fusion": false, + "add_bias_linear": true, + "input_size": 1280, + "hidden_size": 8192, + "ffn_hidden_size": 5120, + "activation_func": "gelu", + "bf16": true, + "params_dtype": "bf16", + "freeze": true + } + }, + "text_decoder": { + "model_id": "qwen2lm", + "num_layers": 80, + "pipeline_num_layers": 
[8, 10, 10, 10, 10, 12, 12, 8], + "hidden_size": 8192, + "ffn_hidden_size": 29568, + "num_attention_heads": 64, + "max_position_embeddings": 32768, + "vocab_size": 152064, + "rope_theta": 1000000.0, + "untie_embeddings_and_output_weights": true, + "disable_bias_linear": true, + "attention_dropout": 0.0, + "init_method_std": 0.01, + "hidden_dropout": 0.0, + "position_embedding_type": "mrope", + "normalization": "RMSNorm", + "activation_func": "silu", + "use_fused_rotary_pos_emb": true, + "attention_softmax_in_fp32": true, + "params_dtype": "bf16", + "bf16": true, + "parallel_output": true, + "group_query_attention": true, + "num_query_groups": 8, + "mrope_section": [16, 24, 24], + "rope_scaling": null, + "gated_linear_unit": true, + "layernorm_epsilon": 1e-06, + "add_bias_linear":false, + "add_qkv_bias": true, + "sequence_parallel": false, + "tokenizer_type": "PretrainedFromHF", + "is_encoder_decoder": false + }, + "text_encoder": null, + "video_encoder": null, + "dtype": "bf16", + "device": "npu", + "tokenizer": { + "hub_backend": "hf", + "autotokenizer_name": "AutoTokenizer", + "from_pretrained": "ckpt/hf_path/Qwen2-VL-72B-Instruct", + "local_files_only":false + }, + "generation_config": { + "bos_token_id": 151643, + "do_sample": true, + "output_attentions": false, + "output_hidden_states": false, + "max_length": 20, + "min_length": 0, + "min_new_tokens": null, + "constraints": null, + "prompt_lookup_num_tokens": null, + "guidance_scale": null, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "diversity_penalty": 0.0, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "encoder_repetition_penalty": 1.0, + "epsilon_cutoff": 0.0, + "eta_cutoff": 0.0, + "exponential_decay_length_penalty": null, + "forced_bos_token_id": null, + "forced_decoder_ids": null, + "forced_eos_token_id": null, + "length_penalty": 1.0, + "low_memory": null, + "max_time": null, + "no_repeat_ngram_size": 0, + "num_assistant_tokens": 5, + "num_assistant_tokens_schedule": "heuristic", + "num_beam_groups": 1, + "num_return_groups": 1, + "num_return_sequences": 1, + "output_scores": false, + "output_logits": null, + "penalty_alpha": null, + "remove_invalid_values": false, + "repetition_penalty": 1.05, + "return_dict_in_generate": false, + "sequence_bias": null, + "suppress_tokens": null, + "typical_p": 1.0, + "force_words_ids": null, + "num_beams": 1, + "renormalize_logits": false, + "use_cache": true, + "eos_token_id": [ + 151645, + 151643 + ], + "max_new_tokens": 256, + "pad_token_id": 151643, + "temperature": 0.01, + "top_k": 1, + "top_p": 0.001, + "dola_layers": null, + "cache_implementation": null, + "cache_config": null, + "return_legacy_cache": null, + "min_p": null, + "token_healing": false, + "watermarking_config": null, + "decoder_start_token_id": null, + "max_matching_ngram_size": null, + "stop_strings": null + }, + "image_processer_path": "ckpt/hf_path/Qwen2-VL-72B-Instruct/preprocessor_config.json", + "image_path": "examples/qwen2vl/demo.jpeg", + "prompts": "Describe this image and keep it within 100 words." 
+} diff --git a/examples/qwen2vl/inference_qwen2vl_72b.sh b/examples/qwen2vl/inference_qwen2vl_72b.sh new file mode 100644 index 00000000..db19254a --- /dev/null +++ b/examples/qwen2vl/inference_qwen2vl_72b.sh @@ -0,0 +1,86 @@ +#!/bin/bash +source /usr/local/Ascend/ascend-toolkit/set_env.sh + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export ASCEND_SLOG_PRINT_TO_STDOUT=0 +export ASCEND_GLOBAL_LOG_LEVEL=3 +export TASK_QUEUE_ENABLE=2 +export COMBINED_ENABLE=1 +export CPU_AFFINITY_CONF=1 +export HCCL_CONNECT_TIMEOUT=1200 +export NPU_ASD_ENABLE=0 +export ACLNN_CACHE_LIMIT=100000 +export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True + +NPUS_PER_NODE=8 +MASTER_ADDR=localhost +MASTER_PORT=6000 +NNODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) + +MM_MODEL="./examples/qwen2vl/inference_qwen2vl_72b.json" +LOAD_PATH="ckpt/mm_path/Qwen2-VL-72B-Instruct" + +TP=1 +PP=8 +CP=1 +MBS=1 +GRAD_ACC_STEP=1 +DP=$(($WORLD_SIZE/$TP/$PP/$CP)) +GBS=$(($MBS*$GRAD_ACC_STEP*$DP)) + +DISTRIBUTED_ARGS=" + --nproc_per_node $NPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT +" + +GPT_ARGS=" + --use-mcore-models \ + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --micro-batch-size ${MBS} \ + --global-batch-size ${GBS} \ + --num-layers 1 \ + --hidden-size 1 \ + --ffn-hidden-size 1 \ + --num-attention-heads 1 \ + --tokenizer-type NullTokenizer \ + --vocab-size 1 \ + --seq-length 1 \ + --max-position-embeddings 1 \ + --make-vocab-size-divisible-by 1 \ + --init-method-std 0.01 \ + --normalization RMSNorm \ + --use-fused-rmsnorm \ + --swiglu \ + --use-fused-swiglu \ + --seed 42 \ + --bf16 \ + --load $LOAD_PATH \ + --variable-seq-lengths \ + --enable-one-logger \ + --use-flash-attn \ + --no-load-optim \ + --no-load-rng +" + +MM_ARGS=" + --mm-model $MM_MODEL +" + +OUTPUT_ARGS=" + --log-interval 1 \ + --save-interval 10000 \ + --eval-interval 10000 \ + --eval-iters 5000 \ +" + +torchrun $DISTRIBUTED_ARGS inference_vlm.py \ + $GPT_ARGS \ + $MM_ARGS \ + $OUTPUT_ARGS \ + --distributed-backend nccl diff --git a/examples/qwen2vl/inference_qwen2vl_7b.json b/examples/qwen2vl/inference_qwen2vl_7b.json index a0120a93..b0126f4f 100644 --- a/examples/qwen2vl/inference_qwen2vl_7b.json +++ b/examples/qwen2vl/inference_qwen2vl_7b.json @@ -154,7 +154,6 @@ "stop_strings": null }, "image_processer_path": "ckpt/hf_path/Qwen2-VL-7B-Instruct/preprocessor_config.json", - "image_path": "", - "prompts": "Describe this image and keep it within 100 words.", - "temperature": 0 + "image_path": "examples/qwen2vl/demo.jpeg", + "prompts": "Describe this image and keep it within 100 words." 
} diff --git a/examples/qwen2vl/inference_qwen2vl_7b.sh b/examples/qwen2vl/inference_qwen2vl_7b.sh index f5b64867..f508a4ff 100644 --- a/examples/qwen2vl/inference_qwen2vl_7b.sh +++ b/examples/qwen2vl/inference_qwen2vl_7b.sh @@ -1,8 +1,6 @@ #!/bin/bash source /usr/local/Ascend/ascend-toolkit/set_env.sh -# 通过此配置选择使用的NPU卡 -# export ASCEND_RT_VISIBLE_DEVICES=1 export CUDA_DEVICE_MAX_CONNECTIONS=1 export ASCEND_SLOG_PRINT_TO_STDOUT=0 export ASCEND_GLOBAL_LOG_LEVEL=3 @@ -11,12 +9,10 @@ export COMBINED_ENABLE=1 export CPU_AFFINITY_CONF=1 export HCCL_CONNECT_TIMEOUT=1200 export NPU_ASD_ENABLE=0 -export ASCEND_LAUNCH_BLOCKING=0 -export HOST_CACHE_CAPACITY=20 export ACLNN_CACHE_LIMIT=100000 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True -NPUS_PER_NODE=4 +NPUS_PER_NODE=1 MASTER_ADDR=localhost MASTER_PORT=6000 NNODES=1 @@ -24,10 +20,10 @@ NODE_RANK=0 WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) MM_MODEL="./examples/qwen2vl/inference_qwen2vl_7b.json" -LOAD_PATH="/home/ma-user/work/MindSpeed-MM/save_dir/" +LOAD_PATH="ckpt/mm_path/Qwen2-VL-7B-Instruct" TP=1 -PP=4 +PP=1 CP=1 SEQ_LEN=1024 MBS=1 diff --git a/examples/qwen2vl/llava_instruct_2_mllm_demo_format.py b/examples/qwen2vl/llava_instruct_2_mllm_demo_format.py index 8c3b2d7b..bc30de4b 100644 --- a/examples/qwen2vl/llava_instruct_2_mllm_demo_format.py +++ b/examples/qwen2vl/llava_instruct_2_mllm_demo_format.py @@ -2,9 +2,7 @@ import json import os import stat -# llava_json_path = "./data/llava_instruct_150k_wo_img.json" -# llava_json_path = "./data/ip.json" -llava_json_path = "./data/full_data.json" +llava_json_path = "./data/llava_instruct_150k.json" mllm_format_json_path = "./data/mllm_format_llava_instruct_data.json" with open(llava_json_path, "r") as f: @@ -12,16 +10,15 @@ with open(llava_json_path, "r") as f: mllm_format_llava_instruct_data = [] for item in info_json: - # img_path = os.path.join("./data/COCO2017/train2017", item["image"]) - # img_path = os.path.join("./data/dummy", item["image"]) - img_path = os.path.join("./data", item["image"]) - if not os.path.exists(img_path): - continue - if not img_path.endswith(".jpg") and not img_path.endswith(".png"): + if item.get('image', None): new_item = { "images": [], "messages": [] } + img_path = os.path.join("./data/COCO2017/train2017", item["image"]) + print(f"img_path: {img_path}") + if not os.path.exists(img_path): + continue else: new_item = { "images": [img_path], @@ -39,7 +36,7 @@ for item in info_json: output_json = json.dumps(mllm_format_llava_instruct_data) if os.path.exists(mllm_format_json_path): - os.remove(mllm_format_json_path) + print(f"{mllm_format_json_path} already exists, please rename it or remove it") with os.fdopen(os.open(mllm_format_json_path, os.O_WRONLY | os.O_CREAT | os.O_EXCL, stat.S_IWUSR | stat.S_IRUSR), "w") as f: f.write(output_json) print(f"finish converting dataset into {mllm_format_json_path}") diff --git a/examples/qwen2vl/model_2b.json b/examples/qwen2vl/model_2b.json index e855d404..2433a4dd 100644 --- a/examples/qwen2vl/model_2b.json +++ b/examples/qwen2vl/model_2b.json @@ -24,7 +24,7 @@ "freeze": true, "use_fused_rotary_pos_emb": true, "post_layer_norm": false, - "pipeline_num_layers": [32, 0, 0, 0] + "pipeline_num_layers": [32] }, "vision_projector": { "model_id": "lnmlp", @@ -45,7 +45,7 @@ "text_decoder": { "model_id": "qwen2lm", "num_layers": 28, - "pipeline_num_layers": [1, 6, 11, 10], + "pipeline_num_layers": [28], "hidden_size": 1536, "ffn_hidden_size": 8960, "num_attention_heads": 12, diff --git a/examples/qwen2vl/qwen2vl_convert_pp_to_pp.py 
b/examples/qwen2vl/qwen2vl_convert_pp_to_pp.py new file mode 100644 index 00000000..94412980 --- /dev/null +++ b/examples/qwen2vl/qwen2vl_convert_pp_to_pp.py @@ -0,0 +1,33 @@ +from qwen2vl_convert_to_hf import load_from_mm, check_pp_config +from qwen2vl_convert_to_mm_ckpt import split_model_by_pipeline, save_by_pp, merge_pp_index + +if __name__ == "__main__": + mm_save_dir = "save_dir" # 微调后保存的权重目录 + new_save_dir = "new_pp_save_dir" # 希望重新pp切分后保存的目录 + + vit_num_layers = 32 + llm_num_layers = 28 + + old_pp_size = 4 + old_vit_pipeline_num_layers = [32, 0, 0, 0] + old_llm_pipeline_num_layers = [1, 6, 11, 10] + + new_pp_size = 2 + new_vit_pipeline_num_layers = [32, 0] + new_llm_pipeline_num_layers = [14, 14] + + check_pp_config(old_pp_size, vit_num_layers, old_vit_pipeline_num_layers, llm_num_layers, + old_llm_pipeline_num_layers) + check_pp_config(new_pp_size, vit_num_layers, new_vit_pipeline_num_layers, llm_num_layers, + new_llm_pipeline_num_layers) + state_dict = load_from_mm(mm_save_dir, old_vit_pipeline_num_layers, old_llm_pipeline_num_layers) + pp_split = merge_pp_index(new_pp_size, vit_num_layers, new_vit_pipeline_num_layers, llm_num_layers, + new_llm_pipeline_num_layers) + state_dicts, _ = split_model_by_pipeline(state_dict, pp_split) + + for rank, pipeline_state_dict in enumerate(state_dicts): + print(20 * '#', f'stage {rank}', 20 * '#') + for key, value in pipeline_state_dict.items(): + if value is not None: + print(key, value.shape) + save_by_pp(state_dicts, new_save_dir, _exists_ok=True) diff --git a/examples/qwen2vl/qwen2vl_convert_to_hf.py b/examples/qwen2vl/qwen2vl_convert_to_hf.py index 55bace3c..29a17f49 100644 --- a/examples/qwen2vl/qwen2vl_convert_to_hf.py +++ b/examples/qwen2vl/qwen2vl_convert_to_hf.py @@ -7,6 +7,39 @@ import mindspeed.megatron_adaptor # noqa import torch from safetensors.torch import save_file +MODEL_CONFIG_DICT = { + '2B': { + 'model_size': '2B', + 'vit_hidden_size': 1280, + 'vit_num_attention_heads': 16, + 'vit_num_layers': 32, + 'llm_hidden_size': 1536, + 'llm_num_query_groups': 2, + 'llm_num_attention_heads': 12, + 'llm_num_layers': 28, + }, + '7B': { + 'model_size': '7B', + 'vit_hidden_size': 1280, + 'vit_num_attention_heads': 16, + 'vit_num_layers': 32, + 'llm_hidden_size': 3584, + 'llm_num_query_groups': 4, + 'llm_num_attention_heads': 28, + 'llm_num_layers': 28, + }, + '72B': { + 'model_size': '72B', + 'vit_hidden_size': 1280, + 'vit_num_attention_heads': 16, + 'vit_num_layers': 32, + 'llm_hidden_size': 8192, + 'llm_num_query_groups': 8, + 'llm_num_attention_heads': 64, + 'llm_num_layers': 80, + } +} + def rename_pp_parameter(param_name: str, model_dir: Path, vit_pp_list: list[int], llm_pp_list: list[int]) -> str: index = int(model_dir.parent.stem.split('_')[-1]) @@ -29,14 +62,14 @@ def rename_pp_parameter(param_name: str, model_dir: Path, vit_pp_list: list[int] return param_name -def load_from_mm(_load_dir, vit_pp_list, llm_pp_list): +def load_from_mm(_load_dir: str, vit_pp_list: list[int], llm_pp_list: list[int]) -> dict: LATEST_TXT = "latest_checkpointed_iteration.txt" mm_save_dir = Path(_load_dir) save_iteration = mm_save_dir.joinpath(LATEST_TXT).read_text() - save_iter_dir = mm_save_dir.joinpath(f"iter_{int(save_iteration):07}") + save_dir = mm_save_dir.joinpath(f"iter_{int(save_iteration):07}" if save_iteration != "release" else save_iteration) state_dict = {} - print(str(save_iter_dir).center(100, "=")) - for pt_path in save_iter_dir.glob("*/*.pt"): + print(str(save_dir).center(100, "=")) + for pt_path in save_dir.glob("*/*.pt"): 
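# A toy check of the grouped-query split that convert_mm_to_hf performs below.
# For the assumed 7B config (hidden 3584, 28 heads, 4 query groups) each fused
# linear_qkv group holds 896 query rows followed by 128 key and 128 value rows,
# matching the previously hard-coded [896(q) + 128(k) + 128(v)] * 4 layout.
import torch

hidden, heads, groups = 3584, 28, 4
head_dim = hidden // heads            # 128
q_size = head_dim * heads // groups   # 896
kv_size = head_dim                    # 128

fused_qkv = torch.randn(groups * (q_size + 2 * kv_size), hidden)

q_parts, k_parts, v_parts = [], [], []
for chunk in torch.chunk(fused_qkv, groups, dim=0):
    q, k, v = torch.split(chunk, [q_size, kv_size, kv_size], dim=0)
    q_parts.append(q)
    k_parts.append(k)
    v_parts.append(v)

q_proj = torch.cat(q_parts, dim=0)  # (3584, 3584) -> self_attn.q_proj
k_proj = torch.cat(k_parts, dim=0)  # (512, 3584)  -> self_attn.k_proj
v_proj = torch.cat(v_parts, dim=0)  # (512, 3584)  -> self_attn.v_proj
print(q_proj.shape, k_proj.shape, v_proj.shape)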
print(str(pt_path).center(100, '_')) state_dict.update( {rename_pp_parameter(param, pt_path, vit_pp_list, llm_pp_list): tensor @@ -47,10 +80,23 @@ def load_from_mm(_load_dir, vit_pp_list, llm_pp_list): return state_dict -def convert_mm_to_hf(_state_dict, _vit_hidden_size, _vit_attention_heads_num): - hiddensize_per_head = _vit_hidden_size // _vit_attention_heads_num +def convert_mm_to_hf(_state_dict: dict, _model_config: dict) -> dict: + vit_hidden_size = _model_config['vit_hidden_size'] + vit_num_attention_heads = _model_config['vit_num_attention_heads'] + llm_hidden_size = _model_config['llm_hidden_size'] + llm_num_attention_heads = _model_config['llm_num_attention_heads'] + llm_num_query_groups = _model_config['llm_num_query_groups'] + + vit_head_hidden_size = vit_hidden_size // vit_num_attention_heads + llm_head_hidden_size = llm_hidden_size // llm_num_attention_heads + q_size = llm_head_hidden_size * llm_num_attention_heads // llm_num_query_groups + k_size = llm_head_hidden_size * llm_num_query_groups // llm_num_query_groups + v_size = llm_head_hidden_size * llm_num_query_groups // llm_num_query_groups + new_params = {} for key, value in _state_dict.items(): + if value is None: + continue new_key = None # image_encoder 权重转换部分 if key.startswith('image_encoder'): @@ -69,16 +115,16 @@ def convert_mm_to_hf(_state_dict, _vit_hidden_size, _vit_attention_heads_num): if 'qkv.weight' in new_key: res = value * 0 i = 0 - for j in range(_vit_attention_heads_num): - q_part = value[i * hiddensize_per_head: (i + 1) * hiddensize_per_head, :] - res[hiddensize_per_head * j: hiddensize_per_head * (j + 1), :] = q_part + for j in range(vit_num_attention_heads): + q_part = value[i * vit_head_hidden_size: (i + 1) * vit_head_hidden_size, :] + res[vit_head_hidden_size * j: vit_head_hidden_size * (j + 1), :] = q_part - k_part = value[(i + 1) * hiddensize_per_head: (i + 2) * hiddensize_per_head, :] - res[_vit_hidden_size + hiddensize_per_head * j: _vit_hidden_size + hiddensize_per_head * (j + 1), + k_part = value[(i + 1) * vit_head_hidden_size: (i + 2) * vit_head_hidden_size, :] + res[vit_hidden_size + vit_head_hidden_size * j: vit_hidden_size + vit_head_hidden_size * (j + 1), :] = k_part - v_part = value[(i + 2) * hiddensize_per_head: (i + 3) * hiddensize_per_head, :] - res[_vit_hidden_size * 2 + hiddensize_per_head * j: _vit_hidden_size * 2 + hiddensize_per_head * ( + v_part = value[(i + 2) * vit_head_hidden_size: (i + 3) * vit_head_hidden_size, :] + res[vit_hidden_size * 2 + vit_head_hidden_size * j: vit_hidden_size * 2 + vit_head_hidden_size * ( j + 1), :] = v_part i = i + 3 @@ -86,17 +132,17 @@ def convert_mm_to_hf(_state_dict, _vit_hidden_size, _vit_attention_heads_num): elif 'qkv.bias' in new_key: res = value * 0 i = 0 - for j in range(_vit_attention_heads_num): - q_part = value[i * hiddensize_per_head: (i + 1) * hiddensize_per_head] - res[hiddensize_per_head * j: hiddensize_per_head * (j + 1)] = q_part + for j in range(vit_num_attention_heads): + q_part = value[i * vit_head_hidden_size: (i + 1) * vit_head_hidden_size] + res[vit_head_hidden_size * j: vit_head_hidden_size * (j + 1)] = q_part - k_part = value[(i + 1) * hiddensize_per_head: (i + 2) * hiddensize_per_head] - res[_vit_hidden_size + hiddensize_per_head * j: _vit_hidden_size + hiddensize_per_head * ( + k_part = value[(i + 1) * vit_head_hidden_size: (i + 2) * vit_head_hidden_size] + res[vit_hidden_size + vit_head_hidden_size * j: vit_hidden_size + vit_head_hidden_size * ( j + 1)] = k_part - v_part = value[(i + 2) * hiddensize_per_head: (i + 3) * 
hiddensize_per_head] + v_part = value[(i + 2) * vit_head_hidden_size: (i + 3) * vit_head_hidden_size] res[ - _vit_hidden_size * 2 + hiddensize_per_head * j: _vit_hidden_size * 2 + hiddensize_per_head * ( + vit_hidden_size * 2 + vit_head_hidden_size * j: vit_hidden_size * 2 + vit_head_hidden_size * ( j + 1)] = v_part i = i + 3 @@ -106,55 +152,27 @@ def convert_mm_to_hf(_state_dict, _vit_hidden_size, _vit_attention_heads_num): new_params[new_key] = value else: - if 'self_attention.linear_qkv.weight' in key: - qkv_chunks = torch.chunk(value, 4, dim=0) - # qkv的结构是[896(q)+128(k)+128(v)]*4 - indices = [896, 1024] - indices = [0] + indices + [qkv_chunks[0].size(0)] - q_chunks = [] - k_chunks = [] - v_chunks = [] - for j in range(4): - splits = [qkv_chunks[j][indices[i]:indices[i + 1]] for i in range(len(indices) - 1)] - q_chunks.append(splits[0]) - k_chunks.append(splits[1]) - v_chunks.append(splits[2]) - - attention_q_weight = torch.cat(q_chunks, dim=0) - attention_k_weight = torch.cat(k_chunks, dim=0) - attention_v_weight = torch.cat(v_chunks, dim=0) - - layer = key.split('.')[3] - attention_q = f'model.layers.{layer}.self_attn.q_proj.weight' - attention_k = f'model.layers.{layer}.self_attn.k_proj.weight' - attention_v = f'model.layers.{layer}.self_attn.v_proj.weight' - - new_params[attention_q] = attention_q_weight - new_params[attention_k] = attention_k_weight - new_params[attention_v] = attention_v_weight - - elif 'self_attention.linear_qkv.bias' in key: - qkv_chunks = torch.chunk(value, 4, dim=0) - # qkv的结构是[896(q)+128(k)+128(v)]*4 - indices = [896, 1024] - indices = [0] + indices + [qkv_chunks[0].size(0)] + # self_attention.linear_qkv.weight 和 self_attention.linear_qkv.bias + if 'self_attention.linear_qkv' in key: + qkv_chunks = torch.chunk(value, llm_num_query_groups, dim=0) q_chunks = [] k_chunks = [] v_chunks = [] - for j in range(4): - splits = [qkv_chunks[j][indices[i]:indices[i + 1]] for i in range(len(indices) - 1)] - q_chunks.append(splits[0]) - k_chunks.append(splits[1]) - v_chunks.append(splits[2]) + for chunk in qkv_chunks: + q_chunk, k_chunk, v_chunk = torch.split(chunk, [q_size, k_size, v_size], dim=0) + q_chunks.append(q_chunk) + k_chunks.append(k_chunk) + v_chunks.append(v_chunk) attention_q_weight = torch.cat(q_chunks, dim=0) attention_k_weight = torch.cat(k_chunks, dim=0) attention_v_weight = torch.cat(v_chunks, dim=0) layer = key.split('.')[3] - attention_q = f'model.layers.{layer}.self_attn.q_proj.bias' - attention_k = f'model.layers.{layer}.self_attn.k_proj.bias' - attention_v = f'model.layers.{layer}.self_attn.v_proj.bias' + name = key.split('.')[-1] # weight或bias + attention_q = f'model.layers.{layer}.self_attn.q_proj.{name}' + attention_k = f'model.layers.{layer}.self_attn.k_proj.{name}' + attention_v = f'model.layers.{layer}.self_attn.v_proj.{name}' new_params[attention_q] = attention_q_weight new_params[attention_k] = attention_k_weight @@ -212,7 +230,7 @@ def copy_except_safetensors(src_dir: str, dst_dir: str) -> None: shutil.copy2(src_file, dst_file) -def check_pp_config(_pp_size, _vit_num_layers, _vit_pipeline_num_layers, _llm_num_layers, _llm_pipeline_num_layers): +def check_pp_config(_pp_size: int, _vit_num_layers: int, _vit_pipeline_num_layers: list[int], _llm_num_layers: int, _llm_pipeline_num_layers: list[int]) -> None: if len(_vit_pipeline_num_layers) != _pp_size: raise AssertionError(f'length of vit_pipeline_num_layers must be equal to pp_size, ' f'but got {len(_vit_pipeline_num_layers)} and {_pp_size}.') @@ -227,7 +245,7 @@ def check_pp_config(_pp_size, 
_vit_num_layers, _vit_pipeline_num_layers, _llm_nu f'but got {sum(_llm_pipeline_num_layers)} and {_llm_num_layers}.') -def split_by_index_json(_state_dict, _model_path): +def split_by_index_json(_state_dict: dict, _model_path: str) -> list[dict]: index_json_path = os.path.join(_model_path, 'model.safetensors.index.json') return_dicts = [] with open(index_json_path, 'r', encoding='utf-8') as file: @@ -240,7 +258,7 @@ def split_by_index_json(_state_dict, _model_path): return return_dicts -def save_by_index_json(_state_dicts, _save_dir): +def save_by_index_json(_state_dicts: list[dict], _save_dir: str) -> None: metadata = { 'format': 'pt' } @@ -250,22 +268,21 @@ def save_by_index_json(_state_dicts, _save_dir): if __name__ == "__main__": - mm_save_dir = "save_dir" # 微调后保存的权重目录 - hg_save_dir = "Qwen2-VL-7B-Save" # 希望保存的hf目录 - model_path = "Qwen2-VL-7B-Instruct" # hf原仓目录 - + mm_save_dir = "save_dir" # 微调后保存的权重目录 + hf_save_dir = "Qwen2-VL-7B-Save" # 希望保存的hf目录 + model_path = "ckpt/hf_path/Qwen2-VL-7B-Instruct" # hf原仓目录 + model_size = "7B" # 根据需要转换的模型,指定配置( 2B 7B 72B ) + #model parameters + model_config = MODEL_CONFIG_DICT[model_size] + + #PP parameters: 7B pp_size = 4 - vit_num_layers = 32 vit_pipeline_num_layers = [32, 0, 0, 0] - llm_num_layers = 28 llm_pipeline_num_layers = [1, 6, 11, 10] - vit_hidden_size = 1280 - vit_attention_heads_num = 16 - - check_pp_config(pp_size, vit_num_layers, vit_pipeline_num_layers, llm_num_layers, llm_pipeline_num_layers) + check_pp_config(pp_size, model_config["vit_num_layers"], vit_pipeline_num_layers, model_config["llm_num_layers"], llm_pipeline_num_layers) state_dict = load_from_mm(mm_save_dir, vit_pipeline_num_layers, llm_pipeline_num_layers) - state_dict = convert_mm_to_hf(state_dict, vit_hidden_size, vit_attention_heads_num) + state_dict = convert_mm_to_hf(state_dict, model_config) state_dicts = split_by_index_json(state_dict, model_path) - copy_except_safetensors(model_path, hg_save_dir) - save_by_index_json(state_dicts, hg_save_dir) + copy_except_safetensors(model_path, hf_save_dir) + save_by_index_json(state_dicts, hf_save_dir) diff --git a/examples/qwen2vl/qwen2vl_convert_to_mm_ckpt.py b/examples/qwen2vl/qwen2vl_convert_to_mm_ckpt.py index b85e6c5f..e5f6702e 100644 --- a/examples/qwen2vl/qwen2vl_convert_to_mm_ckpt.py +++ b/examples/qwen2vl/qwen2vl_convert_to_mm_ckpt.py @@ -6,17 +6,53 @@ from copy import deepcopy import torch from safetensors.torch import load_file +MODEL_CONFIG_DICT = { + '2B': { + 'model_size': '2B', + 'vit_hidden_size': 1280, + 'vit_num_attention_heads': 16, + 'vit_num_layers': 32, + 'llm_hidden_size': 1536, + 'llm_num_query_groups': 2, + 'llm_num_attention_heads': 12, + 'llm_num_layers': 28, + }, + '7B': { + 'model_size': '7B', + 'vit_hidden_size': 1280, + 'vit_num_attention_heads': 16, + 'vit_num_layers': 32, + 'llm_hidden_size': 3584, + 'llm_num_query_groups': 4, + 'llm_num_attention_heads': 28, + 'llm_num_layers': 28, + }, + '72B': { + 'model_size': '72B', + 'vit_hidden_size': 1280, + 'vit_num_attention_heads': 16, + 'vit_num_layers': 32, + 'llm_hidden_size': 8192, + 'llm_num_query_groups': 8, + 'llm_num_attention_heads': 64, + 'llm_num_layers': 80, + } +} + def load_from_hf(_load_dir): # Load Huggingface model 。 load_dir = Path(_load_dir) + safetensors_files = list(load_dir.glob("*.safetensors")) + if not safetensors_files: + raise FileNotFoundError(f"No *.safetensors files found in {load_dir}") state_dict = {} - for safe_path in load_dir.glob("*.safetensors"): + for safe_path in safetensors_files: 
state_dict.update(load_file(str(safe_path), device='cpu')) return state_dict -def convert_hg_to_mm(_state_dict, _num_layers, _vit_hidden_size, _vit_attention_heads_num): +def convert_hf_to_mm(_state_dict, _num_layers, _vit_hidden_size, _vit_attention_heads_num, _llm_num_query_groups): hiddensize_per_head = _vit_hidden_size // _vit_attention_heads_num new_params = {} for key, value in _state_dict.items(): @@ -132,11 +168,11 @@ def convert_hg_to_mm(_state_dict, _num_layers, _vit_hidden_size, _vit_attention_ if attention_v in new_params.keys(): attention_v_weight = new_params[attention_v] - q_chunks = torch.chunk(attention_q_weight, 4, dim=0) - k_chunks = torch.chunk(attention_k_weight, 4, dim=0) - v_chunks = torch.chunk(attention_v_weight, 4, dim=0) + q_chunks = torch.chunk(attention_q_weight, _llm_num_query_groups, dim=0) + k_chunks = torch.chunk(attention_k_weight, _llm_num_query_groups, dim=0) + v_chunks = torch.chunk(attention_v_weight, _llm_num_query_groups, dim=0) all_chunks = [] - for j in range(4): + for j in range(_llm_num_query_groups): all_chunks.append(q_chunks[j]) all_chunks.append(k_chunks[j]) all_chunks.append(v_chunks[j]) @@ -168,11 +204,11 @@ def convert_hg_to_mm(_state_dict, _num_layers, _vit_hidden_size, _vit_attention_ else: continue - q_chunks1 = torch.chunk(attention_q_bias, 4, dim=0) - k_chunks1 = torch.chunk(attention_k_bias, 4, dim=0) - v_chunks1 = torch.chunk(attention_v_bias, 4, dim=0) + q_chunks1 = torch.chunk(attention_q_bias, _llm_num_query_groups, dim=0) + k_chunks1 = torch.chunk(attention_k_bias, _llm_num_query_groups, dim=0) + v_chunks1 = torch.chunk(attention_v_bias, _llm_num_query_groups, dim=0) all_chunks1 = [] - for j in range(4): + for j in range(_llm_num_query_groups): all_chunks1.append(q_chunks1[j]) all_chunks1.append(k_chunks1[j]) all_chunks1.append(v_chunks1[j]) @@ -187,6 +223,7 @@ def convert_hg_to_mm(_state_dict, _num_layers, _vit_hidden_size, _vit_attention_ return new_params + def merge_pp_index(pp_size, vit_num_layers, vit_pipeline_num_layers, llm_num_layers, llm_pipeline_num_layers): if len(vit_pipeline_num_layers) != pp_size: raise AssertionError(f'length of vit_pipeline_num_layers must be equal to pp_size, ' @@ -205,10 +242,11 @@ def merge_pp_index(pp_size, vit_num_layers, vit_pipeline_num_layers, llm_num_lay split_method.append((vit_num, llm_num)) return split_method + def split_model_by_pipeline(state_dict, pp_split): if pp_split is None or len(pp_split) <= 1: return [state_dict], {} - + pp_size = len(pp_split) vit_range = [0, 0] llm_range = [pp_size - 1, pp_size - 1] @@ -219,7 +257,7 @@ def split_model_by_pipeline(state_dict, pp_split): llm_range[0] = pp_rank print(f'vit range: {vit_range[0]}~{vit_range[1]}') print(f'llm range: {llm_range[0]}~{llm_range[1]}') - + vit_start_idx = 0 llm_start_idx = 0 return_dicts = [] @@ -267,7 +305,8 @@ def split_model_by_pipeline(state_dict, pp_split): llm_start_idx = llm_end_idx return_dicts.append(new_dict) return return_dicts, copy_dict - + + def save_by_pp(_state_dicts, _save_dir, _lastest_checkpointed_iteration='release', _exists_ok=False): if os.path.exists(_save_dir): if not _exists_ok: @@ -305,24 +344,20 @@ def save_by_pp(_state_dicts, _save_dir, _lastest_checkpointed_iteration='release if __name__ == "__main__": - # hg_ckpt_dir = "Qwen2-VL-7B-Instruct" - # mm_save_dir = 'ckpt/Qwen2-VL-7B-Instruct' - hg_ckpt_dir = 'ckpt/hf_path/Qwen2-VL-7B-Instruct' # huggingface权重目录 - mm_save_dir = 'ckpt/mm_path/Qwen2-VL-7B-Instruct' # 转换后保存目录 - - vit_hidden_size = 1280 - vit_attention_heads_num = 16 + 
hf_ckpt_dir = "ckpt/hf_path/Qwen2-VL-7B-Instruct" # hugging face原始的权重保存路径 + mm_save_dir = 'ckpt/mm_path/Qwen2-VL-7B-Instruct' # 转换后的权重保存路径 + model_size = "7B" # 根据需要转换的模型,指定配置( 2B 7B 72B ) + #model parameters + model_config = MODEL_CONFIG_DICT[model_size] - #for 7B + #PP parameters: 7B pp_size = 4 - vit_num_layers = 32 vit_pipeline_num_layers = [32, 0, 0, 0] - llm_num_layers = 28 llm_pipeline_num_layers = [1, 6, 11, 10] - state_dict = load_from_hf(hg_ckpt_dir) - state_dict = convert_hg_to_mm(state_dict, llm_num_layers, vit_hidden_size, vit_attention_heads_num) - pp_split = merge_pp_index(pp_size, vit_num_layers, vit_pipeline_num_layers, llm_num_layers, llm_pipeline_num_layers) + state_dict = load_from_hf(hf_ckpt_dir) + state_dict = convert_hf_to_mm(state_dict, model_config["llm_num_layers"], model_config["vit_hidden_size"], model_config["vit_num_attention_heads"], model_config["llm_num_query_groups"]) + pp_split = merge_pp_index(pp_size, model_config["vit_num_layers"], vit_pipeline_num_layers, model_config["llm_num_layers"], llm_pipeline_num_layers) state_dicts, remains = split_model_by_pipeline(state_dict, pp_split) if len(remains) > 0: print(remains) diff --git a/inference_vlm.py b/inference_vlm.py index 4024f046..d168bfd1 100644 --- a/inference_vlm.py +++ b/inference_vlm.py @@ -1,113 +1,26 @@ -import os import torch -from time import time, sleep -from concurrent.futures import ThreadPoolExecutor import mindspeed.megatron_adaptor from megatron.training import get_args from mindspeed_mm.tasks.inference.pipeline import vlm_pipeline_dict from mindspeed_mm.configs.config import mm_extra_args_provider -from redis_utils import RedisManager - - -def inference_and_write(batch_requests, pipeline, redis: RedisManager, model_name: str): - # Extract input data - sys_prompts = [req["sys_prompt"] for req in batch_requests] - queries = [req["query"] for req in batch_requests] - main_ids = [req["message_id"] for req in batch_requests] - - full_queries = sys_prompts[0] + queries[0] - - start_time = time() - outputs = pipeline(prompt=full_queries, return_ids=True) - print(f"Inference time: {time() - start_time}") - - if not isinstance(outputs, list): - outputs = [outputs] - print(outputs) - - def write_to_redis(message_id, output): - while True: - try: - redis.write_data_to_result(model_name, message_id, {"output": output}) - break - except Exception as e: - continue - - # Multithreaded writing to Redis - with ThreadPoolExecutor() as executor: - futures = [ - executor.submit(write_to_redis, message_id, output) - for message_id, output in zip(main_ids, outputs) - ] - - # Wait until all futures are completed - for future in futures: - future.result() - def main(): from megatron.training.initialize import initialize_megatron from mindspeed_mm.configs.config import merge_mm_args + # just inference torch.set_grad_enabled(False) initialize_megatron( - extra_args_provider=mm_extra_args_provider, - args_defaults={"tokenizer_type": "GPT2BPETokenizer"}, + extra_args_provider=mm_extra_args_provider, args_defaults={'tokenizer_type': 'GPT2BPETokenizer'} ) args = get_args() merge_mm_args(args) inference_config = args.mm.model - - pipeline = vlm_pipeline_dict[inference_config.pipeline_class](inference_config) - - # Redis - assert os.environ[ - "MODEL_NAME" - ], "Environment variable MODEL_NAME was not set. Please set it manually." - model_name = os.environ["MODEL_NAME"] - assert os.environ[ - "REDIS_URL" - ], "Environment variable REDIS_URL was not set. Please set it manually." 
- redis_url = os.environ["REDIS_URL"] - assert os.environ[ - "REDIS_PORT" - ], "Environment variable REDIS_PORT was not set. Please set it manually." - redis_port = os.environ["REDIS_PORT"] - assert os.environ[ - "REDIS_DB" - ], "Environment variable REDIS_DB was not set. Please set it manually." - redis_db = os.environ["REDIS_DB"] - redis = RedisManager( - host=redis_url, port=redis_port, db=redis_db, model_name=model_name - ) - - batch_size = 1 # Define your batch size - - while True: - try: - batch_requests = redis.fetch_batch_of_requests(batch_size) - except Exception as e: - continue - - if batch_requests: - # Padding to batch size - while len(batch_requests) < batch_size: - batch_requests.append( - { - "query": "", - "sys_prompt": "", - "message_id": "dummy", - } - ) - - inference_and_write(batch_requests, pipeline, redis, model_name) - else: - sleep(0.01) + vlm_pipeline_dict[inference_config.pipeline_class](inference_config)() -if __name__ == "__main__": - with torch.inference_mode(): - main() +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/mindspeed_mm/models/ae/contextparallelcausalvae.py b/mindspeed_mm/models/ae/contextparallelcausalvae.py index f273eaf5..bad49ca0 100644 --- a/mindspeed_mm/models/ae/contextparallelcausalvae.py +++ b/mindspeed_mm/models/ae/contextparallelcausalvae.py @@ -2,6 +2,7 @@ from typing import Tuple import os import torch +import safetensors from torch import nn from einops import rearrange import numpy as np @@ -214,6 +215,8 @@ class ContextParallelCasualVAE(MultiModalModule): return torch.tensor_split(x, split_size, dim=0) def encode(self, x, enable_cp=True): + if self.cp_size <= 1: + enable_cp = False if not enable_cp: return self._encode(x, enable_cp=False) @@ -225,7 +228,7 @@ class ContextParallelCasualVAE(MultiModalModule): data_list = data_list[::self.dp_group_nums] latents = [] for data in data_list: - latents.append(self._encode(data)) + latents.append(self._encode(data, enable_cp=enable_cp)) return latents[get_context_parallel_group_rank() % self.dp_group_nums] elif self.dp_group_nums % self.cp_size == 0 and self.cp_size < self.dp_group_nums: @@ -234,7 +237,7 @@ class ContextParallelCasualVAE(MultiModalModule): data_list = self._bs_split_and_pad(x, self.dp_group_nums // self.cp_size) data = data_list[get_context_parallel_rank() % (self.dp_group_nums // self.cp_size)] - _latent = self._encode(data) + _latent = self._encode(data, enable_cp=enable_cp) if mpu.get_tensor_model_parallel_world_size() > 1: latents_tp = [torch.empty_like(_latent) for _ in range(mpu.get_tensor_model_parallel_world_size())] @@ -254,7 +257,7 @@ class ContextParallelCasualVAE(MultiModalModule): return latents[:bs] elif self.cp_size == self.dp_group_nums: - return self._encode(x) + return self._encode(x, enable_cp=enable_cp) else: raise NotImplementedError(f"Not supported megatron data parallel group nums {self.dp_group_nums} and VAE cp_size {self.cp_size}!") @@ -280,8 +283,10 @@ class ContextParallelCasualVAE(MultiModalModule): return res - def decode(self, z, **kwargs): - if self.cp_size > 0: + def decode(self, z, enable_cp: bool = True, **kwargs): + if self.cp_size <= 1: + enable_cp = False + if self.cp_size > 0 and enable_cp: global_src_rank = get_context_parallel_group_rank() * self.cp_size torch.distributed.broadcast(z, src=global_src_rank, group=get_context_parallel_group()) @@ -291,13 +296,13 @@ class ContextParallelCasualVAE(MultiModalModule): if (z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size or 
z.shape[-3] > self.tile_latent_min_size_t): - dec = self.tiled_decode(z) + dec = self.tiled_decode(z, enable_cp=enable_cp) else: if self.use_quant_layer: z = self.post_quant_conv(z) - dec = self.decoder(z) + dec = self.decoder(z, enable_cp=enable_cp) - if self.cp_size > 0: + if self.cp_size > 0 and enable_cp: dec = _conv_gather(dec, dim=2, kernel_size=1) return dec @@ -349,7 +354,7 @@ class ContextParallelCasualVAE(MultiModalModule): posterior = DiagonalGaussianDistribution(moments) return posterior - def tiled_decode(self, x): + def tiled_decode(self, x, enable_cp=True): t = x.shape[2] t_chunk_idx = [i for i in range(0, t, self.tile_latent_min_size_t - 1)] if len(t_chunk_idx) == 1 and t_chunk_idx[0] == 0: @@ -365,9 +370,9 @@ class ContextParallelCasualVAE(MultiModalModule): for idx, (start, end) in enumerate(t_chunk_start_end): chunk_x = x[:, :, start: end] if idx != 0: - dec = self.tiled_decode2d(chunk_x)[:, :, 1:] + dec = self.tiled_decode2d(chunk_x, enable_cp=enable_cp)[:, :, 1:] else: - dec = self.tiled_decode2d(chunk_x) + dec = self.tiled_decode2d(chunk_x, enable_cp=enable_cp) dec_.append(dec) dec_ = torch.cat(dec_, dim=2) return dec_ @@ -410,7 +415,7 @@ class ContextParallelCasualVAE(MultiModalModule): return moments return posterior - def tiled_decode2d(self, z): + def tiled_decode2d(self, z, enable_cp=True): overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor)) blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor) row_limit = self.tile_sample_min_size - blend_extent @@ -427,7 +432,7 @@ class ContextParallelCasualVAE(MultiModalModule): ] if self.use_quant_layer: tile = self.post_quant_conv(tile) - decoded = self.decoder(tile) + decoded = self.decoder(tile, enable_cp=enable_cp) row.append(decoded) rows.append(row) result_rows = [] @@ -605,16 +610,18 @@ class Encoder(nn.Module): ) def forward(self, x, enable_cp=True): - h = self.conv_in(x, enable_cp=enable_cp) + hs = [self.conv_in(x, enable_cp=enable_cp)] for i_level in range(self.num_resolutions): for i_block in range(self.num_res_blocks): - h = self.down[i_level].block[i_block](h, enable_cp=enable_cp) + h = self.down[i_level].block[i_block](hs[-1], enable_cp=enable_cp) if len(self.down[i_level].attn) > 0: h = self.down[i_level].attn[i_block](h) + hs.append(h) if i_level != self.num_resolutions - 1: - h = self.down[i_level].downsample(h) + hs.append(self.down[i_level].downsample(hs[-1])) + h = hs[-1] h = self.mid.block_1(h, enable_cp=enable_cp) if self.enbale_attn1: h = self.mid.attn_1(h, enable_cp=enable_cp) @@ -753,26 +760,26 @@ class Decoder(nn.Module): block_in, 3, kernel_size=3, padding=conv_padding ) - def forward(self, z, **kwargs): + def forward(self, z, enable_cp=True, **kwargs): zq = z h = self.conv_in(z) - h = self.mid.block_1(h, zq=zq) + h = self.mid.block_1(h, zq=zq, enable_cp=enable_cp) if self.enable_attention: h = self.mid.attn_1(h) - h = self.mid.block_2(h, zq=zq) + h = self.mid.block_2(h, zq=zq, enable_cp=enable_cp) for i_level in reversed(range(self.num_resolutions)): for i_block in range(self.num_res_blocks + 1): - h = self.up[i_level].block[i_block](h, zq=zq) + h = self.up[i_level].block[i_block](h, zq=zq, enable_cp=enable_cp) if len(self.up[i_level].attn) > 0: - h = self.up[i_level].attn[i_block](h, zq=zq) + h = self.up[i_level].attn[i_block](h, zq=zq, enable_cp=enable_cp) if hasattr(self.up[i_level], "upsample"): - h = self.up[i_level].upsample(h) + h = self.up[i_level].upsample(h, enable_cp=enable_cp) if hasattr(self.up[i_level], "time_upsample"): h = 
self.up[i_level].time_upsample(h) - h = self.norm_out(h, zq=zq) + h = self.norm_out(h, zq=zq, enable_cp=enable_cp) if self.enable_nonlinearity: h = self.nonlinearity(h) h = self.conv_out(h) diff --git a/mindspeed_mm/models/common/attention.py b/mindspeed_mm/models/common/attention.py index a0dfebb1..8bb7da0b 100644 --- a/mindspeed_mm/models/common/attention.py +++ b/mindspeed_mm/models/common/attention.py @@ -398,6 +398,10 @@ class SelfAttentionBNSD(nn.Module): if self.qk_ln: self.q_norm = nn.LayerNorm(self.head_dim, eps=1e-6) self.k_norm = nn.LayerNorm(self.head_dim, eps=1e-6) + for param in self.q_norm.parameters(): + setattr(param, "sequence_parallel", True) + for param in self.k_norm.parameters(): + setattr(param, "sequence_parallel", True) key_dim = key_dim if key_dim is not None else query_dim @@ -438,6 +442,7 @@ class SelfAttentionBNSD(nn.Module): frames: Optional[int] = None, height: Optional[int] = None, width: Optional[int] = None, + **kwargs ) -> torch.Tensor: """ Args: @@ -467,8 +472,8 @@ class SelfAttentionBNSD(nn.Module): k = self.k_norm(k) if self.use_rope and self.rope is not None: - q = self.rope(q) - k = self.rope(k) + q = self.rope(q, **kwargs) + k = self.rope(k, **kwargs) out = torch_npu.npu_fusion_attention( q, @@ -529,6 +534,10 @@ class ParallelSelfAttentionSBH(nn.Module): if self.qk_ln: self.q_norm = nn.LayerNorm(self.head_dim, eps=1e-6) self.k_norm = nn.LayerNorm(self.head_dim, eps=1e-6) + for param in self.q_norm.parameters(): + setattr(param, "sequence_parallel", True) + for param in self.k_norm.parameters(): + setattr(param, "sequence_parallel", True) key_dim = key_dim if key_dim is not None else query_dim @@ -577,9 +586,10 @@ class ParallelSelfAttentionSBH(nn.Module): mask: The attention mask to use. **kwargs: Additional keyword arguments to pass along """ - sequence_length, batch_size, _ = query.shape q, k, v = self.proj_qkv(query)[0].chunk(3, dim=2) + sequence_length, batch_size, _ = q.shape + q = q.view(-1, self.num_attention_heads_per_partition, self.head_dim) k = k.view(-1, self.num_attention_heads_per_partition, self.head_dim) v = v.view(-1, self.num_attention_heads_per_partition, self.head_dim) @@ -673,7 +683,8 @@ class ParallelMultiHeadAttentionSBH(nn.Module): mask: Optional[torch.Tensor] = None, frames: Optional[int] = None, height: Optional[int] = None, - width: Optional[int] = None + width: Optional[int] = None, + **kwargs ) -> torch.Tensor: """ Args: diff --git a/mindspeed_mm/models/common/embeddings/patch_embeddings.py b/mindspeed_mm/models/common/embeddings/patch_embeddings.py index 2c734897..c92eeda2 100644 --- a/mindspeed_mm/models/common/embeddings/patch_embeddings.py +++ b/mindspeed_mm/models/common/embeddings/patch_embeddings.py @@ -5,6 +5,9 @@ # LICENSE file in the root directory of this source tree. 
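
(Illustrative sketch, not part of the patch.) The attention changes above tag the q_norm/k_norm parameters with a sequence_parallel attribute; Megatron uses that marker to all-reduce the gradients of such replicated norm weights across tensor-parallel ranks when sequence parallelism is enabled. The flag itself is just a Python attribute, as this standalone fragment shows; the collection step at the end is a simplified stand-in for what the framework does, not its actual API.

# Sketch only: "sequence_parallel" is a plain marker attribute on parameters.
# Megatron-style frameworks gather the parameters carrying it and all-reduce
# their gradients across tensor-parallel ranks, because norm weights are
# replicated while their activations are sequence-sharded.
import torch
from torch import nn

norm = nn.LayerNorm(128, eps=1e-6)
for param in norm.parameters():
    setattr(param, "sequence_parallel", True)

marked = [p for p in norm.parameters() if getattr(p, "sequence_parallel", False)]
print(f"{len(marked)} parameters flagged for sequence-parallel grad all-reduce")
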
+from functools import reduce +from operator import mul + import torch import torch.nn.functional as F from einops import rearrange @@ -14,7 +17,6 @@ from torch import nn from .pos_embeddings import ( get_1d_sincos_pos_embed, get_2d_sincos_pos_embed, - get_3d_sincos_pos_embed, ) @@ -598,7 +600,7 @@ class VideoPatch2D(nn.Module): super().__init__() self.proj = nn.Conv2d(in_channels, hidden_size, kernel_size=patch_size, stride=patch_size, bias=bias) - def forward(self, latent, encoder_outputs): + def forward(self, latent, encoder_outputs, **kwargs): latent = latent.transpose(1, 2) b, t = latent.shape[:2] emb = latent.view(-1, *latent.shape[2:]) @@ -608,4 +610,37 @@ class VideoPatch2D(nn.Module): emb = rearrange(emb, "b t n d -> b (t n) d") emb = emb.contiguous() - return emb, None # (b,n_t+t*n_i,d) \ No newline at end of file + return emb, None # (b,n_t+t*n_i,d) + + +class VideoPatch3D(nn.Module): + """ + 3D Image to Patch Embedding concat witch text embedding + """ + def __init__( + self, + in_channels, + hidden_size, + patch_size, + ): + super().__init__() + self.patch_size = patch_size + self.proj = nn.Linear(in_channels * reduce(mul, patch_size), hidden_size) + + def forward(self, latent, encoder_outputs, **kwargs): + latent = latent.transpose(1, 2) + emb = rearrange(latent, "b t c h w -> b (t h w) c") + emb = rearrange( + emb, + "b (t o h p w q) c -> b (t h w) (c o p q)", + t=kwargs["rope_T"], + h=kwargs["rope_H"], + w=kwargs["rope_W"], + o=self.patch_size[0], + p=self.patch_size[1], + q=self.patch_size[2], + ) + emb = self.proj(emb) + emb = emb.contiguous() + + return emb, None \ No newline at end of file diff --git a/mindspeed_mm/models/common/embeddings/pos_embeddings.py b/mindspeed_mm/models/common/embeddings/pos_embeddings.py index ca8dc26d..420d2e6c 100644 --- a/mindspeed_mm/models/common/embeddings/pos_embeddings.py +++ b/mindspeed_mm/models/common/embeddings/pos_embeddings.py @@ -531,13 +531,10 @@ class Rotary3DPositionEmbedding(nn.Module): freqs_w = repeat(freqs_w, "... n -> ... 
(n r)", r=2) freqs = broad_cat((freqs_t[:, None, None, :], freqs_h[None, :, None, :], freqs_w[None, None, :, :]), dim=-1) - freqs = rearrange(freqs, "t h w d -> (t h w) d") freqs = freqs.contiguous() - freqs_sin = freqs.sin() - freqs_cos = freqs.cos() - self.register_buffer("freqs_sin", freqs_sin) - self.register_buffer("freqs_cos", freqs_cos) + self.freqs_sin = freqs.sin().npu() + self.freqs_cos = freqs.cos().npu() self.text_length = text_length if learnable_pos_embed: @@ -546,11 +543,16 @@ class Rotary3DPositionEmbedding(nn.Module): else: self.pos_embedding = None - def rotary(self, t): + def rotary(self, t, **kwargs): # input shape: bnsd - seq_len = t.shape[2] - freqs_cos = self.freqs_cos[:seq_len].unsqueeze(0).unsqueeze(0) - freqs_sin = self.freqs_sin[:seq_len].unsqueeze(0).unsqueeze(0) + def reshape_freq(freqs): + freqs = freqs[: kwargs["rope_T"], : kwargs["rope_H"], : kwargs["rope_W"]].contiguous() + freqs = rearrange(freqs, "t h w d -> (t h w) d") + freqs = freqs.unsqueeze(0).unsqueeze(0) + return freqs + + freqs_cos = reshape_freq(self.freqs_cos).to(t.dtype) + freqs_sin = reshape_freq(self.freqs_sin).to(t.dtype) return npu_rotary_position_embedding(t, freqs_cos, freqs_sin, mode=1) @@ -560,7 +562,7 @@ class Rotary3DPositionEmbedding(nn.Module): else: return None - def forward(self, x): + def forward(self, x, **kwargs): # input shape: bnsd - x[:, :, self.text_length:] = self.rotary(x[:, :, self.text_length:]) + x[:, :, self.text_length:] = self.rotary(x[:, :, self.text_length:], **kwargs) return x diff --git a/mindspeed_mm/models/common/normalize.py b/mindspeed_mm/models/common/normalize.py index 3590b73a..15356594 100644 --- a/mindspeed_mm/models/common/normalize.py +++ b/mindspeed_mm/models/common/normalize.py @@ -4,6 +4,7 @@ from einops import rearrange from mindspeed_mm.models.common.communications import _conv_split, _conv_gather from mindspeed_mm.models.common.conv import ContextParallelCausalConv3d +from mindspeed_mm.utils.utils import get_context_parallel_rank class LayerNorm(nn.Module): @@ -142,15 +143,28 @@ class SpatialNorm3D(nn.Module): ) def forward(self, f, zq, clear_fake_cp_cache=True, enable_cp=True): - if f.shape[2] > 1 and f.shape[2] % 2 == 1: + if f.shape[2] > 1 and get_context_parallel_rank() == 0 and enable_cp: f_first, f_rest = f[:, :, :1], f[:, :, 1:] f_first_size, f_rest_size = f_first.shape[-3:], f_rest.shape[-3:] zq_first, zq_rest = zq[:, :, :1], zq[:, :, 1:] zq_first = torch.nn.functional.interpolate(zq_first, size=f_first_size, mode="nearest") - zq_rest = torch.nn.functional.interpolate(zq_rest, size=f_rest_size, mode="nearest") + + zq_rest_splits = torch.split(zq_rest, 32, dim=1) + interpolated_splits = [ + torch.nn.functional.interpolate(split, size=f_rest_size, mode="nearest") for split in zq_rest_splits + ] + + zq_rest = torch.cat(interpolated_splits, dim=1) + zq = torch.cat([zq_first, zq_rest], dim=2) else: - zq = torch.nn.functional.interpolate(zq, size=f.shape[-3:], mode="nearest") + f_size = f.shape[-3:] + + zq_splits = torch.split(zq, 32, dim=1) + interpolated_splits = [ + torch.nn.functional.interpolate(split, size=f_size, mode="nearest") for split in zq_splits + ] + zq = torch.cat(interpolated_splits, dim=1) if self.add_conv: zq = self.conv(zq, clear_cache=clear_fake_cp_cache, enable_cp=enable_cp) diff --git a/mindspeed_mm/models/common/updownsample.py b/mindspeed_mm/models/common/updownsample.py index 939839d7..92b9c148 100644 --- a/mindspeed_mm/models/common/updownsample.py +++ b/mindspeed_mm/models/common/updownsample.py @@ -9,6 +9,7 @@ 
import torch.nn.functional as F from mindspeed_mm.utils.utils import cast_tuple, video_to_image from mindspeed_mm.models.common.conv import CausalConv3d, WfCausalConv3d +from mindspeed_mm.utils.utils import get_context_parallel_rank class Upsample(nn.Module): @@ -79,21 +80,29 @@ class DownSample3D(nn.Module): self.conv = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, padding=0) self.compress_time = compress_time - def forward(self, x): + def forward(self, x, enable_cp=True): if self.compress_time and x.shape[2] > 1: h, w = x.shape[-2:] x = rearrange(x, "b c t h w -> (b h w) c t") - if x.shape[-1] % 2 == 1: + if get_context_parallel_rank() == 0 and enable_cp: # split first frame x_first, x_rest = x[..., 0], x[..., 1:] if x_rest.shape[-1] > 0: - x_rest = torch.nn.functional.avg_pool1d(x_rest, kernel_size=2, stride=2) + splits = torch.split(x_rest, 32, dim=1) + interpolated_splits = [ + torch.nn.functional.avg_pool1d(split, kernel_size=2, stride=2) for split in splits + ] + x_rest = torch.cat(interpolated_splits, dim=1) x = torch.cat([x_first[..., None], x_rest], dim=-1) x = rearrange(x, "(b h w) c t -> b c t h w", h=h, w=w) else: - x = torch.nn.functional.avg_pool1d(x, kernel_size=2, stride=2) + splits = torch.split(x, 32, dim=1) + interpolated_splits = [ + torch.nn.functional.avg_pool1d(split, kernel_size=2, stride=2) for split in splits + ] + x = torch.cat(interpolated_splits, dim=1) x = rearrange(x, "(b h w) c t -> b c t h w", h=h, w=w) if self.with_conv: @@ -119,24 +128,39 @@ class Upsample3D(nn.Module): self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1) self.compress_time = compress_time - def forward(self, x): - if self.compress_time: - if x.shape[2] > 1: + def forward(self, x, enable_cp=True): + if self.compress_time and x.shape[2] > 1: + if get_context_parallel_rank() == 0 and enable_cp: # split first frame x_first, x_rest = x[:, :, 0], x[:, :, 1:] x_first = torch.nn.functional.interpolate(x_first, scale_factor=2.0, mode="nearest") x_rest = torch.nn.functional.interpolate(x_rest, scale_factor=2.0, mode="nearest") + + splits = torch.split(x_rest, 32, dim=1) + interpolated_splits = [ + torch.nn.functional.interpolate(split, scale_factor=2.0, mode="nearest") for split in splits + ] + x_rest = torch.cat(interpolated_splits, dim=1) + x = torch.cat([x_first[:, :, None, :, :], x_rest], dim=2) else: - x = x.squeeze(2) - x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest") - x = x[:, :, None, :, :] + splits = torch.split(x, 32, dim=1) + interpolated_splits = [ + torch.nn.functional.interpolate(split, scale_factor=2.0, mode="nearest") for split in splits + ] + x = torch.cat(interpolated_splits, dim=1) else: # only interpolate 2D t = x.shape[2] x = rearrange(x, "b c t h w -> (b t) c h w") - x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest") + + splits = torch.split(x, 32, dim=1) + interpolated_splits = [ + torch.nn.functional.interpolate(split, scale_factor=2.0, mode="nearest") for split in splits + ] + x = torch.cat(interpolated_splits, dim=1) + x = rearrange(x, "(b t) c h w -> b c t h w", t=t) if self.with_conv: diff --git a/mindspeed_mm/models/diffusion/cogvideo_diffusion.py b/mindspeed_mm/models/diffusion/cogvideo_diffusion.py index 553a211f..4086bb71 100644 --- a/mindspeed_mm/models/diffusion/cogvideo_diffusion.py +++ b/mindspeed_mm/models/diffusion/cogvideo_diffusion.py @@ -30,25 +30,30 @@ def append_zero(x): class DiscreteSampling: - def __init__(self, discretization_config, num_idx, 
do_append_zero=False, flip=True, uniform_sampling=False): + def __init__(self, discretization_config, num_idx, do_append_zero=False, flip=True, uniform_sampling=False, + group_num=None): self.num_idx = num_idx self.sigmas = ZeroSNRDDPMDiscretization(**discretization_config)(num_idx, do_append_zero=do_append_zero, flip=flip) world_size = mpu.get_data_parallel_world_size() self.uniform_sampling = uniform_sampling - if self.uniform_sampling: - i = 1 - while True: - if world_size % i != 0 or num_idx % (world_size // i) != 0: - i += 1 - else: - self.group_num = world_size // i - break + if group_num: + self.group_num = group_num + else: + if self.uniform_sampling: + i = 1 + while True: + if world_size % i != 0 or num_idx % (world_size // i) != 0: + i += 1 + else: + self.group_num = world_size // i + break + if self.uniform_sampling: if self.group_num <= 0: - raise ValueError + raise ValueError("group_num should not be less than or equal to 0") if world_size % self.group_num != 0: - raise ValueError + raise ValueError("The remainder of world_size to group_num should be equal to 0") self.group_width = world_size // self.group_num # the number of rank in one group self.sigma_interval = self.num_idx // self.group_num @@ -91,6 +96,8 @@ def make_beta_schedule( ): if schedule == "linear": betas = torch.linspace(linear_start**0.5, linear_end**0.5, n_timestep, dtype=torch.float64) ** 2 + else: + raise NotImplementedError("Only support linear schedule") return betas.numpy() @@ -301,17 +308,21 @@ class CogVideoDiffusion(nn.Module): self.latents = latents self.x_start = noised_input - return noised_input * self.c_in, self.c_noise, idx + kwargs["model_kwargs"]["c_out"] = self.c_out + kwargs["model_kwargs"]["noised_start"] = self.x_start * self.c_skip + kwargs["model_kwargs"]["alphas_cumprod"] = self.alphas_cumprod_sqrt + + return noised_input * self.c_in, noise, idx - def training_losses(self, model_output, **kwargs): - model_output = model_output * self.c_out + self.x_start * self.c_skip + def training_losses(self, model_output, x_start, **kwargs): + model_output = model_output * kwargs['c_out'] + kwargs["noised_start"] - w = append_dims(1 / (1 - self.alphas_cumprod_sqrt ** 2), self.x_start.ndim) # v-pred + w = append_dims(1 / (1 - kwargs['alphas_cumprod'] ** 2), x_start.ndim) # v-pred if self.min_snr_value is not None: w = min(w, self.min_snr_value) - return self.get_loss(model_output, self.latents, w) + return self.get_loss(model_output, x_start, w) def get_loss(self, model_output, target, w): model_output = model_output.transpose(1, 2) diff --git a/mindspeed_mm/models/diffusion/diffusers_scheduler.py b/mindspeed_mm/models/diffusion/diffusers_scheduler.py index a3ab7916..8b372235 100644 --- a/mindspeed_mm/models/diffusion/diffusers_scheduler.py +++ b/mindspeed_mm/models/diffusion/diffusers_scheduler.py @@ -245,6 +245,9 @@ class DiffusersScheduler: with torch.no_grad(): noise_pred = model(timestep=current_timestep, **model_kwargs) + + if isinstance(noise_pred, tuple) or isinstance(noise_pred, list): + noise_pred = noise_pred[0] # perform guidance if use_dynamic_cfg: diff --git a/mindspeed_mm/models/predictor/dits/sat_dit.py b/mindspeed_mm/models/predictor/dits/sat_dit.py index 5d7f382e..1da75f58 100644 --- a/mindspeed_mm/models/predictor/dits/sat_dit.py +++ b/mindspeed_mm/models/predictor/dits/sat_dit.py @@ -1,21 +1,22 @@ -from curses import KEY_A1 +from functools import reduce +from operator import mul from typing import Optional, Tuple, Dict +from contextlib import nullcontext -from einops import 
rearrange, repeat import torch -from torch import nn -import torch.nn.functional as F from diffusers.models.embeddings import SinusoidalPositionalEmbedding +from einops import rearrange from megatron.core import mpu, tensor_parallel from megatron.training import get_args from megatron.training.arguments import core_transformer_config_from_args +from torch import nn from mindspeed_mm.models.common.ffn import FeedForward as TensorParallelFeedForward from mindspeed_mm.models.common.communications import split_forward_gather_backward, gather_forward_split_backward from mindspeed_mm.models.common.embeddings.pos_embeddings import Rotary3DPositionEmbedding from mindspeed_mm.models.common.embeddings.time_embeddings import TimeStepEmbedding from mindspeed_mm.models.common.module import MultiModalModule -from mindspeed_mm.models.common.embeddings.patch_embeddings import VideoPatchEmbed2D, VideoPatch2D +from mindspeed_mm.models.common.embeddings.patch_embeddings import VideoPatchEmbed2D, VideoPatch2D, VideoPatch3D from mindspeed_mm.models.common.attention import SelfAttentionBNSD, ParallelSelfAttentionSBH @@ -61,6 +62,7 @@ class SatDiT(MultiModalModule): attention_bias: bool = False, input_size: Tuple[int] = None, patch_size: Tuple[int] = None, + patch_type: str = "2D", activation_fn: str = "geglu", norm_type: str = "layer_norm", num_embeds_ada_norm: Optional[int] = None, @@ -70,14 +72,20 @@ class SatDiT(MultiModalModule): use_rope: bool = False, interpolation_scale: Tuple[float] = None, elementwise_affine: bool = True, - text_length=None, - text_hidden_size=None, - time_embed_dim=None, - concat_text_embed=None, - learnable_pos_embed=False, + text_length: int = None, + text_hidden_size: int = None, + time_embed_dim: int = None, + concat_text_embed: bool = None, + learnable_pos_embed: bool = False, + pre_process: bool = True, + post_process: bool = True, + global_layer_idx: Optional[Tuple] = None, **kwargs ): super().__init__(config=None) + self.pre_process = pre_process + self.post_process = post_process + self.input_size = input_size # Validate inputs and init args. if patch_size is not None: if norm_type not in ["ada_norm", "ada_norm_zero", "ada_norm_single", "qk_ln"]: @@ -88,28 +96,25 @@ class SatDiT(MultiModalModule): raise ValueError( f"When using a `patch_size` and this `norm_type` ({norm_type}), `num_embeds_ada_norm` cannot be None." 
) + self.patch_size = patch_size + self.patch_type = patch_type self.patch_size_t, self.patch_size_h, self.patch_size_w = patch_size self.norm_type = norm_type self.in_channels = in_channels self.out_channels = out_channels self.num_layers = num_layers self.concat_text_embed = concat_text_embed - t, h, w = input_size - seq_len = text_length + t // self.patch_size_t * h // self.patch_size_h * w // self.patch_size_w - seq_begin = (seq_len // mpu.get_context_parallel_world_size()) * mpu.get_context_parallel_rank() - seq_end = (seq_len // mpu.get_context_parallel_world_size()) * (mpu.get_context_parallel_rank() + 1) - if seq_end < text_length: - self.text_length = seq_len // mpu.get_context_parallel_world_size() - elif seq_begin > text_length: - self.text_length = 0 - else: - self.text_length = text_length - seq_begin + args = get_args() + self.sequence_parallel = args.sequence_parallel + self.text_length = self._get_text_length(input_size, text_length) + self.ori_text_length = text_length + self.seq_len = text_length + reduce(mul, input_size) // reduce(mul, patch_size) self.text_hidden_size = text_hidden_size self.elementwise_affine = elementwise_affine inner_dim = num_heads * head_dim + self.inner_dim = inner_dim self.time_embed_dim = time_embed_dim if time_embed_dim is not None else inner_dim - args = get_args() self.recompute_granularity = args.recompute_granularity self.distribute_saved_activations = args.distribute_saved_activations self.recompute_method = args.recompute_method @@ -125,9 +130,22 @@ class SatDiT(MultiModalModule): self.enable_sequence_parallelism = False # Initialize blocks + + if self.pre_process: # Init PatchEmbed - self.time_embed = TimeStepEmbedding(inner_dim, self.time_embed_dim) - self.patch_embed = VideoPatch2D(in_channels, inner_dim, self.patch_size_h) + self.time_embed = TimeStepEmbedding(inner_dim, self.time_embed_dim) + if self.patch_type == "3D": + self.patch_embed = VideoPatch3D(in_channels, inner_dim, self.patch_size) + else: + self.patch_embed = VideoPatch2D(in_channels, inner_dim, self.patch_size_h) + + # Init Projection + self.caption_projection = None + if text_hidden_size is not None: + self.caption_projection = nn.Linear(self.text_hidden_size, inner_dim) + + self.global_layer_idx = global_layer_idx if global_layer_idx is not None else tuple(range(num_layers)) + self.pos_embed = Rotary3DPositionEmbedding( hidden_size_head=head_dim, text_length=text_length, @@ -158,22 +176,52 @@ class SatDiT(MultiModalModule): enable_sequence_parallelism=self.enable_sequence_parallelism, time_embed_dim=self.time_embed_dim, text_length=self.text_length, + patch_size=self.patch_size ) for i in range(num_layers) ] ) - # Init Norm - self.norm_final = nn.LayerNorm(inner_dim, elementwise_affine=elementwise_affine, eps=1e-5) - self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(self.time_embed_dim, 2 * inner_dim, bias=True)) - self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=elementwise_affine, eps=1e-6) - self.proj_out = nn.Linear(inner_dim, - self.patch_size_t * self.patch_size_h * self.patch_size_w * self.out_channels) - # Init Projection - self.caption_projection = None - if text_hidden_size is not None: - self.caption_projection = nn.Linear(self.text_hidden_size, inner_dim) + + if self.post_process: + # Init Norm + self.norm_final = nn.LayerNorm(inner_dim, elementwise_affine=elementwise_affine, eps=1e-5) + config = core_transformer_config_from_args(args) + config.sequence_parallel = False + self.adaLN_modulation = nn.Sequential( + nn.SiLU(), + 
tensor_parallel.ColumnParallelLinear( + self.time_embed_dim, + 2 * inner_dim, + config=config, + init_method=config.init_method, + gather_output=False + ) + ) + self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=elementwise_affine, eps=1e-6) + self.proj_out = nn.Linear(inner_dim, reduce(mul, self.patch_size) * self.out_channels) + + for param in self.norm_final.parameters(): + setattr(param, "sequence_parallel", self.sequence_parallel) + for param in self.norm_out.parameters(): + setattr(param, "sequence_parallel", self.sequence_parallel) + print(self) + def _get_text_length(self, input_size, text_length): + t, h, w = input_size + cp = mpu.get_context_parallel_world_size() + tp_sp = mpu.get_tensor_model_parallel_world_size() if self.sequence_parallel else 1 + tp_sp_rank = mpu.get_tensor_model_parallel_rank() if self.sequence_parallel else 0 + seq_len = text_length + t * h * w // self.patch_size_t // self.patch_size_h // self.patch_size_w + seq_begin = (seq_len // cp // tp_sp) * (mpu.get_context_parallel_rank() * tp_sp + tp_sp_rank) + seq_end = (seq_len // cp // tp_sp) + seq_begin + if seq_end < text_length: + return seq_len // cp // tp_sp + elif seq_begin > text_length: + return 0 + else: + return text_length - seq_begin + def forward( self, latents: torch.Tensor, @@ -197,51 +245,53 @@ class SatDiT(MultiModalModule): class_labels: Used to indicate class labels conditioning. use_image_num: The number of images use for trainning. """ - b, _, t, _, _ = latents.shape + b = latents.shape[0] + t, h, w = self.input_size[0], self.input_size[1], self.input_size[2] frames = t - use_image_num vid_mask, img_mask = None, None prompt_vid_mask, prompt_img_mask = None, None + # RNG context + if self.sequence_parallel: + rng_context = tensor_parallel.get_cuda_rng_tracker().fork() + else: + rng_context = nullcontext() + # 1. 
Input frames = ((frames - 1) // self.patch_size_t + 1) if frames % 2 == 1 else frames // self.patch_size_t # patchfy - height, width = latents.shape[-2] // self.patch_size_h, latents.shape[-1] // self.patch_size_w - - if "masked_video" in kwargs.keys() and kwargs["masked_video"] is not None: - latents = torch.cat([latents, kwargs["masked_video"]], dim=1) - - added_cond_kwargs = {"resolution": None, "aspect_ratio": None} - latents_vid, latents_img, prompt_vid, prompt_img, timestep_vid, timestep_img, \ - embedded_timestep_vid, embedded_timestep_img = self._operate_on_patched_inputs( - latents, prompt, timestep, added_cond_kwargs, b, frames, use_image_num - ) - if self.concat_text_embed: - latents_vid = torch.cat((prompt_vid, latents_vid), dim=1) - - if self.enable_sequence_parallelism and latents_vid is not None and prompt_vid is not None: - latents_vid = rearrange(latents_vid, 'b s h -> s b h', b=b).contiguous() - latents_vid = split_forward_gather_backward(latents_vid, mpu.get_context_parallel_group(), dim=0, - grad_scale='down') + height, width = h // self.patch_size_h, w // self.patch_size_w + + if self.pre_process: + if "masked_video" in kwargs.keys() and kwargs["masked_video"] is not None: + latents = torch.cat([latents, kwargs["masked_video"]], dim=1) + + added_cond_kwargs = {"resolution": None, "aspect_ratio": None} + latents_vid, latents_img, prompt_vid, prompt_img, timestep_vid, timestep_img, \ + embedded_timestep_vid, embedded_timestep_img = self._operate_on_patched_inputs( + latents, prompt, timestep, frames) + if self.concat_text_embed: + latents_vid = torch.cat((prompt_vid, latents_vid), dim=1) + + if self.enable_sequence_parallelism or self.sequence_parallel: + latents_vid = latents_vid.transpose(0, 1).contiguous() + if self.enable_sequence_parallelism: + latents_vid = split_forward_gather_backward(latents_vid, mpu.get_context_parallel_group(), dim=0, + grad_scale='down') + if self.sequence_parallel: + latents_vid = tensor_parallel.scatter_to_sequence_parallel_region(latents_vid) + else: + latents_vid = latents + prompt_vid = prompt + timestep_vid = timestep frames = torch.tensor(frames) height = torch.tensor(height) width = torch.tensor(width) - if self.recompute_granularity == "full": - if latents_vid is not None: - latents_vid = self._checkpointed_forward( - latents_vid, - video_mask=vid_mask, - prompt=prompt_vid, - prompt_mask=prompt_vid_mask, - timestep=timestep_vid, - class_labels=class_labels, - frames=frames, - height=height, - width=width - ) - else: - for block in self.videodit_blocks: + + with rng_context: + if self.recompute_granularity == "full": if latents_vid is not None: - latents_vid = block( + latents_vid = self._checkpointed_forward( latents_vid, video_mask=vid_mask, prompt=prompt_vid, @@ -250,34 +300,50 @@ class SatDiT(MultiModalModule): class_labels=class_labels, frames=frames, height=height, - width=width + width=width, + rope_T=t, + rope_H=h, + rope_W=w ) + else: + for block in self.videodit_blocks: + if latents_vid is not None: + latents_vid = block( + latents_vid, + video_mask=vid_mask, + prompt=prompt_vid, + prompt_mask=prompt_vid_mask, + timestep=timestep_vid, + class_labels=class_labels, + frames=frames, + height=height, + width=width, + rope_T=torch.tensor(t / self.patch_size[0], dtype=torch.int), + rope_H=torch.tensor(h / self.patch_size[1], dtype=torch.int), + rope_W=torch.tensor(w / self.patch_size[2], dtype=torch.int) + ) - if self.enable_sequence_parallelism and latents_vid is not None: - latents_vid = rearrange(latents_vid, 's b h -> b s h', 
b=b).contiguous() - latents_vid = gather_forward_split_backward(latents_vid, mpu.get_context_parallel_group(), dim=1, - grad_scale='up') # 3. Output - output_vid, output_img = None, None - if latents_vid is not None: - output_vid = self._get_output_for_patched_inputs( - latents=latents_vid, - timestep=timestep_vid, - class_labels=class_labels, - embedded_timestep=embedded_timestep_vid, - num_frames=frames, - height=height, - width=width, - ) # [b, c, t, h, w] - - if output_vid is not None and output_img is not None: - output = torch.cat([output_vid, output_img], dim=2) - elif output_vid is not None: - output = output_vid - elif output_img is not None: - output = output_img - return output + if self.post_process: + output_vid, output_img = None, None + if latents_vid is not None: + output_vid = self._get_output_for_patched_inputs( + latents=latents_vid, + timestep=timestep_vid, + height=height, + width=width, + ) # [b, c, t, h, w] + + if output_vid is not None and output_img is not None: + output = torch.cat([output_vid, output_img], dim=2) + elif output_vid is not None: + output = output_vid + elif output_img is not None: + output = output_img + return output, prompt_vid, timestep_vid + else: + return latents_vid, prompt_vid, timestep_vid def _get_block(self, layer_number): return self.videodit_blocks[layer_number] @@ -292,7 +358,9 @@ class SatDiT(MultiModalModule): class_labels, frames, height, - width): + width, + **kwargs + ): """Forward method with activation checkpointing.""" def custom(start, end): @@ -321,7 +389,10 @@ class SatDiT(MultiModalModule): class_labels, frames, height, - width + width, + torch.tensor(kwargs["rope_T"] / self.patch_size[0], dtype=torch.int), + torch.tensor(kwargs["rope_H"] / self.patch_size[1], dtype=torch.int), + torch.tensor(kwargs["rope_W"] / self.patch_size[2], dtype=torch.int) ) layer_num += self.recompute_num_layers elif self.recompute_method == "block": @@ -338,7 +409,10 @@ class SatDiT(MultiModalModule): class_labels, frames, height, - width + width, + torch.tensor(kwargs["rope_T"] / self.patch_size[0], dtype=torch.int), + torch.tensor(kwargs["rope_H"] / self.patch_size[1], dtype=torch.int), + torch.tensor(kwargs["rope_W"] / self.patch_size[2], dtype=torch.int) ) else: block = self._get_block(layer_num) @@ -351,7 +425,10 @@ class SatDiT(MultiModalModule): class_labels=class_labels, frames=frames, height=height, - width=width + width=width, + rope_T=kwargs["rope_T"], + rope_H=kwargs["rope_H"], + rope_W=kwargs["rope_W"] ) else: raise ValueError("Invalid activation recompute method.") @@ -367,17 +444,23 @@ class SatDiT(MultiModalModule): buffers = tuple(self.buffers()) return buffers[0].dtype - def _operate_on_patched_inputs(self, latents, prompt, timestep, added_cond_kwargs, batch_size, frames, - use_image_num): + def _operate_on_patched_inputs(self, latents, prompt, timestep, frames): + b, _, t, h, w = latents.shape if self.pos_embed is not None: - latents_vid, latents_img = self.patch_embed(latents.to(self.dtype), prompt) + latents_vid, latents_img = self.patch_embed(latents.to(self.dtype), prompt, + rope_T=t // self.patch_size[0], + rope_H=h // self.patch_size[1], + rope_W=w // self.patch_size[2]) _, seq_len, _ = latents_vid.shape pos_emb = self.pos_embed.position_embedding_forward(latents.to(self.dtype), seq_length=seq_len - self.text_length) if pos_emb is not None: latents_vid = latents_vid + pos_emb else: - latents_vid, latents_img = self.patch_embed(latents.to(self.dtype), frames) + latents_vid, latents_img = 
self.patch_embed(latents.to(self.dtype), frames, + rope_T=t // self.patch_size[0], + rope_H=h // self.patch_size[1], + rope_W=w // self.patch_size[2]) timestep_vid, timestep_img = None, None embedded_timestep_vid, embedded_timestep_img = None, None prompt_vid, prompt_img = None, None @@ -395,12 +478,29 @@ class SatDiT(MultiModalModule): return latents_vid, latents_img, prompt_vid, prompt_img, timestep_vid, timestep_img, embedded_timestep_vid, embedded_timestep_img - def _get_output_for_patched_inputs(self, latents, timestep, class_labels, embedded_timestep, num_frames, - height=None, width=None): + def _get_output_for_patched_inputs(self, latents, timestep, height=None, width=None): x = self.norm_final(latents) + _scale_shift_table = self.adaLN_modulation(timestep)[0] + if self.sequence_parallel: + _scale_shift_table = tensor_parallel.mappings.all_gather_last_dim_from_tensor_parallel_region( + _scale_shift_table + ) + else: + _scale_shift_table = tensor_parallel.mappings.gather_from_tensor_model_parallel_region( + _scale_shift_table + ) + if self.sequence_parallel or self.enable_sequence_parallelism: + shift, scale = _scale_shift_table.unsqueeze(0).chunk(2, dim=2) + else: + shift, scale = _scale_shift_table.unsqueeze(1).chunk(2, dim=2) + x = self.norm_out(x) * (1 + scale) + shift + if self.sequence_parallel: + x = tensor_parallel.gather_from_sequence_parallel_region(x, tensor_parallel_output_grad=False) + if self.sequence_parallel or self.enable_sequence_parallelism: + x = x.transpose(0, 1).contiguous() + if self.enable_sequence_parallelism: + x = gather_forward_split_backward(x, mpu.get_context_parallel_group(), dim=1, grad_scale="up") x = x[:, self.pos_embed.text_length:, :] - shift, scale = self.adaLN_modulation(timestep).chunk(2, dim=1) - x = self.norm_out(x) * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1) x = self.proj_out(x) latents = x @@ -411,6 +511,63 @@ class SatDiT(MultiModalModule): c=self.out_channels).transpose(1, 2) return output + def initialize_pipeline_tensor_shapes(self): + args = get_args() + micro_batch_size = args.mm.data.dataloader_param.batch_size + dtype = args.params_dtype + latent_size = (self.out_channels, *self.input_size) + seq_len = self.seq_len + + if self.enable_sequence_parallelism or self.sequence_parallel: + prev_output_shape = (seq_len // mpu.get_context_parallel_world_size(), micro_batch_size, self.inner_dim) # SBH + else: + prev_output_shape = (micro_batch_size, seq_len, self.inner_dim) # BSH + + pipeline_tensor_shapes = [ + {'shape': prev_output_shape, 'dtype': dtype}, # prev_stage_output + {'shape': (micro_batch_size, *latent_size), 'dtype': dtype}, # latents + {'shape': (micro_batch_size, self.ori_text_length, self.inner_dim), 'dtype': dtype}, # prompt + {'shape': (micro_batch_size, self.time_embed_dim), 'dtype': dtype}, # embedded_timestep + {'shape': (micro_batch_size, *latent_size), 'dtype': torch.float32}, # video_diffusion: self.noised_start + {'shape': (micro_batch_size, 1, 1, 1, 1), 'dtype': torch.float32}, # video_diffusion: self.c_out + {'shape': (micro_batch_size), 'dtype': torch.float32}, # video_diffusion: self.alpha_cumprod + ] + return pipeline_tensor_shapes + + def pipeline_set_prev_stage_tensor(self, input_tensor_list, extra_kwargs=None): + """ + Process tensor from prev_pipeline_stage, and adjust to predictor input and training loss input. 
+ Input: + input_tensor_list: + model_output, latents, predictor_prompt, predictor_timesteps, + extra_kwargs (extra parameter for video_diffusion): + extra_kwargs["noised_start"], extra_kwargs["c_out"], extra_kwargs["alphas_cumprod"] + Return: + predictor_input_list (input for self.predictor in SoraModel.forward): + predictor_input_latent, predictor_timesteps, predictor_prompt, predictor_video_mask, prompt_prompt_mask + training_loss_input_list (input for self.compute_loss in SoraModel.forward): + latents, noised_latents, timesteps, noise, video_mask + """ + predictor_input_latent, latents, predictor_prompt, predictor_timesteps, \ + extra_kwargs["noised_start"], extra_kwargs["c_out"], extra_kwargs["alphas_cumprod"] = input_tensor_list + predictor_video_mask, predictor_prompt_mask = None, None + predictor_input_list = [predictor_input_latent, predictor_timesteps, predictor_prompt, predictor_video_mask, predictor_prompt_mask] + training_loss_input_list = [latents, None, predictor_timesteps, None, predictor_video_mask] + + return predictor_input_list, training_loss_input_list + + def pipeline_set_next_stage_tensor(self, input_list, output_list, extra_kwargs=None): + """ + Process predictor output tensors from curr pipeline stage, and adjust to next pipeline stage + Input: + input_list: [latents, noised_latents, timesteps, noise, video_mask] + output_list: [predictor_output, predictor_prompt, predictor_timesteps] + Return: + predictor_output, latents, predictor_prompt, predictor_timesteps, extra_kwargs["noised_start"], extra_kwargs["c_out"], extra_kwargs["alphas_cumprod"] + """ + return [output_list[0], input_list[0], output_list[1], output_list[2], \ + extra_kwargs["noised_start"], extra_kwargs["c_out"], extra_kwargs["alphas_cumprod"]] + class VideoDiTBlock(nn.Module): """ @@ -462,8 +619,12 @@ class VideoDiTBlock(nn.Module): time_embed_dim=None, text_length=None, pos_embed=None, + patch_size=None ): super().__init__() + self.patch_size = patch_size + args = get_args() + self.sequence_parallel = args.sequence_parallel self.time_embed_dim = time_embed_dim if time_embed_dim is not None else dim self.cross_attention_dim = cross_attention_dim if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None: @@ -487,7 +648,7 @@ class VideoDiTBlock(nn.Module): self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps) self.enable_sequence_parallelism = enable_sequence_parallelism - if self.enable_sequence_parallelism: + if self.enable_sequence_parallelism or self.sequence_parallel: attention = ParallelSelfAttentionSBH else: attention = SelfAttentionBNSD @@ -520,8 +681,8 @@ class VideoDiTBlock(nn.Module): ) # 3. Scale-shift. 
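
(Illustrative sketch, not part of the patch.) The scale-shift block that follows, like the output head earlier in this file, applies adaLN-style modulation: a SiLU + linear over the timestep embedding yields shift/scale (and gate) vectors that modulate the normalized hidden states as x * (1 + scale) + shift. In the patch the linear is a tensor-parallel ColumnParallelLinear with gather_output=False, so its sharded output is gathered across ranks before chunking; the single-device sketch below uses a plain nn.Linear instead.

# Sketch only: adaLN scale-shift modulation on one device. nn.Linear stands in
# for tensor_parallel.ColumnParallelLinear; in the patch the sharded output is
# gathered across TP ranks before the chunk() below.
import torch
from torch import nn

dim, time_embed_dim, batch, seq = 64, 128, 2, 16
scale_shift_table = nn.Sequential(nn.SiLU(), nn.Linear(time_embed_dim, 2 * dim))
norm_out = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)

timestep_emb = torch.randn(batch, time_embed_dim)
x = torch.randn(batch, seq, dim)

shift, scale = scale_shift_table(timestep_emb).unsqueeze(1).chunk(2, dim=2)  # (b, 1, dim) each
x = norm_out(x) * (1 + scale) + shift    # modulate the normalized hidden states
print(x.shape)                           # torch.Size([2, 16, 64])
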
- args = get_args() config = core_transformer_config_from_args(args) + config.sequence_parallel = False self.scale_shift_table = nn.Sequential( nn.SiLU(), tensor_parallel.ColumnParallelLinear( @@ -529,9 +690,13 @@ class VideoDiTBlock(nn.Module): 12 * dim, config=config, init_method=config.init_method, - gather_output=True + gather_output=False ) ) + for param in self.norm1.parameters(): + setattr(param, "sequence_parallel", self.sequence_parallel) + for param in self.norm2.parameters(): + setattr(param, "sequence_parallel", self.sequence_parallel) # let chunk size default to None self._chunk_size = None @@ -548,13 +713,25 @@ class VideoDiTBlock(nn.Module): frames: torch.int64 = None, height: torch.int64 = None, width: torch.int64 = None, + rope_T: torch.int64 = None, + rope_H: torch.int64 = None, + rope_W: torch.int64 = None, added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, ) -> torch.FloatTensor: # 1. Self-Attention frames = frames.item() height = height.item() width = width.item() - if self.enable_sequence_parallelism: + if self.enable_sequence_parallelism or self.sequence_parallel: + _scale_shift_table = self.scale_shift_table(timestep)[0] + if self.sequence_parallel: + _scale_shift_table = tensor_parallel.mappings.all_gather_last_dim_from_tensor_parallel_region( + _scale_shift_table + ) + else: + _scale_shift_table = tensor_parallel.mappings.gather_from_tensor_model_parallel_region( + _scale_shift_table + ) ( shift_msa, scale_msa, @@ -568,10 +745,14 @@ class VideoDiTBlock(nn.Module): text_shift_mlp, text_scale_mlp, text_gate_mlp, - ) = self.scale_shift_table(timestep)[0].unsqueeze(0).chunk(12, dim=2) + ) = _scale_shift_table.unsqueeze(0).chunk(12, dim=2) latents_text = latents[:self.text_length] latents_vid = latents[self.text_length:] else: + _scale_shift_table = self.scale_shift_table(timestep)[0] + _scale_shift_table = tensor_parallel.mappings.gather_from_tensor_model_parallel_region( + _scale_shift_table + ) ( shift_msa, scale_msa, @@ -585,20 +766,20 @@ class VideoDiTBlock(nn.Module): text_shift_mlp, text_scale_mlp, text_gate_mlp, - ) = self.scale_shift_table(timestep)[0].unsqueeze(1).chunk(12, dim=2) + ) = _scale_shift_table.unsqueeze(1).chunk(12, dim=2) latents_text = latents[:, :self.text_length] latents_vid = latents[:, self.text_length:] latents_vid = self.norm1(latents_vid) latents_text = self.norm1(latents_text) latents_vid = latents_vid * (1 + scale_msa) + shift_msa latents_text = latents_text * (1 + text_scale_msa) + text_shift_msa - if self.enable_sequence_parallelism: + if self.enable_sequence_parallelism or self.sequence_parallel: norm_latents = torch.cat((latents_text, latents_vid), dim=0) # (s_t + t * h/2 * w/2, b, n * d) else: norm_latents = torch.cat((latents_text, latents_vid), dim=1) # (b, s_t + t * h/2 * w/2, n * d) if self.pos_embed is not None and self.positional_embeddings is not None: - norm_latents = self.pos_embed(norm_latents) + norm_latents = self.pos_embed(norm_latents, rope_T=rope_T, rope_H=rope_H, rope_W=rope_W) attn_output = self.self_atten( query=norm_latents, @@ -607,8 +788,11 @@ class VideoDiTBlock(nn.Module): frames=frames, height=height, width=width, + rope_T=rope_T, + rope_H=rope_H, + rope_W=rope_W ) - if self.enable_sequence_parallelism: + if self.enable_sequence_parallelism or self.sequence_parallel: attn_vid_output = gate_msa * attn_output[self.text_length:] attn_text_output = text_gate_msa * attn_output[:self.text_length] attn_output = torch.cat((attn_text_output, attn_vid_output), dim=0) @@ -620,7 +804,7 @@ class 
VideoDiTBlock(nn.Module): latents = attn_output + latents # 2. Feed-forward - if self.enable_sequence_parallelism: + if self.enable_sequence_parallelism or self.sequence_parallel: latents_text = latents[:self.text_length] latents_vid = latents[self.text_length:] latents_text = self.norm2(latents_text) @@ -639,7 +823,7 @@ class VideoDiTBlock(nn.Module): ff_output = self.ff(norm_latents) - if self.enable_sequence_parallelism: + if self.enable_sequence_parallelism or self.sequence_parallel: ff_vid_output = gate_mlp * ff_output[self.text_length:] ff_text_output = text_gate_mlp * ff_output[:self.text_length] ff_output = torch.cat((ff_text_output, ff_vid_output), dim=0) diff --git a/mindspeed_mm/models/predictor/predict_model.py b/mindspeed_mm/models/predictor/predict_model.py index 83fc7724..f263b62e 100644 --- a/mindspeed_mm/models/predictor/predict_model.py +++ b/mindspeed_mm/models/predictor/predict_model.py @@ -1,5 +1,6 @@ from torch import nn from megatron.training.utils import print_rank_0 +from megatron.core import mpu from mindspeed_mm.models.common.checkpoint import load_checkpoint from .dits import VideoDiT, Latte, STDiT, STDiT3, VideoDitSparse, SatDiT, VideoDitSparseI2V, PTDiT @@ -28,6 +29,7 @@ class PredictModel(nn.Module): def __init__(self, config): super().__init__() model_cls = PREDICTOR_MODEL_MAPPINGS[config.model_id] + config = self._build_predictor_layers_config(config) self.predictor = model_cls(**config.to_dict()) if config.from_pretrained is not None: load_checkpoint(self.predictor, config.from_pretrained) @@ -35,3 +37,29 @@ class PredictModel(nn.Module): def get_model(self): return self.predictor + + def _build_predictor_layers_config(self, config): + if mpu.get_pipeline_model_parallel_world_size() <= 1: + return config + + pp_rank = mpu.get_pipeline_model_parallel_rank() + + if not hasattr(config, "pipeline_num_layers"): + raise ValueError(f"The `pipeline_num_layers` must be specified in the config for pipeline parallel") + if sum(config.pipeline_num_layers) != config.num_layers: + raise ValueError(f"The sum of `pipeline_num_layers` must be equal to the `num_layers`") + + local_num_layers = config.pipeline_num_layers[pp_rank] + if local_num_layers <= 0: + raise ValueError(f"for pp_rank {pp_rank}, the predictor layer is {local_num_layers}, " + f"which is invalid. 
") + + pipeline_start_idx = sum(config.pipeline_num_layers[:pp_rank]) + pipeline_end_idx = sum(config.pipeline_num_layers[:pp_rank + 1]) + + config.num_layers = local_num_layers + config.pre_process = mpu.is_pipeline_first_stage() + config.post_process = mpu.is_pipeline_last_stage() + config.global_layer_idx = tuple(range(pipeline_start_idx, pipeline_end_idx)) + + return config \ No newline at end of file diff --git a/mindspeed_mm/models/qwen2vl_model.py b/mindspeed_mm/models/qwen2vl_model.py index 0828165f..28d9a2fc 100644 --- a/mindspeed_mm/models/qwen2vl_model.py +++ b/mindspeed_mm/models/qwen2vl_model.py @@ -271,17 +271,17 @@ class Qwen2VLModel(MultiModalModule): def forward( - self, - input_ids: torch.Tensor, - pixel_values: Optional[torch.Tensor] = None, - image_grid_thw: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - inference_params: Optional[InferenceParams] = None, - decoder_input: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - packed_seq_params: Optional[PackedSeqParams] = None, - extra_block_kwargs: Optional[dict] = None, + self, + input_ids: torch.Tensor, + pixel_values: Optional[torch.Tensor] = None, + image_grid_thw: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + inference_params: Optional[InferenceParams] = None, + decoder_input: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + packed_seq_params: Optional[PackedSeqParams] = None, + extra_block_kwargs: Optional[dict] = None, ) -> Union[Dict[str, torch.Tensor], torch.Tensor]: vit_embeds = None diff --git a/mindspeed_mm/models/sora_model.py b/mindspeed_mm/models/sora_model.py index 86105c94..5482ae7a 100644 --- a/mindspeed_mm/models/sora_model.py +++ b/mindspeed_mm/models/sora_model.py @@ -52,14 +52,13 @@ class SoRAModel(nn.Module): self.config = core_transformer_config_from_args(get_args()) self.task = config.task if hasattr(config, "task") else "t2v" - self.pp_size = mpu.get_pipeline_model_parallel_world_size() if mpu.get_virtual_pipeline_model_parallel_world_size() is not None: raise NotImplementedError("Not support virtual_pipeline_model_parallel now. ") else: self.pp_rank = mpu.get_pipeline_model_parallel_rank() - self.pre_process = True - self.post_process = True + self.pre_process = mpu.is_pipeline_first_stage() + self.post_process = mpu.is_pipeline_last_stage() self.input_tensor = None # to avoid grad all-reduce and reduce-scatter in megatron, since SoRAModel has no embedding layer. self.share_embeddings_and_output_weights = False @@ -77,30 +76,7 @@ class SoRAModel(nn.Module): self.text_encoder.requires_grad_(False) self.diffusion = DiffusionModel(config.diffusion).get_model() - self.predictor = self._build_predictor_layers(config.predictor) - - def _build_predictor_layers(self, config): - self.predictor_cls = config.model_id - if self.pp_size <= 1: - return PredictModel(config).get_model() - - local_num_layers = config.pipeline_num_layers[self.pp_rank] - if local_num_layers <= 0: - raise ValueError(f"for pp_rank {self.pp_rank}, the predictor layer is {local_num_layers}, " - f"which is invalid. 
") - - pipeline_start_idx = sum(config.pipeline_num_layers[:self.pp_rank]) - pipeline_end_idx = sum(config.pipeline_num_layers[:self.pp_rank + 1]) - self.pre_process = pipeline_start_idx == 0 - self.post_process = pipeline_end_idx == config.num_layers - - config.num_layers = local_num_layers - config.pre_process, config.post_process = self.pre_process, self.post_process - config.global_layer_idx = tuple(range(pipeline_start_idx, pipeline_end_idx)) - if len(config.global_layer_idx) != local_num_layers: - raise ValueError("The number of global_layer_idx is not equal to local_num_layers") - - return PredictModel(config=config).get_model() + self.predictor = PredictModel(config.predictor).get_model() def set_input_tensor(self, input_tensor): self.input_tensor = input_tensor @@ -160,25 +136,24 @@ class SoRAModel(nn.Module): ) if self.post_process: - timesteps = timesteps.to(torch.int64) loss = self.compute_loss( output if isinstance(output, torch.Tensor) else output[0], latents, noised_latents, timesteps, noise, - video_mask + video_mask, + **kwargs ) return [loss] - timesteps = timesteps.to(torch.bfloat16) return self.predictor.pipeline_set_next_stage_tensor( input_list=[latents, noised_latents, timesteps, noise, video_mask], output_list=output, extra_kwargs=kwargs) def compute_loss( - self, model_output, latents, noised_latents, timesteps, noise, video_mask + self, model_output, latents, noised_latents, timesteps, noise, video_mask, **kwargs ): """compute diffusion loss""" loss_dict = self.diffusion.training_losses( @@ -188,6 +163,7 @@ class SoRAModel(nn.Module): noise=noise, t=timesteps, mask=video_mask, + **kwargs ) return loss_dict diff --git a/mindspeed_mm/tasks/evaluation/eval_datasets/__init__.py b/mindspeed_mm/tasks/evaluation/eval_datasets/__init__.py index d4c67aa9..39bb2ccc 100644 --- a/mindspeed_mm/tasks/evaluation/eval_datasets/__init__.py +++ b/mindspeed_mm/tasks/evaluation/eval_datasets/__init__.py @@ -1,6 +1,6 @@ -from .datasets_base import BaseEvalDataset from .datasets_mmmu import MMMUEvalDataset from .datasets_vqa import VQAEvalDataset +from .datasets_ai2d import AI2DEvalDataset -eval_dataset_dict = {"mmmu_dev_val": MMMUEvalDataset, "ai2d_test": BaseEvalDataset, "chartqa_test": VQAEvalDataset, +eval_dataset_dict = {"mmmu_dev_val": MMMUEvalDataset, "ai2d_test": AI2DEvalDataset, "chartqa_test": VQAEvalDataset, "docvqa_val": VQAEvalDataset} diff --git a/mindspeed_mm/tasks/evaluation/eval_prompt/__init__.py b/mindspeed_mm/tasks/evaluation/eval_prompt/__init__.py index db378600..c625c440 100644 --- a/mindspeed_mm/tasks/evaluation/eval_prompt/__init__.py +++ b/mindspeed_mm/tasks/evaluation/eval_prompt/__init__.py @@ -1,4 +1,3 @@ -from .build_prompt_base import BasePromptTemplate from .build_prompt_llava import LlavaPromptTemplate from .build_prompt_internvl import InternvlPromptTemplate from .build_prompt_qwen2vl import Qwen2vlPromptTemplate diff --git a/mindspeed_mm/tasks/evaluation/eval_prompt/build_prompt_base.py b/mindspeed_mm/tasks/evaluation/eval_prompt/build_prompt_base.py index 1eaff983..6d02b344 100644 --- a/mindspeed_mm/tasks/evaluation/eval_prompt/build_prompt_base.py +++ b/mindspeed_mm/tasks/evaluation/eval_prompt/build_prompt_base.py @@ -1,3 +1,4 @@ +from typing import Callable import string import torch @@ -12,6 +13,9 @@ class BasePromptTemplate: def __init__(self): self.device = torch.cuda.current_device() + def build_prompt(self, line, dump_image: Callable, dataset_name=None): + raise NotImplementedError('you must implement build_prompt') + @staticmethod def 
check_content_type(message): """Check the content type of the input. Four types are allowed: str, dict, ListOfString, ListOfDict. diff --git a/mindspeed_mm/tasks/evaluation/eval_prompt/build_prompt_internvl.py b/mindspeed_mm/tasks/evaluation/eval_prompt/build_prompt_internvl.py index bd8179a6..b759553c 100644 --- a/mindspeed_mm/tasks/evaluation/eval_prompt/build_prompt_internvl.py +++ b/mindspeed_mm/tasks/evaluation/eval_prompt/build_prompt_internvl.py @@ -3,9 +3,6 @@ from typing import Callable from mindspeed_mm.tasks.evaluation.eval_prompt.build_prompt_base import BasePromptTemplate from mindspeed_mm.tasks.evaluation.eval_datasets.datasets_base import datasets_type -IMAGENET_MEAN = (0.485, 0.456, 0.406) -IMAGENET_STD = (0.229, 0.224, 0.225) - class InternvlPromptTemplate(BasePromptTemplate): diff --git a/mindspeed_mm/tasks/inference/pipeline/cogvideox_pipeline.py b/mindspeed_mm/tasks/inference/pipeline/cogvideox_pipeline.py index 7af286b7..67a2d785 100644 --- a/mindspeed_mm/tasks/inference/pipeline/cogvideox_pipeline.py +++ b/mindspeed_mm/tasks/inference/pipeline/cogvideox_pipeline.py @@ -48,7 +48,6 @@ class CogVideoXPipeline(MMPipeline, InputsCheckMixin, MMEncoderMixin): self.num_frames, self.height, self.width = config.get("input_size", [49, 480, 720]) self.generator = torch.Generator().manual_seed(config.get("seed", 42)) self.num_videos_per_prompt = 1 - self.max_sequence_length = 226 self.guidance_scale = config.get("guidance_scale", 6.0) self.scheduler.use_dynamic_cfg = config.get("use_dynamic_cfg", True) @@ -105,10 +104,7 @@ class CogVideoXPipeline(MMPipeline, InputsCheckMixin, MMEncoderMixin): negative_prompt_embeds: Optional[torch.FloatTensor] = None, **kwargs ): - if self.num_frames > 49: - raise ValueError( - "The number of frames must be less than 49 for now due to static positional embeddings. This will be updated in the future to remove this limitation." - ) + self.max_sequence_length = kwargs.pop("max_sequence_length", 226) height = self.height or self.predict_model.config.sample_size * self.vae_scale_factor_spatial width = self.width or self.predict_model.config.sample_size * self.vae_scale_factor_spatial diff --git a/mindspeed_mm/tools/README.md b/mindspeed_mm/tools/README.md index 2a7db53b..9115335e 100644 --- a/mindspeed_mm/tools/README.md +++ b/mindspeed_mm/tools/README.md @@ -55,6 +55,7 @@ prof.stop() --start_step # 设置启动采集的步数 --end_step # 设置结束采集的步数 --data_simplification # 采集时是否采用简化数据 + --aic_metrics_type # 采集模式,目前支持PipeUtilization和ArithmeticUtilization两种,默认采用PipeUtilization ``` 3. 运行模型并采集profiling文件 diff --git a/mindspeed_mm/tools/profiler.py b/mindspeed_mm/tools/profiler.py index 0f24b593..e69c0507 100644 --- a/mindspeed_mm/tools/profiler.py +++ b/mindspeed_mm/tools/profiler.py @@ -1,4 +1,5 @@ -# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# coding=utf-8 +# Copyright (c) 2024, HUAWEI CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -64,7 +65,9 @@ class Profiler: self.sp_data_simplification = config.static_param.data_simplification self.dp_config_path = config.dynamic_param.config_path - + + self.aic_metrics_type = config.static_param.aic_metrics_type + if self.profile_type == "static": if self.sp_level == 'level0': profiler_level = torch_npu.profiler.ProfilerLevel.Level0 @@ -75,9 +78,14 @@ class Profiler: else: raise ValueError(f"profiler_level only supports level0," f" 1, and 2, but gets {self.sp_level}") - + if self.aic_metrics_type == 'PipeUtilization': + aic_metrics_type = torch_npu.profiler.AiCMetrics.PipeUtilization + elif self.aic_metrics_type == 'ArithmeticUtilization': + aic_metrics_type = torch_npu.profiler.AiCMetrics.ArithmeticUtilization + else: + raise ValueError(f"aic_metrics_type only supports PipeUtilization and ArithmeticUtilization") experimental_config = torch_npu.profiler._ExperimentalConfig( - aic_metrics=torch_npu.profiler.AiCMetrics.ArithmeticUtilization, + aic_metrics=aic_metrics_type, profiler_level=profiler_level, data_simplification=self.sp_data_simplification, ) diff --git a/mindspeed_mm/tools/tools.json b/mindspeed_mm/tools/tools.json index caf4bd02..0858b403 100644 --- a/mindspeed_mm/tools/tools.json +++ b/mindspeed_mm/tools/tools.json @@ -12,7 +12,8 @@ "save_path": "./npu_profiling", "start_step": 10, "end_step": 11, - "data_simplification": false + "data_simplification": false, + "aic_metrics_type": "PipeUtilization" }, "dynamic_param": { "config_path": "path to dynamic config folder" diff --git a/redis_utils.py b/redis_utils.py deleted file mode 100644 index 3d6eb621..00000000 --- a/redis_utils.py +++ /dev/null @@ -1,86 +0,0 @@ -import redis -import json -import time - -from concurrent.futures import ThreadPoolExecutor - - -class RedisManager: - def __init__(self, host, port, db, model_name, ping_interval=10): - self.conn_pool = redis.ConnectionPool(host=host, port=int(port), db=db) - self.last_ping_time = time.time() - self.model_name = model_name - self.lock_prefix = 'lock:' - self.ping_interval = ping_interval - - def _ping(self): - current_time = time.time() - if current_time - self.last_ping_time > self.ping_interval: - redis_conn = redis.Redis(connection_pool=self.conn_pool) - redis_conn.ping() - redis_conn.close() - self.last_ping_time = current_time - - def write_data_to_result(self, model_name, message_id, data): - while True: - try: - self._ping() - break - except Exception as _: - continue - r = redis.Redis(connection_pool=self.conn_pool) - if message_id != "dummy": - r.hset(model_name + "_result", message_id, json.dumps(data)) - r.close() - - def fetch_single_request_without_queue(self): - r = redis.Redis(connection_pool=self.conn_pool) - - request_data = None - - # Directly access the fixed hash - message_ids = r.hkeys(self.model_name) - for message_id in message_ids: - lock_key = self.lock_prefix + message_id.decode("utf-8") - # Try to acquire a lock for the field - if r.setnx(lock_key, 1): # If successfully set, we have acquired the lock - r.expire(lock_key, 10) # Set an expiration time for the lock to avoid deadlocks - try: - result = r.hget(self.model_name, message_id) - if result: - request_data = json.loads(result.decode("utf-8")) - request_data["main_key"] = self.model_name - request_data["message_id"] = message_id.decode("utf-8") - # Delete the field after retrieval - r.hdel(self.model_name, message_id) - finally: - # Release the lock - r.delete(lock_key) - break - - r.close() - - return request_data - - def fetch_batch_of_requests(self, batch_size=4): - while 
True: - try: - self._ping() - break - except Exception as e: - print("Fail to connect Redis, retry") - continue - - batch = [] - - with ThreadPoolExecutor(max_workers=batch_size) as executor: - futures = [ - executor.submit(self.fetch_single_request_without_queue) for _ in range(batch_size) - ] - - for future in futures: - result = future.result() - if result: - batch.append(result) - - return batch diff --git a/tests/st/shell_scripts/finetune_internvl2_8B.sh b/tests/st/shell_scripts/finetune_internvl2_8B.sh index 8328cb77..567c957d 100644 --- a/tests/st/shell_scripts/finetune_internvl2_8B.sh +++ b/tests/st/shell_scripts/finetune_internvl2_8B.sh @@ -6,15 +6,14 @@ export COMBINED_ENABLE=1 export CPU_AFFINITY_CONF=1 export HCCL_CONNECT_TIMEOUT=1200 export CUDA_DEVICE_MAX_CONNECTIONS=1 -export HOST_CACHE_CAPACITY=20 export ACLNN_CACHE_LIMIT=100000 -GPUS_PER_NODE=8 +NPUS_PER_NODE=8 MASTER_ADDR=localhost MASTER_PORT=6000 NNODES=1 NODE_RANK=0 -WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) MBS=1 GRAD_ACC_STEP=64 @@ -38,7 +37,7 @@ MM_ARGS=" " DISTRIBUTED_ARGS=" - --nproc_per_node $GPUS_PER_NODE \ + --nproc_per_node $NPUS_PER_NODE \ --nnodes $NNODES \ --node_rank $NODE_RANK \ --master_addr $MASTER_ADDR \ diff --git a/tests/st/shell_scripts/finetune_qwen2vl_7B.sh b/tests/st/shell_scripts/finetune_qwen2vl_7B.sh index 56f58202..a8fe877b 100644 --- a/tests/st/shell_scripts/finetune_qwen2vl_7B.sh +++ b/tests/st/shell_scripts/finetune_qwen2vl_7B.sh @@ -8,7 +8,6 @@ export CPU_AFFINITY_CONF=1 export HCCL_CONNECT_TIMEOUT=1200 export NPU_ASD_ENABLE=0 export ASCEND_LAUNCH_BLOCKING=0 -export HOST_CACHE_CAPACITY=20 export ACLNN_CACHE_LIMIT=100000 NPUS_PER_NODE=8 diff --git a/tests/st/shell_scripts/inference_qwen2vl_7b_pp1.sh b/tests/st/shell_scripts/inference_qwen2vl_7b_pp1.sh index db0193e6..990a1e9c 100644 --- a/tests/st/shell_scripts/inference_qwen2vl_7b_pp1.sh +++ b/tests/st/shell_scripts/inference_qwen2vl_7b_pp1.sh @@ -1,8 +1,6 @@ #!/bin/bash source /usr/local/Ascend/ascend-toolkit/set_env.sh -# 通过此配置选择使用的NPU卡 -# export ASCEND_RT_VISIBLE_DEVICES=1 export CUDA_DEVICE_MAX_CONNECTIONS=1 export ASCEND_SLOG_PRINT_TO_STDOUT=0 export ASCEND_GLOBAL_LOG_LEVEL=3 @@ -12,7 +10,6 @@ export CPU_AFFINITY_CONF=1 export HCCL_CONNECT_TIMEOUT=1200 export NPU_ASD_ENABLE=0 export ASCEND_LAUNCH_BLOCKING=0 -export HOST_CACHE_CAPACITY=20 export ACLNN_CACHE_LIMIT=100000 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True diff --git a/tests/st/shell_scripts/inference_qwen2vl_7b_pp4.sh b/tests/st/shell_scripts/inference_qwen2vl_7b_pp4.sh index 023bbf5b..a7bf929e 100644 --- a/tests/st/shell_scripts/inference_qwen2vl_7b_pp4.sh +++ b/tests/st/shell_scripts/inference_qwen2vl_7b_pp4.sh @@ -1,8 +1,6 @@ #!/bin/bash source /usr/local/Ascend/ascend-toolkit/set_env.sh -# 通过此配置选择使用的NPU卡 -# export ASCEND_RT_VISIBLE_DEVICES=1 export CUDA_DEVICE_MAX_CONNECTIONS=1 export ASCEND_SLOG_PRINT_TO_STDOUT=0 export ASCEND_GLOBAL_LOG_LEVEL=3 @@ -12,7 +10,6 @@ export CPU_AFFINITY_CONF=1 export HCCL_CONNECT_TIMEOUT=1200 export NPU_ASD_ENABLE=0 export ASCEND_LAUNCH_BLOCKING=0 -export HOST_CACHE_CAPACITY=20 export ACLNN_CACHE_LIMIT=100000 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True diff --git a/tests/st/shell_scripts/pretrain_llava1_5.sh b/tests/st/shell_scripts/pretrain_llava1_5.sh index 82b900ec..7a50120d 100644 --- a/tests/st/shell_scripts/pretrain_llava1_5.sh +++ b/tests/st/shell_scripts/pretrain_llava1_5.sh @@ -7,12 +7,12 @@ export COMBINED_ENABLE=1 export CPU_AFFINITY_CONF=1 export HCCL_CONNECT_TIMEOUT=1200 
-GPUS_PER_NODE=4 +NPUS_PER_NODE=4 MASTER_ADDR=localhost MASTER_PORT=6000 NNODES=1 NODE_RANK=0 -WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) TP=1 PP=1 @@ -27,7 +27,7 @@ MM_MODEL="$BASEPATH/tests/st/run_configs/pretrain_llava1_5/model.json" MM_TOOL="$BASEPATH/mindspeed_mm/tools/tools.json" DISTRIBUTED_ARGS=" - --nproc_per_node $GPUS_PER_NODE \ + --nproc_per_node $NPUS_PER_NODE \ --nnodes $NNODES \ --node_rank $NODE_RANK \ --master_addr $MASTER_ADDR \ diff --git a/tests/ut/models/common/embeddings/test_cogvideox_pos_emb.py b/tests/ut/models/common/embeddings/test_cogvideox_pos_emb.py new file mode 100644 index 00000000..060a4cf7 --- /dev/null +++ b/tests/ut/models/common/embeddings/test_cogvideox_pos_emb.py @@ -0,0 +1,10 @@ +import mindspeed.megatron_adaptor + +from mindspeed_mm.models.common.embeddings.pos_embeddings import Rotary3DPositionEmbedding +from tests.ut.utils import judge_expression + + +class TestCogVideoXRope: + def test_init_rope_1_0_t2v(self): + rope = Rotary3DPositionEmbedding(30, 45, 1, 3072, 64, 226, learnable_pos_embed=False) + judge_expression(isinstance(rope, Rotary3DPositionEmbedding)) diff --git a/tests/ut/models/diffusion/test_cogvideo_diffusion.py b/tests/ut/models/diffusion/test_cogvideo_diffusion.py new file mode 100644 index 00000000..2fb7d3d0 --- /dev/null +++ b/tests/ut/models/diffusion/test_cogvideo_diffusion.py @@ -0,0 +1,121 @@ +import torch +import pytest +import numpy as np + +from mindspeed_mm.models.diffusion.cogvideo_diffusion import append_dims, default, append_zero, \ + generate_roughly_equally_spaced_steps, EpsWeighting, make_beta_schedule +from tests.ut.utils import judge_expression + + +class TestCogvideoDiffusion: + + def test_append_dims_no_append_needed(self): + """Test when no dimensions need to be appended.""" + x = torch.tensor([1, 2, 3]) + result = append_dims(x, 1) + judge_expression(result.shape == (3,)) + judge_expression(torch.equal(result, x)) + + def test_append_dims_append_one_dim(self): + """Test appending one dimension.""" + x = torch.tensor([1, 2, 3]) + result = append_dims(x, 2) + expected_shape = (3, 1) + judge_expression(result.shape == expected_shape) + + def test_append_dims_append_multiple_dims(self): + """Test appending multiple dimensions.""" + x = torch.tensor([1, 2, 3]) + target_dims = 5 + result = append_dims(x, target_dims) + expected_shape = (3,) + (1,) * (target_dims - x.ndim) + judge_expression(result.shape == expected_shape) + + def test_append_dims_already_higher_dims(self): + """Test when the input already has more dimensions than target_dims.""" + x = torch.rand(2, 3, 4) + with pytest.raises(ValueError): + append_dims(x, 2) + + def test_append_dims_target_dims_equal_input_dims(self): + """Test when target_dims is equal to the number of dimensions in the input.""" + x = torch.rand(2, 3, 4) + result = append_dims(x, 3) + judge_expression(result.shape == (2, 3, 4)) + judge_expression(torch.equal(result, x)) + + def test_default_val_not_none(self): + """Test when val is not None.""" + result = default(5, lambda: 10) + judge_expression(result == 5) + + def test_default_val_none_d_is_function(self): + """Test when val is None and d is a function.""" + def func(): + return "default_value" + + result = default(None, func) + judge_expression(result == "default_value") + + def test_default_val_none_d_is_none(self): + """Test when both val and d are None.""" + result = default(None, None) + judge_expression(result is None) + + def test_append_zero_regular_tensor(self): + """Test appending 
zero to a regular tensor.""" + x = torch.tensor([1, 2, 3]) + result = append_zero(x) + expected = torch.tensor([1, 2, 3, 0]) + judge_expression(torch.equal(result, expected)) + + def test_append_zero_multidimensional_tensor(self): + """Test appending zero to a multidimensional tensor.""" + x = torch.tensor([[1, 2], [3, 4]]) + with pytest.raises(RuntimeError): + append_zero(x) + + def test_append_zero_empty_tensor(self): + """Test appending zero to an empty tensor.""" + x = torch.tensor([]) + result = append_zero(x) + expected = torch.tensor([0]) + judge_expression(torch.equal(result, expected)) + + def test_generate_roughly_equally_spaced_steps_case(self): + """Test with a normal case.""" + num_substeps = 5 + max_step = 10 + result = generate_roughly_equally_spaced_steps(num_substeps, max_step) + expected = np.array([1, 3, 5, 7, 9]) + judge_expression(np.array_equal(result, expected)) + + def test_eps_weighting_positive_input(self): + """Test with a positive input.""" + weighting = EpsWeighting() + sigma = 2.0 + result = weighting(sigma) + expected = sigma ** -2.0 + judge_expression(np.isclose(result, expected)) + + def test_make_beta_schedule_linear(self): + """Test with linear schedule.""" + n_timestep = 10 + betas = make_beta_schedule("linear", n_timestep) + expected = np.linspace(1e-4 ** 0.5, 2e-2 ** 0.5, n_timestep) ** 2 + judge_expression(np.allclose(betas, expected)) + + def test_make_beta_schedule_custom_params(self): + """Test with custom linear_start and linear_end.""" + n_timestep = 10 + linear_start = 1e-3 + linear_end = 5e-2 + betas = make_beta_schedule("linear", n_timestep, linear_start, linear_end) + expected = np.linspace(linear_start ** 0.5, linear_end ** 0.5, n_timestep) ** 2 + judge_expression(np.allclose(betas, expected)) + + def test_make_beta_schedule_no_linear(self): + """Test with no linear schedule.""" + n_timestep = 10 + with pytest.raises(NotImplementedError): + make_beta_schedule("cosine", n_timestep) -- Gitee From 51d432641fa9f8db5efe12bc55f1a37f075cb6e3 Mon Sep 17 00:00:00 2001 From: Luo Yiyang Date: Mon, 30 Dec 2024 17:21:53 +0800 Subject: [PATCH 4/4] fix conflict --- .../i2v_1.0/model_cogvideox_i2v.json | 128 ++++++++++++++++++ .../t2v_1.0/model_cogvideox_t2v.json | 1 - .../models/ae/contextparallelcausalvae.py | 2 +- 3 files changed, 129 insertions(+), 2 deletions(-) diff --git a/examples/cogvideox/i2v_1.0/model_cogvideox_i2v.json b/examples/cogvideox/i2v_1.0/model_cogvideox_i2v.json index e69de29b..56aa80ad 100644 --- a/examples/cogvideox/i2v_1.0/model_cogvideox_i2v.json +++ b/examples/cogvideox/i2v_1.0/model_cogvideox_i2v.json @@ -0,0 +1,128 @@ +{ + "frames": 25, + "resolution": [480, 720], + "allow_tf32": true, + "allow_internal_format":false, + "load_video_features": false, + "load_text_features": false, + "task": "i2v", + "predictor": { + "model_id": "satdit", + "from_pretrained": null, + "dtype": "bf16", + "num_layers": 42, + "num_heads": 48, + "head_dim": 64, + "in_channels": 32, + "out_channels": 16, + "dropout": 0.0, + "cross_attention_dim": null, + "attention_bias": true, + "input_size": [7, 60, 90], + "patch_size": [1, 2, 2], + "activation_fn": "gelu-approximate", + "num_embeds_ada_norm": 1000, + "norm_type": "qk_ln", + "norm_elementwise_affine": true, + "norm_eps": 1e-5, + "caption_channels": null, + "time_embed_dim": 512, + "text_length": 226, + "text_hidden_size": 4096, + "concat_text_embed": true, + "interpolation_scale": [1.0, 1.0, 1.0], + "learnable_pos_embed": true, + "use_rope": true + }, + "diffusion": { + "model_id": 
"cogvideo_diffusion", + "sigma_sampler_config": { + "uniform_sampling": true, + "num_idx": 1000, + "discretization_config":{ + "shift_scale": 1.0 + } + }, + "denoiser_config": { + "num_idx": 1000, + "quantize_c_noise": false, + "discretization_config":{ + "shift_scale": 1.0 + } + } + }, + "text_encoder": { + "model_id": "T5", + "hub_backend": "hf", + "from_pretrained": "5b-cogvideo", + "dtype": "bf16", + "load_in_8bit": false, + "low_cpu_mem_usage": true, + "ucg_rate": 0.1 + }, + "ae": { + "model_id": "contextparallelcasualvae", + "from_pretrained": "3d-vae.pt", + "cp_size": 1, + "dtype": "bf16", + "z_channels": 16, + "conv_padding": 0, + "num_res_blocks": 3, + "hidden_size_mult": [1,2,2,4], + "encoder_attention": "", + "encoder_nonlinearity": "swish", + "encoder_conv_in": "ContextParallelCausalConv3d", + "encoder_conv_out": "ContextParallelCausalConv3d", + "encoder_mid_resnet": "ContextParallelResnetBlock3D", + "encoder_resnet_blocks": [ + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D" + ], + "encoder_spatial_downsample": [ + "DownSample3D", + "DownSample3D", + "DownSample3D", + "" + ], + "encoder_temporal_downsample": [ + "", + "", + "", + "" + ], + "decoder_attention": "", + "decoder_nonlinearity": "swish", + "decoder_conv_in": "ContextParallelCausalConv3d", + "decoder_conv_out": "ContextParallelCausalConv3d", + "decoder_mid_resnet": "ContextParallelResnetBlock3D", + "decoder_resnet_blocks": [ + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D" + ], + "decoder_spatial_upsample": [ + "", + "Upsample3D", + "Upsample3D", + "Upsample3D" + ], + "decoder_temporal_upsample": [ + "", + "", + "", + "" + ], + "encoder_gather_norm": true, + "decoder_gather_norm": true, + "use_quant_layer": false, + "i2v_processor": { + "processor_id": "cogvideox_i2v_processor", + "noised_image_all_concat": false, + "noised_image_dropout": 0.05, + "noised_image_input": true + } + } +} \ No newline at end of file diff --git a/examples/cogvideox/t2v_1.0/model_cogvideox_t2v.json b/examples/cogvideox/t2v_1.0/model_cogvideox_t2v.json index cfe12471..367f3f06 100644 --- a/examples/cogvideox/t2v_1.0/model_cogvideox_t2v.json +++ b/examples/cogvideox/t2v_1.0/model_cogvideox_t2v.json @@ -37,7 +37,6 @@ "model_id": "cogvideo_diffusion", "sigma_sampler_config": { "uniform_sampling": true, - "group_num": 8, "num_idx": 1000, "discretization_config":{ "shift_scale": 1.0 diff --git a/mindspeed_mm/models/ae/contextparallelcausalvae.py b/mindspeed_mm/models/ae/contextparallelcausalvae.py index bad49ca0..7716f483 100644 --- a/mindspeed_mm/models/ae/contextparallelcausalvae.py +++ b/mindspeed_mm/models/ae/contextparallelcausalvae.py @@ -773,7 +773,7 @@ class Decoder(nn.Module): for i_block in range(self.num_res_blocks + 1): h = self.up[i_level].block[i_block](h, zq=zq, enable_cp=enable_cp) if len(self.up[i_level].attn) > 0: - h = self.up[i_level].attn[i_block](h, zq=zq, enable_cp=enable_cp) + h = self.up[i_level].attn[i_block](h, zq=zq) if hasattr(self.up[i_level], "upsample"): h = self.up[i_level].upsample(h, enable_cp=enable_cp) if hasattr(self.up[i_level], "time_upsample"): -- Gitee
MindSpeed-MM model list (multimodal generation):

| Model | Task | Cluster | Precision | Throughput (SPS) | Status |
| --- | --- | --- | --- | --- | --- |
| OpenSora 1.0 5.5B | Pre-training | / | / | / | Pass |
| CogVideoX-T2V 5B | Pre-training | 1x8 | BF16 | 0.37 / 0.46 | Pass |
| CogVideoX-I2V 5B | Pre-training | 1x8 | BF16 | 0.37 / 0.46 | Pass |
| CogVideoX-I2V 5B | Pre-training (affinity scenario) | 1x8 | BF16 | 0.92 / 0.96 | Pass |
| 76B | Full-parameter fine-tuning | 8x16 | BF16 | / | / |
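The VideoDiTBlock changes earlier in the series project the timestep embedding through scale_shift_table (SiLU plus a column-parallel linear producing 12 * dim features), gather it when tensor or sequence parallelism is enabled, and then chunk it into shift/scale/gate triples that modulate the text and video tokens separately. The toy example below reproduces that adaLN-style modulation on a single device with a plain torch.nn.Linear; the shapes, the chunk names and the batch-first layout are assumptions for illustration, not the exact tensors used in the model.

```python
# Toy single-device version of the scale/shift/gate modulation in VideoDiTBlock:
# project the timestep embedding to 12 * dim, split it into 12 modulation tensors,
# and apply x * (1 + scale) + shift to the video and text token streams separately.
import torch
from torch import nn

dim, time_embed_dim, batch = 64, 128, 2
seq_text, seq_vid = 8, 16

scale_shift_table = nn.Sequential(nn.SiLU(), nn.Linear(time_embed_dim, 12 * dim))

timestep = torch.randn(batch, 1, time_embed_dim)        # (b, 1, t_dim)
latents = torch.randn(batch, seq_text + seq_vid, dim)   # (b, s_text + s_vid, d)

(shift_msa, scale_msa, gate_msa,
 shift_mlp, scale_mlp, gate_mlp,
 text_shift_msa, text_scale_msa, text_gate_msa,
 text_shift_mlp, text_scale_mlp, text_gate_mlp) = scale_shift_table(timestep).chunk(12, dim=2)

latents_text, latents_vid = latents[:, :seq_text], latents[:, seq_text:]
latents_vid = latents_vid * (1 + scale_msa) + shift_msa              # broadcast over sequence dim
latents_text = latents_text * (1 + text_scale_msa) + text_shift_msa
print(latents_vid.shape, latents_text.shape)  # (2, 16, 64) and (2, 8, 64)
```

The gate tensors play the same role after attention and the feed-forward: the block output is multiplied by gate_msa or gate_mlp before being added back to the residual stream, which is why the patch splits the gathered table into twelve pieces (text and video each get their own shift, scale, and gate for both sublayers) rather than six.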