From c700c8bb23c9f795664fbe00f6a33920f179bfdf Mon Sep 17 00:00:00 2001 From: Lawrence Date: Wed, 25 Dec 2024 11:25:59 +0800 Subject: [PATCH 1/4] init update: support no image input --- .gitignore | 8 +- examples/qwen2vl/data_7b.json | 2 +- examples/qwen2vl/finetune_qwen2vl_7b.sh | 14 +-- examples/qwen2vl/inference_qwen2vl_7b.json | 9 +- examples/qwen2vl/inference_qwen2vl_7b.sh | 6 +- .../llava_instruct_2_mllm_demo_format.py | 12 ++- .../qwen2vl/qwen2vl_convert_to_mm_ckpt.py | 8 +- inference_vlm.py | 97 ++++++++++++++++++- mindspeed_mm/models/qwen2vl_model.py | 39 ++++---- .../inference/pipeline/qwen2vl_pipeline.py | 13 ++- pretrain_qwen2vl.py | 24 +++-- redis_utils.py | 86 ++++++++++++++++ 12 files changed, 261 insertions(+), 57 deletions(-) create mode 100644 redis_utils.py diff --git a/.gitignore b/.gitignore index 7aa4dbb4..03b38668 100644 --- a/.gitignore +++ b/.gitignore @@ -157,4 +157,10 @@ cython_debug/ /tests/st/run_jsons/ /tests/st/run_logs/ -fusion_result.json \ No newline at end of file +fusion_result.json + +MindSpeed/ +ckpt/ +data/ +dependencies/ +save_dir*/ \ No newline at end of file diff --git a/examples/qwen2vl/data_7b.json b/examples/qwen2vl/data_7b.json index 39b7869f..09cbfd5c 100644 --- a/examples/qwen2vl/data_7b.json +++ b/examples/qwen2vl/data_7b.json @@ -25,7 +25,7 @@ }, "attr": { "system": null, - "images": "images", + "images": null, "videos": null, "messages": "messages", "role_tag": "role", diff --git a/examples/qwen2vl/finetune_qwen2vl_7b.sh b/examples/qwen2vl/finetune_qwen2vl_7b.sh index b65e8ea7..a271be79 100644 --- a/examples/qwen2vl/finetune_qwen2vl_7b.sh +++ b/examples/qwen2vl/finetune_qwen2vl_7b.sh @@ -24,8 +24,10 @@ WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) MM_DATA="./examples/qwen2vl/data_7b.json" MM_MODEL="./examples/qwen2vl/model_7b.json" MM_TOOL="./mindspeed_mm/tools/tools.json" -LOAD_PATH="ckpt/Qwen2-VL-7B-Instruct" -SAVE_PATH="save_dir" +LOAD_PATH="./ckpt/mm_path/Qwen2-VL-7B-Instruct" +# timestamp=$(date +"%Y%m%d_%H%M%S") +# SAVE_PATH=$("save_dir/" + $timestamp) +SAVE_PATH="save_dir/" TP=1 PP=4 @@ -68,7 +70,7 @@ GPT_ARGS=" --lr 1.0e-5 \ --lr-decay-style cosine \ --weight-decay 0 \ - --train-iters 10000 \ + --train-iters 100 \ --lr-warmup-fraction 0.1 \ --clip-grad 0.0 \ --adam-beta1 0.9 \ @@ -95,9 +97,9 @@ MM_ARGS=" OUTPUT_ARGS=" --log-interval 1 \ - --save-interval 10000 \ - --eval-interval 10000 \ - --eval-iters 5000 \ + --save-interval 100 \ + --eval-interval 100 \ + --eval-iters 100 \ --save $SAVE_PATH \ " logfile=$(date +%Y%m%d)_$(date +%H%M%S) diff --git a/examples/qwen2vl/inference_qwen2vl_7b.json b/examples/qwen2vl/inference_qwen2vl_7b.json index b357697a..a0120a93 100644 --- a/examples/qwen2vl/inference_qwen2vl_7b.json +++ b/examples/qwen2vl/inference_qwen2vl_7b.json @@ -85,7 +85,7 @@ "tokenizer": { "hub_backend": "hf", "autotokenizer_name": "AutoTokenizer", - "from_pretrained": "ckpt/Qwen2-VL-7B-Instruct", + "from_pretrained": "ckpt/hf_path/Qwen2-VL-7B-Instruct", "local_files_only":false }, "generation_config": { @@ -153,7 +153,8 @@ "max_matching_ngram_size": null, "stop_strings": null }, - "image_processer_path": "ckpt/Qwen2-VL-7B-Instruct/preprocessor_config.json", - "image_path": "examples/qwen2vl/demo.jpeg", - "prompts": "Describe this image and keep it within 100 words." 
+ "image_processer_path": "ckpt/hf_path/Qwen2-VL-7B-Instruct/preprocessor_config.json", + "image_path": "", + "prompts": "Describe this image and keep it within 100 words.", + "temperature": 0 } diff --git a/examples/qwen2vl/inference_qwen2vl_7b.sh b/examples/qwen2vl/inference_qwen2vl_7b.sh index 08096fa9..f5b64867 100644 --- a/examples/qwen2vl/inference_qwen2vl_7b.sh +++ b/examples/qwen2vl/inference_qwen2vl_7b.sh @@ -16,7 +16,7 @@ export HOST_CACHE_CAPACITY=20 export ACLNN_CACHE_LIMIT=100000 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True -NPUS_PER_NODE=1 +NPUS_PER_NODE=4 MASTER_ADDR=localhost MASTER_PORT=6000 NNODES=1 @@ -24,10 +24,10 @@ NODE_RANK=0 WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) MM_MODEL="./examples/qwen2vl/inference_qwen2vl_7b.json" -LOAD_PATH="ckpt/Qwen2-VL-7B-Instruct" +LOAD_PATH="/home/ma-user/work/MindSpeed-MM/save_dir/" TP=1 -PP=1 +PP=4 CP=1 SEQ_LEN=1024 MBS=1 diff --git a/examples/qwen2vl/llava_instruct_2_mllm_demo_format.py b/examples/qwen2vl/llava_instruct_2_mllm_demo_format.py index 90e43835..4bbd5a75 100644 --- a/examples/qwen2vl/llava_instruct_2_mllm_demo_format.py +++ b/examples/qwen2vl/llava_instruct_2_mllm_demo_format.py @@ -2,7 +2,8 @@ import json import os import stat -llava_json_path = "./data/llava_instruct_150k.json" +# llava_json_path = "./data/llava_instruct_150k.json" +llava_json_path = "./data/llava_instruct_150k_wo_img.json" mllm_format_json_path = "./data/mllm_format_llava_instruct_data.json" with open(llava_json_path, "r") as f: @@ -10,12 +11,13 @@ with open(llava_json_path, "r") as f: mllm_format_llava_instruct_data = [] for item in info_json: - img_path = os.path.join("./data/COCO2017/train2017", item["image"]) - print(f"img_path: {img_path}") + # img_path = os.path.join("./data/COCO2017/train2017", item["image"]) + img_path = os.path.join("./data/dummy", item["image"]) if not os.path.exists(img_path): continue new_item = { "images": [img_path], + # "images": "", "messages": [] } @@ -29,6 +31,8 @@ for item in info_json: mllm_format_llava_instruct_data.append(new_item) output_json = json.dumps(mllm_format_llava_instruct_data) +if os.path.exists(mllm_format_json_path): + os.remove(mllm_format_json_path) with os.fdopen(os.open(mllm_format_json_path, os.O_WRONLY | os.O_CREAT | os.O_EXCL, stat.S_IWUSR | stat.S_IRUSR), "w") as f: f.write(output_json) -print("finish converting dataset") +print(f"finish converting dataset into {mllm_format_json_path}") diff --git a/examples/qwen2vl/qwen2vl_convert_to_mm_ckpt.py b/examples/qwen2vl/qwen2vl_convert_to_mm_ckpt.py index 4daf73dd..b85e6c5f 100644 --- a/examples/qwen2vl/qwen2vl_convert_to_mm_ckpt.py +++ b/examples/qwen2vl/qwen2vl_convert_to_mm_ckpt.py @@ -305,9 +305,11 @@ def save_by_pp(_state_dicts, _save_dir, _lastest_checkpointed_iteration='release if __name__ == "__main__": - hg_ckpt_dir = "Qwen2-VL-7B-Instruct" - mm_save_dir = 'ckpt/Qwen2-VL-7B-Instruct' - + # hg_ckpt_dir = "Qwen2-VL-7B-Instruct" + # mm_save_dir = 'ckpt/Qwen2-VL-7B-Instruct' + hg_ckpt_dir = 'ckpt/hf_path/Qwen2-VL-7B-Instruct' # huggingface权重目录 + mm_save_dir = 'ckpt/mm_path/Qwen2-VL-7B-Instruct' # 转换后保存目录 + vit_hidden_size = 1280 vit_attention_heads_num = 16 diff --git a/inference_vlm.py b/inference_vlm.py index d168bfd1..4024f046 100644 --- a/inference_vlm.py +++ b/inference_vlm.py @@ -1,26 +1,113 @@ +import os import torch +from time import time, sleep +from concurrent.futures import ThreadPoolExecutor import mindspeed.megatron_adaptor from megatron.training import get_args from mindspeed_mm.tasks.inference.pipeline import 
vlm_pipeline_dict from mindspeed_mm.configs.config import mm_extra_args_provider +from redis_utils import RedisManager + + +def inference_and_write(batch_requests, pipeline, redis: RedisManager, model_name: str): + # Extract input data + sys_prompts = [req["sys_prompt"] for req in batch_requests] + queries = [req["query"] for req in batch_requests] + main_ids = [req["message_id"] for req in batch_requests] + + full_queries = sys_prompts[0] + queries[0] + + start_time = time() + outputs = pipeline(prompt=full_queries, return_ids=True) + print(f"Inference time: {time() - start_time}") + + if not isinstance(outputs, list): + outputs = [outputs] + print(outputs) + + def write_to_redis(message_id, output): + while True: + try: + redis.write_data_to_result(model_name, message_id, {"output": output}) + break + except Exception as e: + continue + + # Multithreaded writing to Redis + with ThreadPoolExecutor() as executor: + futures = [ + executor.submit(write_to_redis, message_id, output) + for message_id, output in zip(main_ids, outputs) + ] + + # Wait until all futures are completed + for future in futures: + future.result() + def main(): from megatron.training.initialize import initialize_megatron from mindspeed_mm.configs.config import merge_mm_args - # just inference torch.set_grad_enabled(False) initialize_megatron( - extra_args_provider=mm_extra_args_provider, args_defaults={'tokenizer_type': 'GPT2BPETokenizer'} + extra_args_provider=mm_extra_args_provider, + args_defaults={"tokenizer_type": "GPT2BPETokenizer"}, ) args = get_args() merge_mm_args(args) inference_config = args.mm.model - vlm_pipeline_dict[inference_config.pipeline_class](inference_config)() + + pipeline = vlm_pipeline_dict[inference_config.pipeline_class](inference_config) + + # Redis + assert os.environ[ + "MODEL_NAME" + ], "Environment variable MODEL_NAME was not set. Please set it manually." + model_name = os.environ["MODEL_NAME"] + assert os.environ[ + "REDIS_URL" + ], "Environment variable REDIS_URL was not set. Please set it manually." + redis_url = os.environ["REDIS_URL"] + assert os.environ[ + "REDIS_PORT" + ], "Environment variable REDIS_PORT was not set. Please set it manually." + redis_port = os.environ["REDIS_PORT"] + assert os.environ[ + "REDIS_DB" + ], "Environment variable REDIS_DB was not set. Please set it manually." 
+ redis_db = os.environ["REDIS_DB"] + redis = RedisManager( + host=redis_url, port=redis_port, db=redis_db, model_name=model_name + ) + + batch_size = 1 # Define your batch size + + while True: + try: + batch_requests = redis.fetch_batch_of_requests(batch_size) + except Exception as e: + continue + + if batch_requests: + # Padding to batch size + while len(batch_requests) < batch_size: + batch_requests.append( + { + "query": "", + "sys_prompt": "", + "message_id": "dummy", + } + ) + + inference_and_write(batch_requests, pipeline, redis, model_name) + else: + sleep(0.01) -if __name__ == '__main__': - main() \ No newline at end of file +if __name__ == "__main__": + with torch.inference_mode(): + main() diff --git a/mindspeed_mm/models/qwen2vl_model.py b/mindspeed_mm/models/qwen2vl_model.py index 62263777..0828165f 100644 --- a/mindspeed_mm/models/qwen2vl_model.py +++ b/mindspeed_mm/models/qwen2vl_model.py @@ -269,24 +269,26 @@ class Qwen2VLModel(MultiModalModule): causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(padding_mask, min_dtype) return causal_mask < 0 + def forward( - self, - input_ids: torch.Tensor, - pixel_values: torch.Tensor, - image_grid_thw: torch.Tensor, - attention_mask: torch.Tensor, - labels: Optional[torch.Tensor] = None, - inference_params: Optional[InferenceParams] = None, - decoder_input: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - packed_seq_params: Optional[PackedSeqParams] = None, - extra_block_kwargs: Optional[dict] = None, + self, + input_ids: torch.Tensor, + pixel_values: Optional[torch.Tensor] = None, + image_grid_thw: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + inference_params: Optional[InferenceParams] = None, + decoder_input: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + packed_seq_params: Optional[PackedSeqParams] = None, + extra_block_kwargs: Optional[dict] = None, ) -> Union[Dict[str, torch.Tensor], torch.Tensor]: - - if self.add_image_encoder: + + vit_embeds = None + + if self.add_image_encoder and pixel_values is not None: vit_embeds = self.image_encoder(pixel_values, image_grid_thw) vit_embeds = vit_embeds.reshape(-1, 1, vit_embeds.shape[-1]).clone() - output = vit_embeds else: vit_embeds = self.input_tensor @@ -295,9 +297,12 @@ class Qwen2VLModel(MultiModalModule): if self.text_decoder.pre_process: input_embeds = self.text_decoder.embedding(input_ids=input_ids, position_ids=position_ids).clone() input_embeds = input_embeds.transpose(0, 1) - image_mask = torch.eq(input_ids, self.img_context_token_id).unsqueeze(-1).expand_as(input_embeds) - vit_embeds = vit_embeds[:, 0, :] - input_embeds = input_embeds.masked_scatter(image_mask, vit_embeds) + + if vit_embeds is not None: + image_mask = torch.eq(input_ids, self.img_context_token_id).unsqueeze(-1).expand_as(input_embeds) + vit_embeds = vit_embeds[:, 0, :] + input_embeds = input_embeds.masked_scatter(image_mask, vit_embeds) + input_embeds = input_embeds.transpose(0, 1).clone() past_seen_tokens = 0 diff --git a/mindspeed_mm/tasks/inference/pipeline/qwen2vl_pipeline.py b/mindspeed_mm/tasks/inference/pipeline/qwen2vl_pipeline.py index 6cbcb596..57315a0f 100644 --- a/mindspeed_mm/tasks/inference/pipeline/qwen2vl_pipeline.py +++ b/mindspeed_mm/tasks/inference/pipeline/qwen2vl_pipeline.py @@ -40,11 +40,11 @@ class Qwen2VlPipeline(GenerationMixin): inputs = self.prepare_inputs(prompt=prompt, images=image) - if 
return_ids: - streamer = None - else: + # Use the model as a language model if no valid inputs are generated + if inputs is None: + inputs = {'input_ids': self.tokenizer(prompt, return_tensors="pt").input_ids.to(self.infer_config.device)} - streamer = TextStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True) + streamer = None if return_ids else TextStreamer(self.tokenizer, skip_prompt=True, skip_special_tokens=True) generated_ids = self.generate(**inputs, do_sample=True if self.generation_config.temperature > 0 else False, @@ -53,7 +53,7 @@ class Qwen2VlPipeline(GenerationMixin): streamer=streamer) if return_ids and generated_ids is not None: generated_ids = [ - output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, generated_ids) + output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs['input_ids'], generated_ids) ] out = self.image_processor.tokenizer.batch_decode( generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False @@ -64,6 +64,9 @@ class Qwen2VlPipeline(GenerationMixin): return None def prepare_inputs(self, prompt=None, images=None, messages=None): + if not images and not messages: + return None + if not messages: messages = [[ { diff --git a/pretrain_qwen2vl.py b/pretrain_qwen2vl.py index 98d217b4..4ab73b3a 100644 --- a/pretrain_qwen2vl.py +++ b/pretrain_qwen2vl.py @@ -27,14 +27,18 @@ def model_provider(pre_process=True, post_process=True): vlm_config.pre_process = pre_process vlm_config.post_process = post_process - vlm_config.image_encoder.vision_encoder = get_model_config(vlm_config.image_encoder.vision_encoder) - vlm_config.image_encoder.vision_projector = get_model_config(vlm_config.image_encoder.vision_projector) - vlm_config.text_decoder = get_model_config(vlm_config.text_decoder) + if vlm_config.image_encoder: + vlm_config.image_encoder.vision_encoder = get_model_config(vlm_config.image_encoder.vision_encoder) + vlm_config.image_encoder.vision_projector = get_model_config(vlm_config.image_encoder.vision_projector) + vlm_config.text_decoder = get_model_config(vlm_config.text_decoder) - model = Qwen2VLModel(vlm_config) + model = Qwen2VLModel(vlm_config) - model.freeze(freeze_image_encoder=getattr(vlm_config.image_encoder.vision_encoder, 'freeze', True), \ - freeze_image_projection=getattr(vlm_config.image_encoder.vision_projector, 'freeze', True)) + model.freeze(freeze_image_encoder=getattr(vlm_config.image_encoder.vision_encoder, 'freeze', True), \ + freeze_image_projection=getattr(vlm_config.image_encoder.vision_projector, 'freeze', True)) + else: + vlm_config.text_decoder = get_model_config(vlm_config.text_decoder) + model = Qwen2VLModel(vlm_config) return model @@ -48,8 +52,12 @@ def get_batch(data_iterator): input_ids = batch['input_ids'].to(torch.cuda.current_device()) labels = batch['labels'].to(torch.cuda.current_device()) attention_mask = batch['attention_mask'].to(torch.cuda.current_device()) - pixel_values = batch['pixel_values'].to(torch.cuda.current_device()) - image_grid_thw = batch['image_grid_thw'].to(torch.cuda.current_device()) + if 'pixel_values' in batch and 'image_grid_thw' in batch: + pixel_values = batch['pixel_values'].to(torch.cuda.current_device()) + image_grid_thw = batch['image_grid_thw'].to(torch.cuda.current_device()) + else: + pixel_values = None + image_grid_thw = None batch = { 'input_ids': input_ids, 'labels': labels, diff --git a/redis_utils.py b/redis_utils.py new file mode 100644 index 00000000..3d6eb621 --- /dev/null +++ b/redis_utils.py @@ -0,0 +1,86 @@ +import 
redis +import json +import time + +from concurrent.futures import ThreadPoolExecutor + + +class RedisManager: + def __init__(self, host, port, db, model_name, ping_interval=10): + self.conn_pool = redis.ConnectionPool(host=host, port=int(port), db=db) + self.last_ping_time = time.time() + self.model_name = model_name + self.lock_prefix = 'lock:' + self.ping_interval = ping_interval + + def _ping(self): + current_time = time.time() + if current_time - self.last_ping_time > self.ping_interval: + redis_conn = redis.Redis(connection_pool=self.conn_pool) + redis_conn.ping() + redis_conn.close() + self.last_ping_time = current_time + + def write_data_to_result(self, model_name, message_id, data): + while True: + try: + self._ping() + break + except Exception as _: + continue + r = redis.Redis(connection_pool=self.conn_pool) + if message_id != "dummy": + r.hset(model_name + "_result", message_id, json.dumps(data)) + r.close() + + def fetch_single_request_without_queue(self): + r = redis.Redis(connection_pool=self.conn_pool) + + request_data = None + + # Directly access the fixed hash + message_ids = r.hkeys(self.model_name) + for message_id in message_ids: + lock_key = self.lock_prefix + message_id.decode("utf-8") + # Try to acquire a lock for the field + if r.setnx(lock_key, 1): # If successfully set, we have acquired the lock + r.expire(lock_key, 10) # Set an expiration time for the lock to avoid deadlocks + try: + result = r.hget(self.model_name, message_id) + if result: + request_data = json.loads(result.decode("utf-8")) + request_data["main_key"] = self.model_name + request_data["message_id"] = message_id.decode("utf-8") + # Delete the field after retrieval + r.hdel(self.model_name, message_id) + finally: + # Release the lock + r.delete(lock_key) + break + + r.close() + + return request_data + + def fetch_batch_of_requests(self, batch_size=4): + while True: + try: + self._ping() + break + except Exception as e: + print("Fail to connect Redis, retry") + continue + + batch = [] + + with ThreadPoolExecutor(max_workers=batch_size) as executor: + futures = [ + executor.submit(self.fetch_single_request_without_queue) for _ in range(batch_size) + ] + + for future in futures: + result = future.result() + if result: + batch.append(result) + + return batch -- Gitee From 5370aaf49ff3af106237e238f7559fc5dc4de32d Mon Sep 17 00:00:00 2001 From: Luo Yiyang Date: Thu, 26 Dec 2024 15:25:26 +0800 Subject: [PATCH 2/4] update --- .../llava_instruct_2_mllm_demo_format.py | 23 ++++++++++++------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/examples/qwen2vl/llava_instruct_2_mllm_demo_format.py b/examples/qwen2vl/llava_instruct_2_mllm_demo_format.py index 4bbd5a75..8c3b2d7b 100644 --- a/examples/qwen2vl/llava_instruct_2_mllm_demo_format.py +++ b/examples/qwen2vl/llava_instruct_2_mllm_demo_format.py @@ -2,8 +2,9 @@ import json import os import stat -# llava_json_path = "./data/llava_instruct_150k.json" -llava_json_path = "./data/llava_instruct_150k_wo_img.json" +# llava_json_path = "./data/llava_instruct_150k_wo_img.json" +# llava_json_path = "./data/ip.json" +llava_json_path = "./data/full_data.json" mllm_format_json_path = "./data/mllm_format_llava_instruct_data.json" with open(llava_json_path, "r") as f: @@ -12,14 +13,20 @@ with open(llava_json_path, "r") as f: mllm_format_llava_instruct_data = [] for item in info_json: # img_path = os.path.join("./data/COCO2017/train2017", item["image"]) - img_path = os.path.join("./data/dummy", item["image"]) + # img_path = os.path.join("./data/dummy", 
item["image"]) + img_path = os.path.join("./data", item["image"]) if not os.path.exists(img_path): continue - new_item = { - "images": [img_path], - # "images": "", - "messages": [] - } + if not img_path.endswith(".jpg") and not img_path.endswith(".png"): + new_item = { + "images": [], + "messages": [] + } + else: + new_item = { + "images": [img_path], + "messages": [] + } for i, trun in enumerate(item["conversations"]): if trun["from"] == "human": -- Gitee From ab86f3e10c2b02894988b292ac1c8c360ae66bff Mon Sep 17 00:00:00 2001 From: Luo Yiyang Date: Mon, 30 Dec 2024 10:27:35 +0800 Subject: [PATCH 3/4] squash commit --- .gitignore | 8 +- OWNERS | 8 +- README.md | 55 ++- examples/cogvideox/README.md | 114 +++-- .../cogvideox/cogvideox_convert_to_mm_ckpt.py | 188 ++++++-- examples/cogvideox/i2v_1.0/data.json | 45 ++ .../{ => i2v_1.0}/inference_cogvideox_i2v.sh | 2 +- .../{ => i2v_1.0}/inference_model_i2v.json | 1 + .../i2v_1.0/model_cogvideox_i2v.json | 0 .../{ => i2v_1.0}/pretrain_cogvideox_i2v.sh | 10 +- examples/cogvideox/i2v_1.5/data.json | 45 ++ .../model_cogvideox_i2v_1.5.json} | 256 +++++------ .../i2v_1.5/pretrain_cogvideox_i2v.sh | 105 +++++ examples/cogvideox/t2v_1.0/data.json | 45 ++ .../inference_cogvideox_t2v.sh} | 2 +- .../inference_model_t2v.json} | 1 + .../model_cogvideox_t2v.json} | 1 + .../{ => t2v_1.0}/pretrain_cogvideox_t2v.sh | 10 +- examples/cogvideox/t2v_1.5/data.json | 45 ++ .../t2v_1.5/model_cogvideox_t2v_1.5.json | 123 ++++++ .../t2v_1.5/pretrain_cogvideox_t2v_1.5.sh | 105 +++++ examples/diffusers/flux/README.md | 26 +- .../flux/infer_flux_text2img_distrib.py | 63 +++ .../flux/infer_flux_text2img_lora_bf16.py | 1 + examples/diffusers/sd3/README.md | 28 +- .../sd3/infer_sd3_text2img_distrib.py | 75 ++++ examples/diffusers/sdxl/README.md | 38 +- .../sdxl/sdxl_text2img_distrib_infer.py | 72 +++ examples/internvl2/README.md | 82 +++- examples/internvl2/dot_product_attention.py | 179 +++----- examples/internvl2/evaluate_internvl2_8B.sh | 1 - examples/internvl2/finetune_internvl2_2B.sh | 7 +- examples/internvl2/finetune_internvl2_76B.sh | 7 +- examples/internvl2/finetune_internvl2_8B.sh | 7 +- .../internvl2/finetune_internvl2_8B_vpp.sh | 7 +- examples/internvl2/inference_internvl.sh | 7 +- ...to_hg.py => internvl2_convert_mm_to_hf.py} | 49 +-- ...kpt.py => internvl2_convert_to_mm_ckpt.py} | 7 +- examples/llava1.5/README.md | 137 ++---- examples/llava1.5/clip_converter.py | 147 +++++++ examples/llava1.5/evaluate_llava1_5.sh | 2 +- examples/llava1.5/inference_llava1_5.sh | 6 +- examples/llava1.5/pretrain_llava1_5.sh | 10 +- examples/llava1.5/vicuna_converter.py | 135 ++++++ examples/qwen2vl/README.md | 153 ++++--- examples/qwen2vl/data_2b.json | 4 +- examples/qwen2vl/data_72b.json | 4 +- examples/qwen2vl/data_7b.json | 6 +- examples/qwen2vl/dot_product_attention.py | 10 +- examples/qwen2vl/evaluate_qwen2vl_7b.sh | 1 - examples/qwen2vl/finetune_qwen2vl_2b.sh | 3 +- examples/qwen2vl/finetune_qwen2vl_72b.sh | 3 +- examples/qwen2vl/finetune_qwen2vl_7b.sh | 15 +- examples/qwen2vl/inference_qwen2vl_2b.json | 159 +++++++ examples/qwen2vl/inference_qwen2vl_2b.sh | 87 ++++ examples/qwen2vl/inference_qwen2vl_72b.json | 158 +++++++ examples/qwen2vl/inference_qwen2vl_72b.sh | 86 ++++ examples/qwen2vl/inference_qwen2vl_7b.json | 5 +- examples/qwen2vl/inference_qwen2vl_7b.sh | 10 +- .../llava_instruct_2_mllm_demo_format.py | 17 +- examples/qwen2vl/model_2b.json | 4 +- examples/qwen2vl/qwen2vl_convert_pp_to_pp.py | 33 ++ examples/qwen2vl/qwen2vl_convert_to_hf.py | 169 +++---- 
.../qwen2vl/qwen2vl_convert_to_mm_ckpt.py | 87 ++-- inference_vlm.py | 97 +---- .../models/ae/contextparallelcausalvae.py | 53 ++- mindspeed_mm/models/common/attention.py | 19 +- .../common/embeddings/patch_embeddings.py | 41 +- .../common/embeddings/pos_embeddings.py | 24 +- mindspeed_mm/models/common/normalize.py | 20 +- mindspeed_mm/models/common/updownsample.py | 46 +- .../models/diffusion/cogvideo_diffusion.py | 43 +- .../models/diffusion/diffusers_scheduler.py | 3 + mindspeed_mm/models/predictor/dits/sat_dit.py | 412 +++++++++++++----- .../models/predictor/predict_model.py | 28 ++ mindspeed_mm/models/qwen2vl_model.py | 22 +- mindspeed_mm/models/sora_model.py | 38 +- .../evaluation/eval_datasets/__init__.py | 4 +- .../tasks/evaluation/eval_prompt/__init__.py | 1 - .../eval_prompt/build_prompt_base.py | 4 + .../eval_prompt/build_prompt_internvl.py | 3 - .../inference/pipeline/cogvideox_pipeline.py | 6 +- mindspeed_mm/tools/README.md | 1 + mindspeed_mm/tools/profiler.py | 16 +- mindspeed_mm/tools/tools.json | 3 +- redis_utils.py | 86 ---- .../st/shell_scripts/finetune_internvl2_8B.sh | 7 +- tests/st/shell_scripts/finetune_qwen2vl_7B.sh | 1 - .../shell_scripts/inference_qwen2vl_7b_pp1.sh | 3 - .../shell_scripts/inference_qwen2vl_7b_pp4.sh | 3 - tests/st/shell_scripts/pretrain_llava1_5.sh | 6 +- .../embeddings/test_cogvideox_pos_emb.py | 10 + .../diffusion/test_cogvideo_diffusion.py | 121 +++++ 93 files changed, 3221 insertions(+), 1176 deletions(-) create mode 100644 examples/cogvideox/i2v_1.0/data.json rename examples/cogvideox/{ => i2v_1.0}/inference_cogvideox_i2v.sh (95%) rename examples/cogvideox/{ => i2v_1.0}/inference_model_i2v.json (99%) create mode 100644 examples/cogvideox/i2v_1.0/model_cogvideox_i2v.json rename examples/cogvideox/{ => i2v_1.0}/pretrain_cogvideox_i2v.sh (88%) create mode 100644 examples/cogvideox/i2v_1.5/data.json rename examples/cogvideox/{model_cogvideox_i2v.json => i2v_1.5/model_cogvideox_i2v_1.5.json} (91%) create mode 100644 examples/cogvideox/i2v_1.5/pretrain_cogvideox_i2v.sh create mode 100644 examples/cogvideox/t2v_1.0/data.json rename examples/cogvideox/{inference_cogvideox.sh => t2v_1.0/inference_cogvideox_t2v.sh} (95%) rename examples/cogvideox/{inference_model.json => t2v_1.0/inference_model_t2v.json} (99%) rename examples/cogvideox/{model_cogvideox.json => t2v_1.0/model_cogvideox_t2v.json} (99%) rename examples/cogvideox/{ => t2v_1.0}/pretrain_cogvideox_t2v.sh (88%) create mode 100644 examples/cogvideox/t2v_1.5/data.json create mode 100644 examples/cogvideox/t2v_1.5/model_cogvideox_t2v_1.5.json create mode 100644 examples/cogvideox/t2v_1.5/pretrain_cogvideox_t2v_1.5.sh create mode 100644 examples/diffusers/flux/infer_flux_text2img_distrib.py create mode 100644 examples/diffusers/sd3/infer_sd3_text2img_distrib.py create mode 100644 examples/diffusers/sdxl/sdxl_text2img_distrib_infer.py rename examples/internvl2/{internvl2_convert_mm_to_hg.py => internvl2_convert_mm_to_hf.py} (92%) rename examples/internvl2/{internvl_convert_to_mm_ckpt.py => internvl2_convert_to_mm_ckpt.py} (99%) create mode 100644 examples/llava1.5/clip_converter.py create mode 100644 examples/llava1.5/vicuna_converter.py create mode 100644 examples/qwen2vl/inference_qwen2vl_2b.json create mode 100644 examples/qwen2vl/inference_qwen2vl_2b.sh create mode 100644 examples/qwen2vl/inference_qwen2vl_72b.json create mode 100644 examples/qwen2vl/inference_qwen2vl_72b.sh create mode 100644 examples/qwen2vl/qwen2vl_convert_pp_to_pp.py delete mode 100644 redis_utils.py create mode 100644 
tests/ut/models/common/embeddings/test_cogvideox_pos_emb.py create mode 100644 tests/ut/models/diffusion/test_cogvideo_diffusion.py diff --git a/.gitignore b/.gitignore index 03b38668..7aa4dbb4 100644 --- a/.gitignore +++ b/.gitignore @@ -157,10 +157,4 @@ cython_debug/ /tests/st/run_jsons/ /tests/st/run_logs/ -fusion_result.json - -MindSpeed/ -ckpt/ -data/ -dependencies/ -save_dir*/ \ No newline at end of file +fusion_result.json \ No newline at end of file diff --git a/OWNERS b/OWNERS index d1332558..c2a207e7 100644 --- a/OWNERS +++ b/OWNERS @@ -40,4 +40,10 @@ reviewers: - ghoshaw - zzztq - vectorwh -- lu-jinfu1999 \ No newline at end of file +- sunnylee219 +- chenhaihui994 +- mr-lin314 +- hemiuhui +- lu-jinfu1999 +- liyx616 +- xiaoqiao12345 diff --git a/README.md b/README.md index edae6ed9..06ed5f60 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ MindSpeed-MM是面向大规模分布式训练的昇腾多模态大模型套件 ## 🔥🔥🔥Latest News +* [Dec. 19, 2024]: 🎉 MindSpeed-MM生成类模型支持分布式推理 * [Dec. 16, 2024]: 🚀 MindSpeed-MM支持Qihoo-T2X模型 * [Dec. 05, 2024]: 🎉 MindSpeed-MM理解类模型支持Lora微调 * [Dec. 03, 2024]: 🚀 MindSpeed-MM支持SD3.5模型 @@ -39,11 +40,11 @@ MindSpeed-MM是面向大规模分布式训练的昇腾多模态大模型套件 | CogVideoX-T2V | ✔ | | | | CP (Ulysses) | ✔ | ✔ | | | CogVideoX-I2V | ✔ | | | | CP (Ulysses) | ✔ | ✔ | | | Opensora1.2 | | | | | DSP | ✔ | ✔ | | -| OpensoraPlan1.3-T2V | ✔ | ✔ | | | CP (Ulysses) | ✔ | ✔ | | -| OpensoraPlan1.3-I2V | ✔ | ✔ | | | CP (Ulysses) | ✔ | ✔ | | -| InternVL2-2B | | | ✔ | ✔ | | ✔ | ✔ | ✔ | -| InternVL2-8B | | | ✔ | ✔ | | ✔ | ✔ | ✔ | -| InternVL2-76B | | | ✔ | ✔ | | ✔ | ✔ | ✔ | +| OpensoraPlan1.3-T2V | ✔ | ✔ | | ✔ | CP (Ulysses) | ✔ | ✔ | | +| OpensoraPlan1.3-I2V | ✔ | ✔ | | ✔ | CP (Ulysses) | ✔ | ✔ | | +| InternVL2-2B | | | ✔ | ✔ | | ✔ | ✔ | | +| InternVL2-8B | | | ✔ | ✔ | | ✔ | ✔ | | +| InternVL2-76B | | | ✔ | ✔ | | ✔ | ✔ | | | Qwen2VL-2B | | | | ✔ | | ✔ | ✔ | ✔ | | Qwen2VL-7B | | | | ✔ | | ✔ | ✔ | ✔ | | Qwen2VL-72B | | | | ✔ | | ✔ | ✔ | ✔ | @@ -60,6 +61,7 @@ MindSpeed-MM是面向大规模分布式训练的昇腾多模态大模型套件 * Distributed Optimizer: [Zero Redundancy Optimizer](https://arxiv.org/abs/1910.02054) (ZeRO) * Recomputation: Reducing Activation [Recomputation](https://arxiv.org/abs/2205.05198) * LoRA: [Low-Rank Adaptation](https://arxiv.org/abs/2106.09685) + --- ## 研发中的特性与模型 @@ -109,6 +111,8 @@ MindSpeed-MM已发布版本维护策略: Samples per Second 为 (SPS); Frames per Second 为 (FPS); Tokens per Second 为 (TPS) +`亲和场景`为调整少量结构或参数,使得模型更加亲和昇腾,性能更优 + @@ -127,7 +131,7 @@ Samples per Second 为 (SPS); Frames per Second 为 (FPS); Tokens per Second 为 - + @@ -178,23 +182,41 @@ Samples per Second 为 (SPS); Frames per Second 为 (FPS); Tokens per Second 为 - - + + - - + + - - + - - + + + + + + + + + + + + + + + + + + + + + @@ -306,7 +328,7 @@ Samples per Second 为 (SPS); Frames per Second 为 (FPS); Tokens per Second 为 - + @@ -548,7 +570,8 @@ MindSpeed-MM 由华为公司的下列部门联合贡献 : * 华为云 MindSpeed-MM 生态贡献方: -* 奇虎360 + +* 360 AI Research 感谢来自社区的每一个PR,欢迎贡献 MindSpeed-MM diff --git a/examples/cogvideox/README.md b/examples/cogvideox/README.md index 4f7b7ad1..8880879c 100644 --- a/examples/cogvideox/README.md +++ b/examples/cogvideox/README.md @@ -82,13 +82,13 @@ #### 仓库拉取 ```shell - git clone https://gitee.com/ascend/MindSpeed-MM.git - git clone https://github.com/NVIDIA/Megatron-LM.git - cd Megatron-LM - git checkout core_r0.6.0 - cp -r megatron ../MindSpeed-MM/ - cd .. - cd MindSpeed-MM +git clone https://gitee.com/ascend/MindSpeed-MM.git +git clone https://github.com/NVIDIA/Megatron-LM.git +cd Megatron-LM +git checkout core_r0.6.0 +cp -r megatron ../MindSpeed-MM/ +cd .. 
+cd MindSpeed-MM ``` #### 环境搭建 @@ -97,31 +97,31 @@ torch npu 与 CANN包参考链接:[安装包参考链接](https://support.huawei.com/enterprise/zh/ascend-computing/cann-pid-251168373/software) ```bash - # python3.10 - conda create -n test python=3.10 - conda activate test - - # 安装 torch 和 torch_npu,注意要选择对应python版本、x86或arm的torch、torch_npu及apex包 - pip install torch-2.1.0-cp310-cp310m-manylinux2014_aarch64.whl - pip install torch_npu-2.1.0*-cp310-cp310m-linux_aarch64.whl - - # apex for Ascend 参考 https://gitee.com/ascend/apex - pip install apex-0.1_ascend*-cp310-cp310m-linux_aarch64.whl - - # 将shell脚本中的环境变量路径修改为真实路径,下面为参考路径 - source /usr/local/Ascend/ascend-toolkit/set_env.sh - - # 安装加速库 - git clone https://gitee.com/ascend/MindSpeed.git - cd MindSpeed - # checkout commit from MindSpeed core_r0.6.0 - git checkout 5dc1e83b - pip install -r requirements.txt - pip3 install -e . - cd .. - - # 安装其余依赖库 - pip install -e . +# python3.10 +conda create -n test python=3.10 +conda activate test + +# 安装 torch 和 torch_npu,注意要选择对应python版本、x86或arm的torch、torch_npu及apex包 +pip install torch-2.1.0-cp310-cp310m-manylinux2014_aarch64.whl +pip install torch_npu-2.1.0*-cp310-cp310m-linux_aarch64.whl + +# apex for Ascend 参考 https://gitee.com/ascend/apex +pip install apex-0.1_ascend*-cp310-cp310m-linux_aarch64.whl + +# 将shell脚本中的环境变量路径修改为真实路径,下面为参考路径 +source /usr/local/Ascend/ascend-toolkit/set_env.sh + +# 安装加速库 +git clone https://gitee.com/ascend/MindSpeed.git +cd MindSpeed +# checkout commit from MindSpeed core_r0.6.0 +git checkout 5dc1e83b +pip install -r requirements.txt +pip install -e . +cd .. + +# 安装其余依赖库 +pip install -e . ``` #### Decord搭建 @@ -220,27 +220,31 @@ data.jsonl文件内容如下示例: #### 配置参数 -需根据实际任务情况修改`model_cogvideox.json`、`model_cogvideox_i2v.json`和`data.json`中的权重和数据集路径,包括`from_pretrained`、`data_path`、`data_folder`字段。 +需根据实际任务情况修改`model_cogvideox_t2v_t2v.json`、`model_cogvideox_i2v.json`和`data.json`中的权重和数据集路径,包括`from_pretrained`、`data_path`、`data_folder`字段。 -在sh启动脚本中可以修改运行卡数: +`model_cogvideox_t2v.json`/`model_cogvideox_i2v.json`文件中的`head_dim`字段原模型默认配置为64。此字段调整为128会更加亲和昇腾。 + +`model_cogvideox_t2v.json`/`model_cogvideox_i2v.json`文件中的`head_dim`字段原模型默认配置为64。此字段调整为128会更加亲和昇腾。 + +在sh启动脚本中可以修改运行卡数(NNODES为节点数,GPUS_PER_NODE为每个节点的卡数,相乘即为总运行卡数): ```shell - GPUS_PER_NODE=8 - MASTER_ADDR=locahost - MASTER_PORT=29501 - NNODES=1 - NODE_RANK=0 - WORLD_SIZE=$(($GPUS_PER_NODE * $NNODES)) +GPUS_PER_NODE=8 +MASTER_ADDR=locahost +MASTER_PORT=29501 +NNODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($GPUS_PER_NODE * $NNODES)) ``` #### 启动预训练 t2v任务启动预训练 ```shell - bash examples/cogvideox/pretrain_cogvideox_t2v.sh +bash examples/cogvideox/t2v_1.0/pretrain_cogvideox_t2v.sh ``` i2v任务启动预训练 ```shell - bash examples/cogvideox/pretrain_cogvideox_i2v.sh +bash examples/cogvideox/i2v_1.0/pretrain_cogvideox_i2v.sh ``` --- @@ -255,12 +259,20 @@ i2v任务启动预训练 #### 配置参数 -检查如下配置是否完成 +检查对应配置是否完成 + +| t2v配置文件 | 修改字段 | 修改说明 | +|---------------------------------------------------|:--------------------------------:|:-----------------------------------:| +| examples/cogvideox/t2v_*/inference_model_t2v.json | from_pretrained | 修改为下载的权重所对应路径 | +| examples/cogvideox/samples_prompts.txt | 文件内容 | 可自定义自己的prompt,一行为一个prompt | + + +| i2v配置文件 | 修改字段 | 修改说明 | +|---------------------------------------------------|:--------------------------------:|:----------------:| +| examples/cogvideox/i2v_*/inference_model_i2v.json | from_pretrained | 修改为下载的权重所对应路径 | +| examples/cogvideox/samples_i2v_images.txt | 文件内容 | 图片路径 | +| examples/cogvideox/samples_i2v_prompts.txt | 文件内容 | 自定义prompt | -| 配置文件 | 修改字段 | 修改说明 
| -|------|:--------------------------------:|:-----------------------------------:| -| examples/cogvideox/inference_model.json | from_pretrained | 修改为下载的权重所对应路径 | -| examples/cogvideox/samples_prompts.txt | 文件内容 | 可自定义自己的prompt,一行为一个prompt | 如果使用训练后保存的权重进行推理,需要使用脚本进行转换,权重转换source_path参数请配置训练时的保存路径 ```bash @@ -269,9 +281,15 @@ python examples/cogvideox/cogvideox_convert_to_mm_ckpt.py --source_path #### 启动推理 +t2v 启动推理脚本 + +```bash +bash examples/cogvideox/t2v_1.0/inference_cogvideox_t2v.sh +``` +i2v 启动推理脚本 ```bash -bash examples/cogvideox/inference_cogvideox.sh +bash examples/cogvideox/i2v_1.0/inference_cogvideox_i2v.sh ``` --- diff --git a/examples/cogvideox/cogvideox_convert_to_mm_ckpt.py b/examples/cogvideox/cogvideox_convert_to_mm_ckpt.py index 82dd673f..28dad428 100644 --- a/examples/cogvideox/cogvideox_convert_to_mm_ckpt.py +++ b/examples/cogvideox/cogvideox_convert_to_mm_ckpt.py @@ -28,6 +28,16 @@ CONVERT_MAPPING = { "mixins.final_layer.adaLN_modulation.1.bias": "adaLN_modulation.1.bias" } +first_pipeline_stage_keys = ["time_embed.time_embed.0.bias", "time_embed.time_embed.0.weight", + "time_embed.time_embed.2.bias", "time_embed.time_embed.2.weight", + "patch_embed.proj.bias", "patch_embed.proj.weight", + "caption_projection.bias", "caption_projection.weight"] + +last_pipeline_stage_keys = ["norm_final.weight", "norm_final.bias", + "norm_out.weight", "norm_out.bias", + "proj_out.weight", "proj_out.bias", + "adaLN_modulation.1.weight", "adaLN_modulation.1.bias"] + def update_state_dict_inplace( state_dict: Dict[str, Any], @@ -108,12 +118,60 @@ def split_by_tp(state_dict: Dict[str, Any], tp_size: int = 2, num_layers: int = wv = torch.chunk(wv, tp_size, dim=0)[tp_rank] weight = torch.cat([wq, wk, wv], dim=0) new_state_dict[split_name] = weight + # adaLN modulation + col_split_names = [ + "adaLN_modulation.1.weight", + "adaLN_modulation.1.bias", + ] + for split_name in col_split_names: + new_state_dict[split_name] = torch.chunk(state_dict[split_name], tp_size, dim=0)[tp_rank] + # adaLN modulation + col_split_names = [ + "adaLN_modulation.1.weight", + "adaLN_modulation.1.bias", + ] + for split_name in col_split_names: + new_state_dict[split_name] = torch.chunk(state_dict[split_name], tp_size, dim=0)[tp_rank] new_state_dicts.append(new_state_dict) return new_state_dicts -def save_by_tp(state_dicts: List[Dict], save_dir: str, latest_checkpointed_iteration='release'): +def split_by_pp(state_dicts: List[Dict[str, Any]], pp_sizes: List, remove_pos_emb: bool = False) -> Dict[tuple, Dict]: + if len(pp_sizes) == 1: + new_state_dicts = {} + for tp_rank, state_dict in enumerate(state_dicts): + new_state_dicts[(0, tp_rank)] = state_dict + return new_state_dicts + + new_state_dicts = {} + for pp_rank, num_layers in enumerate(pp_sizes): + start_layer_index, end_layer_index = sum(pp_sizes[:pp_rank]), sum(pp_sizes[:pp_rank + 1]) + is_pipeline_first_stage = pp_rank == 0 + is_pipeline_last_stage = pp_rank == len(pp_sizes) - 1 + + for tp_rank, state_dict in enumerate(state_dicts): + pp_tp_param = dict() + + for i in range(start_layer_index, end_layer_index): + layer_names = get_layer_mapping(i).values() + pp_layer_names = get_layer_mapping(i - start_layer_index).values() + + for pp_layer_name, layer_name in zip(pp_layer_names, layer_names): + pp_tp_param[pp_layer_name] = state_dict[layer_name] + + if is_pipeline_first_stage: + for layer_name in first_pipeline_stage_keys: + pp_tp_param[layer_name] = state_dict[layer_name] + if is_pipeline_last_stage: + for layer_name in last_pipeline_stage_keys: + 
pp_tp_param[layer_name] = state_dict[layer_name] + new_state_dicts[(pp_rank, tp_rank)] = pp_tp_param + + return new_state_dicts + + +def save_by_tp_pp(state_dicts: Dict[tuple, Dict], save_dir: str, enable_pp: bool, latest_checkpointed_iteration='release'): if not os.path.exists(save_dir): os.makedirs(save_dir) @@ -126,36 +184,24 @@ def save_by_tp(state_dicts: List[Dict], save_dir: str, latest_checkpointed_itera else: directory = 'iter_{:07d}'.format(latest_checkpointed_iteration) - for tp_rank, state_dict in enumerate(state_dicts): - os.makedirs(os.path.join(save_dir, directory, f"mp_rank_{tp_rank:02d}")) - save_path = os.path.join(save_dir, directory, f"mp_rank_{tp_rank:02d}", "model_optim_rng.pt") + for (pp_rank, tp_rank), state_dict in state_dicts.items(): + if enable_pp: + os.makedirs(os.path.join(save_dir, directory, f"mp_rank_{tp_rank:02d}_{pp_rank:03d}")) + save_path = os.path.join(save_dir, directory, f"mp_rank_{tp_rank:02d}_{pp_rank:03d}", "model_optim_rng.pt") + else: + os.makedirs(os.path.join(save_dir, directory, f"mp_rank_{tp_rank:02d}")) + save_path = os.path.join(save_dir, directory, f"mp_rank_{tp_rank:02d}", "model_optim_rng.pt") save_dict = {} save_dict['model'] = state_dict torch.save(save_dict, save_path) -def merge_by_tp(train_save_dir: str, save_path: str, num_layers: int, tp_size: int): - flags = os.O_RDONLY - mode = stat.S_IRUSR - with os.fdopen(os.open(os.path.join(train_save_dir, "latest_checkpointed_iteration.txt"), flags, mode)) as f: - latest_checkpointed_iteration = f.readline() - - if latest_checkpointed_iteration == 'release': - directory = 'release' - else: - directory = 'iter_{:07d}'.format(latest_checkpointed_iteration) - - _state_dicts = [] - for tp_rank in range(tp_size): - state_dict_path = os.path.join(train_save_dir, directory, f"mp_rank_{tp_rank:02d}", "model_optim_rng.pt") - _state_dicts.append(torch.load(state_dict_path)['model']) - +def merge_by_tp(state_dicts: Dict[str, Any], num_layers: int, tp_size: int, is_last_pp_stage: bool): if tp_size == 1: - torch.save(_state_dicts[0], save_path) - return + return state_dicts[0] - merged_state_dict = copy.deepcopy(_state_dicts[0]) - for index in num_layers: + merged_state_dict = copy.deepcopy(state_dicts[0]) + for index in range(range(num_layers)): # ColumnParallelLinear suffixed_0 = [ f"videodit_blocks.{index}.ff.net.0.proj.weight", @@ -174,22 +220,84 @@ def merge_by_tp(train_save_dir: str, save_path: str, num_layers: int, tp_size: i f"videodit_blocks.{index}.self_atten.proj_qkv.bias" ] for name in suffixed_0: - parameters = [_state_dicts[tp_rank][name] for tp_rank in range(tp_size)] + parameters = [state_dicts[tp_rank][name] for tp_rank in range(tp_size)] parameters = torch.cat(parameters, dim=0) merged_state_dict[name] = parameters for name in suffixed_1: - parameters = [_state_dicts[tp_rank][name] for tp_rank in range(tp_size)] + parameters = [state_dicts[tp_rank][name] for tp_rank in range(tp_size)] parameters = torch.cat(parameters, dim=1) merged_state_dict[name] = parameters for name in suffixed_special: - wq = [torch.chunk(_state_dicts[tp_rank][name], 3, dim=0)[0] for tp_rank in range(tp_size)] - wk = [torch.chunk(_state_dicts[tp_rank][name], 3, dim=0)[1] for tp_rank in range(tp_size)] - wv = [torch.chunk(_state_dicts[tp_rank][name], 3, dim=0)[2] for tp_rank in range(tp_size)] + wq = [torch.chunk(state_dicts[tp_rank][name], 3, dim=0)[0] for tp_rank in range(tp_size)] + wk = [torch.chunk(state_dicts[tp_rank][name], 3, dim=0)[1] for tp_rank in range(tp_size)] + wv = 
[torch.chunk(state_dicts[tp_rank][name], 3, dim=0)[2] for tp_rank in range(tp_size)] wq = torch.cat(wq, dim=0) wk = torch.cat(wk, dim=0) wv = torch.cat(wv, dim=0) wqkv = torch.cat([wq, wk, wv], dim=0) merged_state_dict[name] = wqkv + + if is_last_pp_stage: + # adaLN modulation + col_split_names = [ + "adaLN_modulation.1.weight", + "adaLN_modulation.1.bias", + ] + for split_name in col_split_names: + merged_state_dict[split_name] = torch.cat([state_dicts[tp_rank][split_name] for tp_rank in range(tp_size)]) + return merged_state_dict + + +def merge_by_pp(state_dicts: Dict[str, Any], pp_sizes: list): + if len(pp_sizes) == 1: + return state_dicts[0] + + merged_state_dict = {} + for key in first_pipeline_stage_keys: + merged_state_dict[key] = state_dicts[0][key] + for i, pp_size in enumerate(pp_sizes): + for layer_index in range(pp_size): + pp_layer_names = get_layer_mapping(layer_index).values() + layer_names = get_layer_mapping(layer_index + sum(pp_sizes[:i])).values() + for pp_layer_name, layer_name in zip(pp_layer_names, layer_names): + merged_state_dict[layer_name] = state_dicts[i][pp_layer_name] + for key in last_pipeline_stage_keys: + merged_state_dict[key] = state_dicts[-1][key] + return merged_state_dict + + +def merge_by_tp_pp(train_save_dir: str, save_path: str, tp_size: int, pp_sizes: list): + flags = os.O_RDONLY + mode = stat.S_IRUSR + with os.fdopen(os.open(os.path.join(train_save_dir, "latest_checkpointed_iteration.txt"), flags, mode)) as f: + latest_checkpointed_iteration = f.readline() + + if latest_checkpointed_iteration == 'release': + directory = 'release' + else: + directory = 'iter_{:07d}'.format(latest_checkpointed_iteration) + + _pp_state_dicts = [] + for pp_rank, pp_size in enumerate(pp_sizes): + _tp_state_dicts = [] + for tp_rank in range(tp_size): + if len(pp_sizes) > 1: + state_dict_path = os.path.join(train_save_dir, directory, f"mp_rank_{tp_rank:02d}_{pp_rank:03d}", "model_optim_rng.pt") + else: + state_dict_path = os.path.join(train_save_dir, directory, f"mp_rank_{tp_rank:02d}", "model_optim_rng.pt") + _tp_state_dicts.append(torch.load(state_dict_path)['model']) + is_last_pp_stage = pp_rank == len(pp_sizes) - 1 + merged_tp_state_dict = merge_by_tp(_tp_state_dicts, num_layers=pp_sizes[pp_rank], tp_size=tp_size, is_last_pp_stage=is_last_pp_stage) + _pp_state_dicts.append(merged_tp_state_dict) + merged_state_dict = merge_by_pp(_pp_state_dicts, pp_sizes=pp_sizes) + + # adaLN modulation + col_split_names = [ + "adaLN_modulation.1.weight", + "adaLN_modulation.1.bias", + ] + for split_name in col_split_names: + merged_state_dict[split_name] = torch.cat([_state_dicts[tp_rank][split_name] for tp_rank in range(tp_size)]) torch.save(merged_state_dict, save_path) return @@ -197,11 +305,12 @@ def merge_by_tp(train_save_dir: str, save_path: str, num_layers: int, tp_size: i def get_args(): parser = argparse.ArgumentParser() parser.add_argument("--tp_size", type=int, default=2, help="Tensor model parallel world size") + parser.add_argument("--pp_sizes", type=int, nargs='+', help="Pipeline parallel model split sizes") parser.add_argument("--num_layers", type=int, default=42, help="Layer numbers of video_dit") parser.add_argument("--source_path", type=str, default="./transformer/1/mp_rank_00_model_states.pt", help="Source path of checkpoint") parser.add_argument("--target_path", type=str, default="./ckpt/sat_dit/", help="Save path of MM checkpoint") parser.add_argument("--task", type=str, default="t2v", choices=["t2v", "i2v"], help="Task type") - 
parser.add_argument("--remove_pos_emb", type=bool, default=False, help="remove_pos_emb") + parser.add_argument("--remove_pos_emb", action="store_true", help="remove_pos_emb") parser.add_argument("--mode", type=str, default="split", choices=["split", "merge"], help="Split mode is used to split the pretrained weights according to tp_size before training, \ and Merge mode is used to merge weights based on tp_size after training is completed") @@ -228,12 +337,25 @@ if __name__ == "__main__": remove_layers(source_state_dict, remove_keys) if args.remove_pos_emb: - remove_layers(source_state_dict, ["pos_embed.freq_cos", "pos_embed.freq_sin"]) + remove_layers(source_state_dict, ["pos_embed.freqs_cos", "pos_embed.freqs_sin"]) if args.task == "i2v": remove_layers(source_state_dict, ["pos_embed.pos_embedding"]) + else: + first_pipeline_stage_keys.append("pos_embed.freqs_cos") + first_pipeline_stage_keys.append("pos_embed.freqs_sin") + if args.task == "i2v": + first_pipeline_stage_keys.append("pos_embed.pos_embedding") + + if sum(args.pp_sizes) != args.num_layers: + raise ValueError(f"The sum of args.pp_sizes {args.pp_sizes} must be equal to args.num_layers {args.num_layers}") state_dicts = split_by_tp(source_state_dict, tp_size=args.tp_size, num_layers=args.num_layers) - save_by_tp(state_dicts, args.target_path) + state_dicts = split_by_pp(state_dicts, pp_sizes=args.pp_sizes, remove_pos_emb=args.remove_pos_emb) + save_by_tp_pp(state_dicts, args.target_path, enable_pp=len(args.pp_sizes) > 1) elif args.mode == 'merge': - merge_by_tp(args.source_path, args.target_path, args.num_layers, args.tp_size) \ No newline at end of file + first_pipeline_stage_keys.append("pos_embed.freqs_cos") + first_pipeline_stage_keys.append("pos_embed.freqs_sin") + if args.task == "i2v": + first_pipeline_stage_keys.append("pos_embed.pos_embedding") + merge_by_tp_pp(args.source_path, args.target_path, tp_size=args.tp_size, pp_sizes=args.pp_sizes) \ No newline at end of file diff --git a/examples/cogvideox/i2v_1.0/data.json b/examples/cogvideox/i2v_1.0/data.json new file mode 100644 index 00000000..6ecd74aa --- /dev/null +++ b/examples/cogvideox/i2v_1.0/data.json @@ -0,0 +1,45 @@ +{ + "dataset_param": { + "dataset_type": "t2v", + "use_feature_data": false, + "basic_parameters": { + "data_path": "/data_path/data.jsonl", + "data_folder": "/data_path", + "data_storage_mode": "standard" + }, + "preprocess_parameters": { + "data_process_type": "CogvideoX", + "video_reader_type": "decoder", + "fps": 8, + "skip_frame_num": 3, + "num_frames": 25, + "max_height": 480, + "max_width": 720, + "dataloader_num_workers": 8, + "train_pipeline": { + "video": [], + "image": [] + } + }, + "use_text_processer": true, + "enable_text_preprocessing": false, + "model_max_length": 226, + "tokenizer_config": { + "hub_backend": "hf", + "autotokenizer_name": "T5Tokenizer", + "from_pretrained": "5b-cogvideo/tokenizer" + } + }, + "dataloader_param": { + "dataloader_mode": "sampler", + "sampler_type": "SequentialSampler", + "batch_size": 1, + "num_workers": 8, + "shuffle": true, + "drop_last": true, + "pin_memory": true, + "group_frame": false, + "group_resolution": false, + "collate_param": {} + } +} \ No newline at end of file diff --git a/examples/cogvideox/inference_cogvideox_i2v.sh b/examples/cogvideox/i2v_1.0/inference_cogvideox_i2v.sh similarity index 95% rename from examples/cogvideox/inference_cogvideox_i2v.sh rename to examples/cogvideox/i2v_1.0/inference_cogvideox_i2v.sh index 72b98cc6..9f35528e 100644 --- 
a/examples/cogvideox/inference_cogvideox_i2v.sh +++ b/examples/cogvideox/i2v_1.0/inference_cogvideox_i2v.sh @@ -14,7 +14,7 @@ CP=1 MBS=1 GBS=$(($WORLD_SIZE*$MBS/$CP/$TP)) -MM_MODEL="examples/cogvideox/inference_model_i2v.json" +MM_MODEL="examples/cogvideox/i2v_1.0/inference_model_i2v.json" LOAD_PATH="your_converted_dit_ckpt_dir" DISTRIBUTED_ARGS=" diff --git a/examples/cogvideox/inference_model_i2v.json b/examples/cogvideox/i2v_1.0/inference_model_i2v.json similarity index 99% rename from examples/cogvideox/inference_model_i2v.json rename to examples/cogvideox/i2v_1.0/inference_model_i2v.json index 069e10f1..d7370c59 100644 --- a/examples/cogvideox/inference_model_i2v.json +++ b/examples/cogvideox/i2v_1.0/inference_model_i2v.json @@ -85,6 +85,7 @@ "cross_attention_dim": null, "attention_bias": true, "input_size": [13, 60, 90], + "patch_type": "2D", "patch_size": [1, 2, 2], "activation_fn": "gelu-approximate", "num_embeds_ada_norm": 1000, diff --git a/examples/cogvideox/i2v_1.0/model_cogvideox_i2v.json b/examples/cogvideox/i2v_1.0/model_cogvideox_i2v.json new file mode 100644 index 00000000..e69de29b diff --git a/examples/cogvideox/pretrain_cogvideox_i2v.sh b/examples/cogvideox/i2v_1.0/pretrain_cogvideox_i2v.sh similarity index 88% rename from examples/cogvideox/pretrain_cogvideox_i2v.sh rename to examples/cogvideox/i2v_1.0/pretrain_cogvideox_i2v.sh index 6ad9b68e..7ecc7157 100644 --- a/examples/cogvideox/pretrain_cogvideox_i2v.sh +++ b/examples/cogvideox/i2v_1.0/pretrain_cogvideox_i2v.sh @@ -7,7 +7,7 @@ export COMBINED_ENABLE=1 export CPU_AFFINITY_CONF=1 export HCCL_CONNECT_TIMEOUT=1200 export CUDA_DEVICE_MAX_CONNECTIONS=1 - +export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True GPUS_PER_NODE=8 MASTER_ADDR=localhost MASTER_PORT=29505 @@ -21,8 +21,8 @@ CP=1 MBS=1 GBS=$(($WORLD_SIZE*$MBS/$CP)) -MM_DATA="./examples/cogvideox/data.json" -MM_MODEL="./examples/cogvideox/model_cogvideox_i2v.json" +MM_DATA="./examples/cogvideox/i2v_1.0/data.json" +MM_MODEL="./examples/cogvideox/i2v_1.0/model_cogvideox_i2v.json" MM_TOOL="./mindspeed_mm/tools/tools.json" DISTRIBUTED_ARGS=" @@ -100,5 +100,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_sora.py \ chmod 440 logs/train_${logfile}.log STEP_TIME=`grep "elapsed time per iteration" logs/train_${logfile}.log | awk -F ':' '{print$5}' | awk -F '|' '{print$1}' | head -n 200 | tail -n 100 | awk '{sum+=$1} END {if (NR != 0) printf("%.1f",sum/NR)}'` -FPS=`awk 'BEGIN{printf "%.3f\n", '${GBS}'*1000/'${STEP_TIME}'}'` -echo "Elapsed Time Per iteration: $STEP_TIME, Average FPS: $FPS" \ No newline at end of file +SPS=`awk 'BEGIN{printf "%.3f\n", '${GBS}'*1000/'${STEP_TIME}'}'` +echo "Elapsed Time Per iteration: $STEP_TIME, Average Samples per Second: $SPS" \ No newline at end of file diff --git a/examples/cogvideox/i2v_1.5/data.json b/examples/cogvideox/i2v_1.5/data.json new file mode 100644 index 00000000..b15c5316 --- /dev/null +++ b/examples/cogvideox/i2v_1.5/data.json @@ -0,0 +1,45 @@ +{ + "dataset_param": { + "dataset_type": "t2v", + "use_feature_data": false, + "basic_parameters": { + "data_path": "/data_path/data.jsonl", + "data_folder": "/data_path", + "data_storage_mode": "standard" + }, + "preprocess_parameters": { + "data_process_type": "CogvideoX", + "video_reader_type": "decoder", + "fps": 8, + "skip_frame_num": 3, + "num_frames": 21, + "max_height": 768, + "max_width": 1360, + "dataloader_num_workers": 8, + "train_pipeline": { + "video": [], + "image": [] + } + }, + "use_text_processer": true, + "enable_text_preprocessing": false, + "model_max_length": 224, + 
"tokenizer_config": { + "hub_backend": "hf", + "autotokenizer_name": "T5Tokenizer", + "from_pretrained": "5b-cogvideo/tokenizer" + } + }, + "dataloader_param": { + "dataloader_mode": "sampler", + "sampler_type": "SequentialSampler", + "batch_size": 1, + "num_workers": 8, + "shuffle": true, + "drop_last": true, + "pin_memory": true, + "group_frame": false, + "group_resolution": false, + "collate_param": {} + } +} \ No newline at end of file diff --git a/examples/cogvideox/model_cogvideox_i2v.json b/examples/cogvideox/i2v_1.5/model_cogvideox_i2v_1.5.json similarity index 91% rename from examples/cogvideox/model_cogvideox_i2v.json rename to examples/cogvideox/i2v_1.5/model_cogvideox_i2v_1.5.json index 178672f8..df278af3 100644 --- a/examples/cogvideox/model_cogvideox_i2v.json +++ b/examples/cogvideox/i2v_1.5/model_cogvideox_i2v_1.5.json @@ -1,128 +1,130 @@ -{ - "frames": 25, - "resolution": [480, 720], - "allow_tf32": true, - "allow_internal_format":false, - "load_video_features": false, - "load_text_features": false, - "task": "i2v", - "predictor": { - "model_id": "satdit", - "from_pretrained": null, - "dtype": "bf16", - "num_layers": 42, - "num_heads": 48, - "head_dim": 64, - "in_channels": 32, - "out_channels": 16, - "dropout": 0.0, - "cross_attention_dim": null, - "attention_bias": true, - "input_size": [13, 60, 90], - "patch_size": [1, 2, 2], - "activation_fn": "gelu-approximate", - "num_embeds_ada_norm": 1000, - "norm_type": "qk_ln", - "norm_elementwise_affine": true, - "norm_eps": 1e-5, - "caption_channels": null, - "time_embed_dim": 512, - "text_length": 226, - "text_hidden_size": 4096, - "concat_text_embed": true, - "interpolation_scale": [1.0, 1.0, 1.0], - "learnable_pos_embed": true, - "use_rope": true - }, - "diffusion": { - "model_id": "cogvideo_diffusion", - "sigma_sampler_config": { - "uniform_sampling": true, - "num_idx": 1000, - "discretization_config":{ - "shift_scale": 1.0 - } - }, - "denoiser_config": { - "num_idx": 1000, - "quantize_c_noise": false, - "discretization_config":{ - "shift_scale": 1.0 - } - } - }, - "text_encoder": { - "model_id": "T5", - "hub_backend": "hf", - "from_pretrained": "5b-cogvideo", - "dtype": "bf16", - "load_in_8bit": false, - "low_cpu_mem_usage": true, - "ucg_rate": 0.1 - }, - "ae": { - "model_id": "contextparallelcasualvae", - "from_pretrained": "3d-vae.pt", - "cp_size": 1, - "dtype": "bf16", - "z_channels": 16, - "conv_padding": 0, - "num_res_blocks": 3, - "hidden_size_mult": [1,2,2,4], - "encoder_attention": "", - "encoder_nonlinearity": "swish", - "encoder_conv_in": "ContextParallelCausalConv3d", - "encoder_conv_out": "ContextParallelCausalConv3d", - "encoder_mid_resnet": "ContextParallelResnetBlock3D", - "encoder_resnet_blocks": [ - "ContextParallelResnetBlock3D", - "ContextParallelResnetBlock3D", - "ContextParallelResnetBlock3D", - "ContextParallelResnetBlock3D" - ], - "encoder_spatial_downsample": [ - "DownSample3D", - "DownSample3D", - "DownSample3D", - "" - ], - "encoder_temporal_downsample": [ - "", - "", - "", - "" - ], - "decoder_attention": "", - "decoder_nonlinearity": "swish", - "decoder_conv_in": "ContextParallelCausalConv3d", - "decoder_conv_out": "ContextParallelCausalConv3d", - "decoder_mid_resnet": "ContextParallelResnetBlock3D", - "decoder_resnet_blocks": [ - "ContextParallelResnetBlock3D", - "ContextParallelResnetBlock3D", - "ContextParallelResnetBlock3D", - "ContextParallelResnetBlock3D" - ], - "decoder_spatial_upsample": [ - "", - "Upsample3D", - "Upsample3D", - "Upsample3D" - ], - "decoder_temporal_upsample": [ - "", - 
"", - "", - "" - ], - "encoder_gather_norm": true, - "decoder_gather_norm": true, - "use_quant_layer": false, - "i2v_processor": { - "processor_id": "cogvideox_i2v_processor", - "noised_image_all_concat": false, - "noised_image_dropout": 0.05, - "noised_image_input": true - } - } +{ + "frames": 21, + "resolution": [768, 1360], + "allow_tf32": true, + "allow_internal_format":false, + "load_video_features": false, + "load_text_features": false, + "task": "i2v", + "predictor": { + "model_id": "satdit", + "from_pretrained": null, + "dtype": "bf16", + "num_layers": 2, + "num_heads": 48, + "head_dim": 64, + "in_channels": 32, + "out_channels": 16, + "dropout": 0.0, + "cross_attention_dim": null, + "attention_bias": true, + "input_size": [6, 96, 170], + "patch_type": "3D", + "patch_size": [2, 2, 2], + "activation_fn": "gelu-approximate", + "num_embeds_ada_norm": 1000, + "norm_type": "qk_ln", + "norm_elementwise_affine": true, + "norm_eps": 1e-5, + "caption_channels": null, + "time_embed_dim": 512, + "text_length": 224, + "text_hidden_size": 4096, + "concat_text_embed": true, + "interpolation_scale": [1.0, 1.0, 1.0], + "learnable_pos_embed": true, + "use_rope": true + }, + "diffusion": { + "model_id": "cogvideo_diffusion", + "sigma_sampler_config": { + "uniform_sampling": true, + "num_idx": 1000, + "discretization_config":{ + "shift_scale": 1.0 + } + }, + "denoiser_config": { + "num_idx": 1000, + "quantize_c_noise": false, + "discretization_config":{ + "shift_scale": 1.0 + } + } + }, + "text_encoder": { + "model_id": "T5", + "hub_backend": "hf", + "from_pretrained": "5b-cogvideo", + "dtype": "bf16", + "load_in_8bit": false, + "low_cpu_mem_usage": true, + "ucg_rate": 0.1, + "use_attention_mask": false + }, + "ae": { + "model_id": "contextparallelcasualvae", + "from_pretrained": "3d-vae.pt", + "cp_size": 1, + "dtype": "bf16", + "z_channels": 16, + "conv_padding": 0, + "num_res_blocks": 3, + "hidden_size_mult": [1,2,2,4], + "encoder_attention": "", + "encoder_nonlinearity": "swish", + "encoder_conv_in": "ContextParallelCausalConv3d", + "encoder_conv_out": "ContextParallelCausalConv3d", + "encoder_mid_resnet": "ContextParallelResnetBlock3D", + "encoder_resnet_blocks": [ + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D" + ], + "encoder_spatial_downsample": [ + "DownSample3D", + "DownSample3D", + "DownSample3D", + "" + ], + "encoder_temporal_downsample": [ + "", + "", + "", + "" + ], + "decoder_attention": "", + "decoder_nonlinearity": "swish", + "decoder_conv_in": "ContextParallelCausalConv3d", + "decoder_conv_out": "ContextParallelCausalConv3d", + "decoder_mid_resnet": "ContextParallelResnetBlock3D", + "decoder_resnet_blocks": [ + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D" + ], + "decoder_spatial_upsample": [ + "", + "Upsample3D", + "Upsample3D", + "Upsample3D" + ], + "decoder_temporal_upsample": [ + "", + "", + "", + "" + ], + "encoder_gather_norm": true, + "decoder_gather_norm": true, + "use_quant_layer": false, + "i2v_processor": { + "processor_id": "cogvideox_i2v_processor", + "noised_image_all_concat": false, + "noised_image_dropout": 0.05, + "noised_image_input": true + } + } } \ No newline at end of file diff --git a/examples/cogvideox/i2v_1.5/pretrain_cogvideox_i2v.sh b/examples/cogvideox/i2v_1.5/pretrain_cogvideox_i2v.sh new file mode 100644 index 00000000..820299fa --- /dev/null +++ 
b/examples/cogvideox/i2v_1.5/pretrain_cogvideox_i2v.sh @@ -0,0 +1,105 @@ +#!/bin/bash +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export ASCEND_SLOG_PRINT_TO_STDOUT=0 +export ASCEND_GLOBAL_LOG_LEVEL=3 +export TASK_QUEUE_ENABLE=1 +export COMBINED_ENABLE=1 +export CPU_AFFINITY_CONF=1 +export HCCL_CONNECT_TIMEOUT=1200 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True + +GPUS_PER_NODE=8 +MASTER_ADDR=localhost +MASTER_PORT=29505 +NNODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +TP=1 +PP=1 +CP=1 +MBS=1 +GBS=$(($WORLD_SIZE*$MBS/$CP)) + +MM_DATA="./examples/cogvideox/i2v_1.5/data.json" +MM_MODEL="./examples/cogvideox/i2v_1.5/model_cogvideox_i2v_1.5.json" +MM_TOOL="./mindspeed_mm/tools/tools.json" + +DISTRIBUTED_ARGS=" + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT +" + +GPT_ARGS=" + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --context-parallel-size ${CP} \ + --micro-batch-size ${MBS} \ + --global-batch-size ${GBS} \ + --num-layers 1 \ + --hidden-size 3072 \ + --num-attention-heads 48 \ + --seq-length 24 \ + --max-position-embeddings 24 \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --tokenizer-type NullTokenizer \ + --vocab-size 0 \ + --position-embedding-type rope \ + --rotary-base 500000 \ + --swiglu \ + --no-masked-softmax-fusion \ + --lr 1e-5 \ + --min-lr 1e-5 \ + --adam-beta1 0.9 \ + --adam-beta2 0.95 \ + --adam-eps 1e-8 \ + --lr-decay-style constant \ + --weight-decay 1e-4 \ + --lr-warmup-init 1e-4 \ + --lr-warmup-iters 0 \ + --clip-grad 1.0 \ + --train-iters 5000 \ + --no-gradient-accumulation-fusion \ + --no-load-optim \ + --no-load-rng \ + --no-save-optim \ + --no-save-rng \ + --bf16 \ + --recompute-granularity full \ + --recompute-method block \ + --recompute-num-layers 42 \ + --use-distributed-optimizer \ + --overlap-grad-reduce \ + --overlap-param-gather +" + +MM_ARGS=" + --mm-data $MM_DATA \ + --mm-model $MM_MODEL \ + --mm-tool $MM_TOOL +" + +OUTPUT_ARGS=" + --log-interval 1 \ + --save-interval 10000 \ + --eval-interval 10000 \ + --eval-iters 10 \ +" + +logfile=$(date +%Y%m%d)_$(date +%H%M%S) +mkdir -p logs +torchrun $DISTRIBUTED_ARGS pretrain_sora.py \ + $GPT_ARGS \ + $MM_ARGS \ + $OUTPUT_ARGS \ + --distributed-backend nccl >> logs/train_${logfile}.log 2>&1 + +chmod 440 logs/train_${logfile}.log +STEP_TIME=`grep "elapsed time per iteration" logs/train_${logfile}.log | awk -F ':' '{print$5}' | awk -F '|' '{print$1}' | head -n 200 | tail -n 100 | awk '{sum+=$1} END {if (NR != 0) printf("%.1f",sum/NR)}'` +SPS=`awk 'BEGIN{printf "%.3f\n", '${GBS}'*1000/'${STEP_TIME}'}'` +echo "Elapsed Time Per iteration: $STEP_TIME, Average Samples per Second: $SPS" \ No newline at end of file diff --git a/examples/cogvideox/t2v_1.0/data.json b/examples/cogvideox/t2v_1.0/data.json new file mode 100644 index 00000000..6ecd74aa --- /dev/null +++ b/examples/cogvideox/t2v_1.0/data.json @@ -0,0 +1,45 @@ +{ + "dataset_param": { + "dataset_type": "t2v", + "use_feature_data": false, + "basic_parameters": { + "data_path": "/data_path/data.jsonl", + "data_folder": "/data_path", + "data_storage_mode": "standard" + }, + "preprocess_parameters": { + "data_process_type": "CogvideoX", + "video_reader_type": "decoder", + "fps": 8, + "skip_frame_num": 3, + "num_frames": 25, + "max_height": 480, + "max_width": 720, + "dataloader_num_workers": 8, + "train_pipeline": { + "video": [], + "image": [] + } + }, + 
"use_text_processer": true, + "enable_text_preprocessing": false, + "model_max_length": 226, + "tokenizer_config": { + "hub_backend": "hf", + "autotokenizer_name": "T5Tokenizer", + "from_pretrained": "5b-cogvideo/tokenizer" + } + }, + "dataloader_param": { + "dataloader_mode": "sampler", + "sampler_type": "SequentialSampler", + "batch_size": 1, + "num_workers": 8, + "shuffle": true, + "drop_last": true, + "pin_memory": true, + "group_frame": false, + "group_resolution": false, + "collate_param": {} + } +} \ No newline at end of file diff --git a/examples/cogvideox/inference_cogvideox.sh b/examples/cogvideox/t2v_1.0/inference_cogvideox_t2v.sh similarity index 95% rename from examples/cogvideox/inference_cogvideox.sh rename to examples/cogvideox/t2v_1.0/inference_cogvideox_t2v.sh index 01a3cbbc..5204ddbf 100644 --- a/examples/cogvideox/inference_cogvideox.sh +++ b/examples/cogvideox/t2v_1.0/inference_cogvideox_t2v.sh @@ -14,7 +14,7 @@ CP=1 MBS=1 GBS=$(($WORLD_SIZE*$MBS/$CP/$TP)) -MM_MODEL="examples/cogvideox/inference_model.json" +MM_MODEL="examples/cogvideox/t2v_1.0/inference_model_t2v.json" LOAD_PATH="your_converted_dit_ckpt_dir" DISTRIBUTED_ARGS=" diff --git a/examples/cogvideox/inference_model.json b/examples/cogvideox/t2v_1.0/inference_model_t2v.json similarity index 99% rename from examples/cogvideox/inference_model.json rename to examples/cogvideox/t2v_1.0/inference_model_t2v.json index 1927114c..8dfdb4e4 100644 --- a/examples/cogvideox/inference_model.json +++ b/examples/cogvideox/t2v_1.0/inference_model_t2v.json @@ -85,6 +85,7 @@ "cross_attention_dim": null, "attention_bias": true, "input_size": [13, 60, 90], + "patch_type": "2D", "patch_size": [1, 2, 2], "activation_fn": "gelu-approximate", "num_embeds_ada_norm": 1000, diff --git a/examples/cogvideox/model_cogvideox.json b/examples/cogvideox/t2v_1.0/model_cogvideox_t2v.json similarity index 99% rename from examples/cogvideox/model_cogvideox.json rename to examples/cogvideox/t2v_1.0/model_cogvideox_t2v.json index 367f3f06..cfe12471 100644 --- a/examples/cogvideox/model_cogvideox.json +++ b/examples/cogvideox/t2v_1.0/model_cogvideox_t2v.json @@ -37,6 +37,7 @@ "model_id": "cogvideo_diffusion", "sigma_sampler_config": { "uniform_sampling": true, + "group_num": 8, "num_idx": 1000, "discretization_config":{ "shift_scale": 1.0 diff --git a/examples/cogvideox/pretrain_cogvideox_t2v.sh b/examples/cogvideox/t2v_1.0/pretrain_cogvideox_t2v.sh similarity index 88% rename from examples/cogvideox/pretrain_cogvideox_t2v.sh rename to examples/cogvideox/t2v_1.0/pretrain_cogvideox_t2v.sh index 588aecb7..5226611a 100644 --- a/examples/cogvideox/pretrain_cogvideox_t2v.sh +++ b/examples/cogvideox/t2v_1.0/pretrain_cogvideox_t2v.sh @@ -7,6 +7,7 @@ export COMBINED_ENABLE=1 export CPU_AFFINITY_CONF=1 export HCCL_CONNECT_TIMEOUT=1200 export CUDA_DEVICE_MAX_CONNECTIONS=1 +export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True GPUS_PER_NODE=8 MASTER_ADDR=localhost @@ -21,8 +22,8 @@ CP=1 MBS=1 GBS=$(($WORLD_SIZE*$MBS/$CP)) -MM_DATA="./examples/cogvideox/data.json" -MM_MODEL="./examples/cogvideox/model_cogvideox.json" +MM_DATA="./examples/cogvideox/t2v_1.0/data.json" +MM_MODEL="./examples/cogvideox/t2v_1.0/model_cogvideox_t2v.json" MM_TOOL="./mindspeed_mm/tools/tools.json" DISTRIBUTED_ARGS=" @@ -69,6 +70,7 @@ GPT_ARGS=" --no-save-optim \ --no-save-rng \ --bf16 \ + --qk-layernorm \ --recompute-granularity full \ --recompute-method block \ --recompute-num-layers 42 \ @@ -100,5 +102,5 @@ torchrun $DISTRIBUTED_ARGS pretrain_sora.py \ chmod 440 
logs/train_${logfile}.log STEP_TIME=`grep "elapsed time per iteration" logs/train_${logfile}.log | awk -F ':' '{print$5}' | awk -F '|' '{print$1}' | head -n 200 | tail -n 100 | awk '{sum+=$1} END {if (NR != 0) printf("%.1f",sum/NR)}'` -FPS=`awk 'BEGIN{printf "%.3f\n", '${GBS}'*1000/'${STEP_TIME}'}'` -echo "Elapsed Time Per iteration: $STEP_TIME, Average FPS: $FPS" \ No newline at end of file +SPS=`awk 'BEGIN{printf "%.3f\n", '${GBS}'*1000/'${STEP_TIME}'}'` +echo "Elapsed Time Per iteration: $STEP_TIME, Average Samples per Second: $SPS" \ No newline at end of file diff --git a/examples/cogvideox/t2v_1.5/data.json b/examples/cogvideox/t2v_1.5/data.json new file mode 100644 index 00000000..b15c5316 --- /dev/null +++ b/examples/cogvideox/t2v_1.5/data.json @@ -0,0 +1,45 @@ +{ + "dataset_param": { + "dataset_type": "t2v", + "use_feature_data": false, + "basic_parameters": { + "data_path": "/data_path/data.jsonl", + "data_folder": "/data_path", + "data_storage_mode": "standard" + }, + "preprocess_parameters": { + "data_process_type": "CogvideoX", + "video_reader_type": "decoder", + "fps": 8, + "skip_frame_num": 3, + "num_frames": 21, + "max_height": 768, + "max_width": 1360, + "dataloader_num_workers": 8, + "train_pipeline": { + "video": [], + "image": [] + } + }, + "use_text_processer": true, + "enable_text_preprocessing": false, + "model_max_length": 224, + "tokenizer_config": { + "hub_backend": "hf", + "autotokenizer_name": "T5Tokenizer", + "from_pretrained": "5b-cogvideo/tokenizer" + } + }, + "dataloader_param": { + "dataloader_mode": "sampler", + "sampler_type": "SequentialSampler", + "batch_size": 1, + "num_workers": 8, + "shuffle": true, + "drop_last": true, + "pin_memory": true, + "group_frame": false, + "group_resolution": false, + "collate_param": {} + } +} \ No newline at end of file diff --git a/examples/cogvideox/t2v_1.5/model_cogvideox_t2v_1.5.json b/examples/cogvideox/t2v_1.5/model_cogvideox_t2v_1.5.json new file mode 100644 index 00000000..33e6059c --- /dev/null +++ b/examples/cogvideox/t2v_1.5/model_cogvideox_t2v_1.5.json @@ -0,0 +1,123 @@ +{ + "frames": 21, + "resolution": [768, 1360], + "allow_tf32": true, + "allow_internal_format":false, + "load_video_features": false, + "load_text_features": false, + "task": "t2v", + "predictor": { + "model_id": "satdit", + "from_pretrained": "mmdit.pt", + "dtype": "bf16", + "num_layers": 2, + "num_heads": 48, + "head_dim": 64, + "in_channels": 16, + "out_channels": 16, + "dropout": 0.0, + "cross_attention_dim": null, + "attention_bias": true, + "input_size": [6, 96, 170], + "patch_type": "3D", + "patch_size": [2, 2, 2], + "activation_fn": "gelu-approximate", + "num_embeds_ada_norm": 1000, + "norm_type": "qk_ln", + "norm_elementwise_affine": true, + "norm_eps": 1e-5, + "caption_channels": null, + "time_embed_dim": 512, + "text_length": 224, + "text_hidden_size": 4096, + "concat_text_embed": true, + "interpolation_scale": [1.0, 1.0, 1.0], + "use_rope": true + }, + "diffusion": { + "model_id": "cogvideo_diffusion", + "sigma_sampler_config": { + "uniform_sampling": true, + "num_idx": 1000, + "discretization_config":{ + "shift_scale": 1.0 + } + }, + "denoiser_config": { + "num_idx": 1000, + "quantize_c_noise": false, + "discretization_config":{ + "shift_scale": 1.0 + } + } + }, + "text_encoder": { + "model_id": "T5", + "hub_backend": "hf", + "from_pretrained": "5b-cogvideo", + "dtype": "bf16", + "load_in_8bit": false, + "low_cpu_mem_usage": true, + "ucg_rate": 0.1, + "use_attention_mask": false + }, + "ae": { + "model_id": 
"contextparallelcasualvae", + "from_pretrained": "3d-vae.pt", + "cp_size": 1, + "dtype": "bf16", + "z_channels": 16, + "conv_padding": 0, + "num_res_blocks": 3, + "hidden_size_mult": [1,2,2,4], + "encoder_attention": "", + "encoder_nonlinearity": "swish", + "encoder_conv_in": "ContextParallelCausalConv3d", + "encoder_conv_out": "ContextParallelCausalConv3d", + "encoder_mid_resnet": "ContextParallelResnetBlock3D", + "encoder_resnet_blocks": [ + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D" + ], + "encoder_spatial_downsample": [ + "DownSample3D", + "DownSample3D", + "DownSample3D", + "" + ], + "encoder_temporal_downsample": [ + "", + "", + "", + "" + ], + "decoder_attention": "", + "decoder_nonlinearity": "swish", + "decoder_conv_in": "ContextParallelCausalConv3d", + "decoder_conv_out": "ContextParallelCausalConv3d", + "decoder_mid_resnet": "ContextParallelResnetBlock3D", + "decoder_resnet_blocks": [ + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D" + ], + "decoder_spatial_upsample": [ + "", + "Upsample3D", + "Upsample3D", + "Upsample3D" + ], + "decoder_temporal_upsample": [ + "", + "", + "", + "" + ], + "encoder_gather_norm": true, + "decoder_gather_norm": true, + "use_quant_layer": false + } +} diff --git a/examples/cogvideox/t2v_1.5/pretrain_cogvideox_t2v_1.5.sh b/examples/cogvideox/t2v_1.5/pretrain_cogvideox_t2v_1.5.sh new file mode 100644 index 00000000..3aa1c375 --- /dev/null +++ b/examples/cogvideox/t2v_1.5/pretrain_cogvideox_t2v_1.5.sh @@ -0,0 +1,105 @@ +#!/bin/bash +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export ASCEND_SLOG_PRINT_TO_STDOUT=0 +export ASCEND_GLOBAL_LOG_LEVEL=3 +export TASK_QUEUE_ENABLE=1 +export COMBINED_ENABLE=1 +export CPU_AFFINITY_CONF=1 +export HCCL_CONNECT_TIMEOUT=1200 +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True + +GPUS_PER_NODE=8 +MASTER_ADDR=localhost +MASTER_PORT=29505 +NNODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) + +TP=1 +PP=1 +CP=1 +MBS=1 +GBS=$(($WORLD_SIZE*$MBS/$CP)) + +MM_DATA="./examples/cogvideox/t2v_1.5/data.json" +MM_MODEL="./examples/cogvideox/t2v_1.5/model_cogvideox_t2v_1.5.json" +MM_TOOL="./mindspeed_mm/tools/tools.json" + +DISTRIBUTED_ARGS=" + --nproc_per_node $GPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT +" + +GPT_ARGS=" + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --context-parallel-size ${CP} \ + --micro-batch-size ${MBS} \ + --global-batch-size ${GBS} \ + --num-layers 1 \ + --hidden-size 3072 \ + --num-attention-heads 48 \ + --seq-length 24 \ + --max-position-embeddings 24 \ + --attention-dropout 0.0 \ + --hidden-dropout 0.0 \ + --tokenizer-type NullTokenizer \ + --vocab-size 0 \ + --position-embedding-type rope \ + --rotary-base 500000 \ + --swiglu \ + --no-masked-softmax-fusion \ + --lr 1e-4 \ + --min-lr 1e-4 \ + --adam-beta1 0.9 \ + --adam-beta2 0.999 \ + --adam-eps 1e-8 \ + --lr-decay-style constant \ + --weight-decay 1e-2 \ + --lr-warmup-init 1e-4 \ + --lr-warmup-iters 500 \ + --clip-grad 1.0 \ + --train-iters 5000 \ + --no-gradient-accumulation-fusion \ + --no-load-optim \ + --no-load-rng \ + --no-save-optim \ + --no-save-rng \ + --bf16 \ + --recompute-granularity full \ + --recompute-method block \ + --recompute-num-layers 42 \ + --use-distributed-optimizer \ + --overlap-grad-reduce \ + 
--overlap-param-gather +" + +MM_ARGS=" + --mm-data $MM_DATA \ + --mm-model $MM_MODEL \ + --mm-tool $MM_TOOL +" + +OUTPUT_ARGS=" + --log-interval 1 \ + --save-interval 10000 \ + --eval-interval 10000 \ + --eval-iters 10 \ +" + +logfile=$(date +%Y%m%d)_$(date +%H%M%S) +mkdir -p logs +torchrun $DISTRIBUTED_ARGS pretrain_sora.py \ + $GPT_ARGS \ + $MM_ARGS \ + $OUTPUT_ARGS \ + --distributed-backend nccl >> logs/train_${logfile}.log 2>&1 + +chmod 440 logs/train_${logfile}.log +STEP_TIME=`grep "elapsed time per iteration" logs/train_${logfile}.log | awk -F ':' '{print$5}' | awk -F '|' '{print$1}' | head -n 200 | tail -n 100 | awk '{sum+=$1} END {if (NR != 0) printf("%.1f",sum/NR)}'` +SPS=`awk 'BEGIN{printf "%.3f\n", '${GBS}'*1000/'${STEP_TIME}'}'` +echo "Elapsed Time Per iteration: $STEP_TIME, Average Samples per Second: $SPS" \ No newline at end of file diff --git a/examples/diffusers/flux/README.md b/examples/diffusers/flux/README.md index c33281ac..d6efbc65 100644 --- a/examples/diffusers/flux/README.md +++ b/examples/diffusers/flux/README.md @@ -264,7 +264,7 @@ ``` - 在文件上方的import栏增加`DistributedType`在`from accelerate import Acceleratore`后 (30行附近) - - 在`if accelerator.is_main_process`后增加 `or accelerator.distributed_type == DistributedType.DEEPSPEED` (1669/1788行附近) + - 在`if accelerator.is_main_process`后增加 `or accelerator.distributed_type == DistributedType.DEEPSPEED` (1669/1788行附近),并在`if args.checkpoints_total_limit is not None`后增加`and accelerator.is_main_process` ```python from accelerate import Accelerator, DistributedType @@ -272,6 +272,8 @@ if accelerator.is_main_process or accelerator.distributed_type == DistributedType.DEEPSPEED: # if accelerator.is_main_process: # 原代码 + if global_step % args.checkpointing_steps == 0: # 原代码 不进行修改 + if args.checkpoints_total_limit is not None and accelerator.is_main_process: # 添加 ``` Lora任务需调用patch任务进行权重保存: @@ -391,6 +393,28 @@ vim infer_flux_text2img_bf16.py # 进入运行推理的Python文件 ```shell python infer_flux_text2img_lora_bf16.py ``` + + 【分布式推理】 + + ```shell + vim infer_flux_text2img_distrib.py + ``` + +- 修改模型权重路径 model_path为模型权重路径或微调后的权重路径 +- 如lora微调 可将lora_weights修改为Lora权重路径 + + ```python + model_path = "/black-forest-labs/FLUX.1-dev" # 模型权重/微调权重路径 + lora_weights = "/pytorch_lora_weights.safetensors" # Lora权重路径 + ``` + +- 启动分布式推理脚本 + + - 因使用accelerate进行分布式推理,config可设置:`--num_processes=卡数`,`num_machines=机器数`等 + + ```shell + accelerate launch --num_processes=4 infer_flux_text2img_distrib.py # 单机四卡进行分布式推理 + ``` diff --git a/examples/diffusers/flux/infer_flux_text2img_distrib.py b/examples/diffusers/flux/infer_flux_text2img_distrib.py new file mode 100644 index 00000000..680aadd1 --- /dev/null +++ b/examples/diffusers/flux/infer_flux_text2img_distrib.py @@ -0,0 +1,63 @@ +import os + +import torch +from accelerate import PartialState +from diffusers import FluxPipeline + +output_path = "./flux_lora_NPU" +os.makedirs(output_path, exist_ok=True) + +MODEL_PATH = "/black-forest-labs/FLUX.1-dev" # FLUX模型路径 +LORA_WEIGHTS = "./output/pytorch_lora_weights.safetensors" # LoRA权重路径 +pipe = FluxPipeline.from_pretrained( + MODEL_PATH, torch_dtype=torch.bfloat16, local_files_only=True +) + +if os.path.exists(LORA_WEIGHTS): + print(f"Loading LoRA weights from {LORA_WEIGHTS}") + pipe.load_lora_weights(LORA_WEIGHTS) +else: + print("LoRA weights not found. 
Using the base model") + +distributed_state = PartialState() +pipe.to(distributed_state.device) + +PROMPTS = [ + "masterpiece, best quality, Cute dragon creature, pokemon style, night, moonlight, dim lighting", + "masterpiece, best quality, Pikachu walking in beijing city, pokemon style, night, moonlight, dim lighting", + "masterpiece, best quality, red panda , pokemon style, evening light, sunset, rim lighting", + "masterpiece, best quality, Photo of (Lion:1.2) on a couch, flower in vase, dof, film grain, crystal clear, pokemon style, dark studio", + "masterpiece, best quality, siberian cat pokemon on river, pokemon style, evening light, sunset, rim lighting, depth of field", + "masterpiece, best quality, pig, Exquisite City, (sky:1.3), (Miniature tree:1.3), Miniature object, many flowers, glowing mushrooms, (creek:1.3), lots of fruits, cute colorful animal protagonist, Firefly, meteor, Colorful cloud, pokemon style, Complicated background, rainbow,", + "masterpiece, best quality, (pokemon), a cute pikachu, girl with glasses, (masterpiece, top quality, best quality, official art, beautiful and aesthetic:1.2),", + "masterpiece, best quality, sugimori ken \(style\), (pokemon \(creature\)), pokemon electric type, grey and yellow skin, mechanical arms, cyberpunk city background, night, neon light", +] +# 设置随机数种子 +seed_list = [8, 23, 42, 1334] + +for i in seed_list: + generator = torch.Generator(device="npu").manual_seed(i) + + with distributed_state.split_between_processes(PROMPTS) as prompts: + for prompt in prompts: + image = pipe( + prompt=prompt, + generator=generator, + num_inference_steps=28, + height=1024, + width=1024, + guidance_scale=1.0, + ).images + + # Create name for the image + prompt_words = prompt.replace("masterpiece, best quality, ", "").split()[:3] + prompt_abbr = "_".join(prompt_words) + + filename = ( + f"{prompt_abbr}_seed{i}_rank{distributed_state.process_index}.png" + ) + filename = "".join( + c for c in filename if c.isalnum() or c in "._-" + ) # remove special chars + + image[0].save(f"{output_path}/{filename}") diff --git a/examples/diffusers/flux/infer_flux_text2img_lora_bf16.py b/examples/diffusers/flux/infer_flux_text2img_lora_bf16.py index eb4d78fd..0336e14a 100644 --- a/examples/diffusers/flux/infer_flux_text2img_lora_bf16.py +++ b/examples/diffusers/flux/infer_flux_text2img_lora_bf16.py @@ -1,4 +1,5 @@ import os + import torch from diffusers import AutoPipelineForText2Image diff --git a/examples/diffusers/sd3/README.md b/examples/diffusers/sd3/README.md index e32829e1..e4c43273 100644 --- a/examples/diffusers/sd3/README.md +++ b/examples/diffusers/sd3/README.md @@ -237,7 +237,7 @@ torch npu 与 CANN包参考链接:[安装包参考链接](https://support.huaw ``` - 在文件上方的import栏增加`DistributedType`在`from accelerate import Acceleratore`后 (30行附近),并增加patch引用`from patch_sd3 import create_save_model_hook` - - 在`if accelerator.is_main_process`后增加 `or accelerator.distributed_type == DistributedType.DEEPSPEED`(dreambooth在1681行附近,lora在1833行附近) + - 在`if accelerator.is_main_process`后增加 `or accelerator.distributed_type == DistributedType.DEEPSPEED`(dreambooth在1681行附近,lora在1833行附近),并在`if args.checkpoints_total_limit is not None`后增加`and accelerator.is_main_process` ```python from accelerate import Accelerator, DistributedType @@ -247,6 +247,8 @@ torch npu 与 CANN包参考链接:[安装包参考链接](https://support.huaw if accelerator.is_main_process or accelerator.distributed_type == DistributedType.DEEPSPEED: # if accelerator.is_main_process: # 原代码 1681/1833行附近 + if global_step % args.checkpointing_steps == 0: # 原代码 
不进行修改 + if args.checkpoints_total_limit is not None and accelerator.is_main_process: # 添加 ``` Lora任务需调用patch任务进行权重保存: @@ -360,7 +362,7 @@ vim infer_sd3_img2img.py # 进入运行I2I推理的Python文件 python infer_sd3_img2img.py # 单卡推理,图生图 ``` - 【lora微调FLUX模型推理】 + 【lora微调SD3模型推理】 ```shell vim infer_sd3_text2img_lora.py @@ -379,6 +381,28 @@ vim infer_sd3_img2img.py # 进入运行I2I推理的Python文件 python infer_sd3_text2img_lora.py ``` + 【分布式推理】 + + ```shell + vim infer_sd3_text2img_distrib.py + ``` + +- 修改模型权重路径 model_path为模型权重路径或微调后的权重路径 +- 如lora微调 可将lora_weights修改为Lora权重路径 + + ```python + model_path = "stabilityai/stable-diffusion-3.5-large" # 模型权重/微调权重路径 + lora_weights = "/pytorch_lora_weights.safetensors" # Lora权重路径 + ``` + +- 启动分布式推理脚本 + + - 因使用accelerate进行分布式推理,config可设置:`--num_processes=卡数`,`num_machines=机器数`等 + + ```shell + accelerate launch --num_processes=4 infer_sd3_text2img_distrib.py # 单机四卡进行分布式推理 + ``` + ## 使用基线数据集进行评估 ## 引用 diff --git a/examples/diffusers/sd3/infer_sd3_text2img_distrib.py b/examples/diffusers/sd3/infer_sd3_text2img_distrib.py new file mode 100644 index 00000000..102480e7 --- /dev/null +++ b/examples/diffusers/sd3/infer_sd3_text2img_distrib.py @@ -0,0 +1,75 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# Copyright 2024 Stability AI, The HuggingFace Team and The InstantX Team. All rights reserved. +# Copyright 2024 Stability AI and The HuggingFace Team + +import os + +import torch +from accelerate import PartialState +from diffusers import StableDiffusion3Pipeline + +output_path = "./infer_result_lora" +os.makedirs(output_path, exist_ok=True) + +MODEL_PATH = "stabilityai/stable-diffusion-3.5-large" # 模型路径 +LORA_WEIGHTS = "./output/pytorch_lora_weights.safetensors" # LoRA权重路径 +DTYPE = torch.float16 # 混精模式 + +pipe = StableDiffusion3Pipeline.from_pretrained( + MODEL_PATH, + torch_dtype=DTYPE, + local_files_only=True, +) + +if os.path.exists(LORA_WEIGHTS): + print(f"Loading LoRA weights from {LORA_WEIGHTS}") + pipe.load_lora_weights(LORA_WEIGHTS) +else: + print("LoRA weights not found. 
Using the base model") + +distributed_state = PartialState() +pipe.to(distributed_state.device) + +prompts = dict() +prompts["masterpiece, best quality, Cute dragon creature, pokemon style, night, moonlight, dim lighting"] = "deformed, disfigured, underexposed, overexposed, rugged, (low quality), (normal quality)," +prompts["masterpiece, best quality, Pikachu walking in beijing city, pokemon style, night, moonlight, dim lighting"] = "deformed, disfigured, underexposed, overexposed, (low quality), (normal quality)," +prompts["masterpiece, best quality, red panda , pokemon style, evening light, sunset, rim lighting"] = "deformed, disfigured, underexposed, overexposed, (low quality), (normal quality)," +prompts["masterpiece, best quality, Photo of (Lion:1.2) on a couch, flower in vase, dof, film grain, crystal clear, pokemon style, dark studio"] = "deformed, disfigured, underexposed, overexposed, " +prompts["masterpiece, best quality, siberian cat pokemon on river, pokemon style, evening light, sunset, rim lighting, depth of field"] = "deformed, disfigured, underexposed, overexposed, " +prompts["masterpiece, best quality, pig, Exquisite City, (sky:1.3), (Miniature tree:1.3), Miniature object, many flowers, glowing mushrooms, (creek:1.3), lots of fruits, cute colorful animal protagonist, Firefly, meteor, Colorful cloud, pokemon style, Complicated background, rainbow,"] = "Void background,black background,deformed, disfigured, underexposed, overexposed, " +prompts["masterpiece, best quality, (pokemon), a cute pikachu, girl with glasses, (masterpiece, top quality, best quality, official art, beautiful and aesthetic:1.2),"] = "(low quality), (normal quality), (monochrome), lowres, extra fingers, fewer fingers, (watermark), " +prompts["masterpiece, best quality, sugimori ken \(style\), (pokemon \(creature\)), pokemon electric type, grey and yellow skin, mechanical arms, cyberpunk city background, night, neon light"] = "(worst quality, low quality:1.4), watermark, signature, deformed, disfigured, underexposed, overexposed, " + +# 设置随机数种子 +seed_list = [8, 23, 42, 1334] + +# 输出图片 +for i in seed_list: + generator = torch.Generator(device="npu").manual_seed(i) + + # Convert dictionary to list + prompt_list = list(prompts.keys()) + negative_prompt_list = list(prompts.values()) + + with distributed_state.split_between_processes( + list(zip(prompt_list, negative_prompt_list)) + ) as distributed_pairs: + for prompt, negative_prompt in distributed_pairs: + image = pipe( + prompt=prompt, + negative_prompt=negative_prompt, + generator=generator, + num_inference_steps=28, + height=1024, + width=1024, + guidance_scale=1.0, + ).images + + # Create name for the image + prompt_words = prompt.replace("masterpiece, best quality, ", "").split()[:3] + prompt_abbr = "_".join(prompt_words) + + filename = f"{prompt_abbr}_seed{i}_rank{distributed_state.process_index}.png" + filename = "".join(c for c in filename if c.isalnum() or c in "._-") # remove special chars + + image[0].save(f"{output_path}/{filename}") diff --git a/examples/diffusers/sdxl/README.md b/examples/diffusers/sdxl/README.md index cf48c13a..3d6de2b4 100644 --- a/examples/diffusers/sdxl/README.md +++ b/examples/diffusers/sdxl/README.md @@ -454,20 +454,44 @@ SDXL 在 **昇腾芯片** 和 **参考芯片** 上的性能对比: python sdxl/sdxl_img2img_infer.py # 混精fp16 图生图微调任务推理 ``` +【分布式推理】 + +- 对`sdxl/sdxl_text2img_distrib_infer.py`文件进行修改 + + ```shell + vim sdxl/sdxl_text2img_distrib_infer.py + ``` + +- 修改模型权重路径 model_path为模型权重路径或微调后的权重路径 +- 如lora微调 可将lora_weights修改为Lora权重路径 + + ```python 
+ model_path = "/stabilityai/stable-diffusion-xl-base-1.0" # 模型权重/微调权重路径 + lora_weights = "/pytorch_lora_weights.safetensors" # Lora权重路径 + ``` + +- 启动分布式推理脚本 + + - 因使用accelerate进行分布式推理,config可设置:`--num_processes=卡数`,`num_machines=机器数`等 + + ```shell + accelerate launch --num_processes=4 sdxl/sdxl_text2img_distrib_infer.py # 单机四卡进行分布式推理 + ``` + ### 性能 | 芯片 | 卡数 | 任务 | E2E(it/s) | AMP_Type | Torch_Version | deepspeed | |:---:|:---:|:----------:|:-----:|:---:|:---:|:---:| -| 竞品A | 8p | 文生图lora | 1.45 | fp16 | 2.1 | ✔ | +| 竞品A | 1p | 文生图lora | 1.45 | fp16 | 2.1 | ✔ | | Atlas 900 A2 PODc |8p | 文生图lora | 2.61 | fp16 | 2.1 | ✔ | -| 竞品A | 8p | 文生图controlnet | 1.41 | fp16 | 2.1 | ✔ | -| Atlas 900 A2 PODc |8p | 文生图controlnet | 2.97 | fp16 | 2.1 | ✔ | -| 竞品A | 8p | 文生图全参 | 1.55 | fp16 | 2.1 | ✔ | -| Atlas 900 A2 PODc |8p | 文生图全参 | 3.02 | fp16 | 2.1 | ✔ | -| 竞品A | 8p | 图生图 | 3.56 | fp16 | 2.1 | ✔ | -| Atlas 900 A2 PODc |8p | 图生图 | 3.94 | fp16 | 2.1 | ✔ | +| 竞品A | 1p | 文生图controlnet | 1.41 | fp16 | 2.1 | ✔ | +| Atlas 900 A2 PODc |1p | 文生图controlnet | 2.97 | fp16 | 2.1 | ✔ | +| 竞品A | 1p | 文生图全参 | 1.55 | fp16 | 2.1 | ✔ | +| Atlas 900 A2 PODc |1p | 文生图全参 | 3.02 | fp16 | 2.1 | ✔ | +| 竞品A | 1p | 图生图 | 3.56 | fp16 | 2.1 | ✔ | +| Atlas 900 A2 PODc |1p | 图生图 | 3.94 | fp16 | 2.1 | ✔ | ## 引用 diff --git a/examples/diffusers/sdxl/sdxl_text2img_distrib_infer.py b/examples/diffusers/sdxl/sdxl_text2img_distrib_infer.py new file mode 100644 index 00000000..0c99c38c --- /dev/null +++ b/examples/diffusers/sdxl/sdxl_text2img_distrib_infer.py @@ -0,0 +1,72 @@ +# Copyright 2024 Huawei Technologies Co., Ltd +# Copyright 2023 The HuggingFace Team. All rights reserved. + + +import random +import os +from diffusers import DiffusionPipeline +import torch +import torch_npu +from accelerate import PartialState +from torch_npu.contrib import transfer_to_npu +import numpy as np + +output_path = "./sdxl_lora_NPU" +os.makedirs(output_path, exist_ok=True) + +model_path = "/stabilityai/stable-diffusion-xl-base-1.0" # Path for base model +lora_weights = "/pytorch_lora_weights.safetensors" # Path for LoRA weights + +pipe = DiffusionPipeline.from_pretrained(model_path, torch_dtype=torch.float32, local_files_only=True) + +if os.path.exists(lora_weights): + print(f"Loading LoRA weights from {lora_weights}") + pipe.load_lora_weights(lora_weights) +else: + print("LoRA weights not found. 
Using the base model") + +distributed_state = PartialState() +pipe.to(distributed_state.device) + +prompts = dict() +prompts["masterpiece, best quality, Cute dragon creature, pokemon style, night, moonlight, dim lighting"] = "deformed, disfigured, underexposed, overexposed, rugged, (low quality), (normal quality)," +prompts["masterpiece, best quality, Pikachu walking in beijing city, pokemon style, night, moonlight, dim lighting"] = "deformed, disfigured, underexposed, overexposed, (low quality), (normal quality)," +prompts["masterpiece, best quality, red panda , pokemon style, evening light, sunset, rim lighting"] = "deformed, disfigured, underexposed, overexposed, (low quality), (normal quality)," +prompts["masterpiece, best quality, Photo of (Lion:1.2) on a couch, flower in vase, dof, film grain, crystal clear, pokemon style, dark studio"] = "deformed, disfigured, underexposed, overexposed, " +prompts["masterpiece, best quality, siberian cat pokemon on river, pokemon style, evening light, sunset, rim lighting, depth of field"] = "deformed, disfigured, underexposed, overexposed, " +prompts["masterpiece, best quality, pig, Exquisite City, (sky:1.3), (Miniature tree:1.3), Miniature object, many flowers, glowing mushrooms, (creek:1.3), lots of fruits, cute colorful animal protagonist, Firefly, meteor, Colorful cloud, pokemon style, Complicated background, rainbow,"] = "Void background,black background,deformed, disfigured, underexposed, overexposed, " +prompts["masterpiece, best quality, (pokemon), a cute pikachu, girl with glasses, (masterpiece, top quality, best quality, official art, beautiful and aesthetic:1.2),"] = "(low quality), (normal quality), (monochrome), lowres, extra fingers, fewer fingers, (watermark), " +prompts["masterpiece, best quality, sugimori ken \(style\), (pokemon \(creature\)), pokemon electric type, grey and yellow skin, mechanical arms, cyberpunk city background, night, neon light"] = "(worst quality, low quality:1.4), watermark, signature, deformed, disfigured, underexposed, overexposed, " +#设置随机数种子 +seed_list = [8, 23, 42, 1334] + +# 输出图片 +for i in seed_list: + generator = torch.Generator(device="npu").manual_seed(i) + + # Convert dictionary to list + prompt_list = list(prompts.keys()) + negative_prompt_list = list(prompts.values()) + + with distributed_state.split_between_processes( + list(zip(prompt_list, negative_prompt_list)) + ) as distributed_pairs: + for prompt, negative_prompt in distributed_pairs: + image = pipe( + prompt=prompt, + negative_prompt=negative_prompt, + generator=generator, + num_inference_steps=28, + height=1024, + width=1024, + guidance_scale=1.0, + ).images + + # Create name for the image + prompt_words = prompt.replace("masterpiece, best quality, ", "").split()[:3] + prompt_abbr = "_".join(prompt_words) + + filename = f"{prompt_abbr}_seed{i}_rank{distributed_state.process_index}.png" + filename = "".join(c for c in filename if c.isalnum() or c in "._-") # remove special chars + + image[0].save(f"{output_path}/{filename}") diff --git a/examples/internvl2/README.md b/examples/internvl2/README.md index db89a79b..b8bc5af1 100644 --- a/examples/internvl2/README.md +++ b/examples/internvl2/README.md @@ -20,7 +20,10 @@ - [准备工作](#jump5.1) - [配置参数](#jump5.2) - [启动推理](#jump5.3) - +- [评测](#jump6) + - [数据集准备](#jump6.1) + - [配置参数](#jump6.2) + - [启动评测](#jump6.3) --- @@ -102,7 +105,7 @@ torch npu 与 CANN包参考链接:[安装包参考链接](https://support.huaw git clone https://gitee.com/ascend/MindSpeed.git cd MindSpeed # checkout commit from MindSpeed core_r0.6.0 - git 
checkout 4c6847e6fda0a458914fd2ea664f6d09a8be300e + git checkout ab39de78be23e88e2c8b0d25edf6135940990c02 pip install -r requirements.txt pip3 install -e . cd .. @@ -131,15 +134,15 @@ torch npu 与 CANN包参考链接:[安装包参考链接](https://support.huaw #### 2. 权重转换 -MindSpeeed-MM修改了部分原始网络的结构名称,使用`examples/internvl2/internvl_convert_to_mm_ckpt.py`脚本对原始预训练权重进行转换。该脚本实现了从huggingface权重到MindSpeed-MM权重的转换以及PP(Pipeline Parallel)权重的切分。 +MindSpeeed-MM修改了部分原始网络的结构名称,使用`examples/internvl2/internvl2_convert_to_mm_ckpt.py`脚本对原始预训练权重进行转换。该脚本实现了从huggingface权重到MindSpeed-MM权重的转换以及PP(Pipeline Parallel)权重的切分。 -以InternVL2-8B为例,`inernvl_convert_to_mm_ckpt.py`的入参`model-size`、`load-dir`、`save-dir`、`trust-remote-code`等如下: +以InternVL2-8B为例,`internvl2_convert_to_mm_ckpt.py`的入参`model-size`、`load-dir`、`save-dir`、`trust-remote-code`等如下: 启动脚本 ```shell # 根据实际情况修改 ascend-toolkit 路径 source /usr/local/Ascend/ascend-toolkit/set_env.sh - python examples/internvl2/internvl_convert_to_mm_ckpt.py \ + python examples/internvl2/internvl2_convert_to_mm_ckpt.py \ --model-size 8B \ --load-dir raw_ckpt/InternVL2-8B \ # huggingface权重目录 --save-dir pretrained/InternVL2-8B \ # 转换后的权重保存目录 @@ -260,12 +263,12 @@ $save_dir ```shell # 根据实际情况修改 ascend-toolkit 路径 source /usr/local/Ascend/ascend-toolkit/set_env.sh - GPUS_PER_NODE=8 + NPUS_PER_NODE=8 MASTER_ADDR=locahost MASTER_PORT=6000 NNODES=1 NODE_RANK=0 - WORLD_SIZE=$(($GPUS_PER_NODE * $NNODES)) + WORLD_SIZE=$(($NPUS_PER_NODE * $NNODES)) ``` @@ -289,7 +292,7 @@ $save_dir ```shell # 根据实际情况修改 ascend-toolkit 路径 source /usr/local/Ascend/ascend-toolkit/set_env.sh - python examples/internvl2/internvl_convert_to_mm_ckpt.py \ + python examples/internvl2/internvl2_convert_to_mm_ckpt.py \ --model-size 8B \ --load-dir raw_ckpt/InternVL2-8B \ # huggingface权重目录 --save-dir pretrained/InternVL2-8B \ # 转换后的权重保存目录 @@ -331,3 +334,66 @@ $save_dir ```shell bash examples/internvl2/inference_internvl.sh ``` + + +## 评测 + +### 数据集准备 + +当前模型支持AI2D(test)、ChartQA(test)、Docvqa(val)、MMMU(val)四种数据集的评测。 +数据集参考下载链接: + +- [MMMU_DEV_VAL](https://opencompass.openxlab.space/utils/VLMEval/MMMU_DEV_VAL.tsv) +- [DocVQA_VAL](https://opencompass.openxlab.space/utils/VLMEval/DocVQA_VAL.tsv) +- [AI2D_TEST](https://opencompass.openxlab.space/utils/VLMEval/AI2D_TEST.tsv) +- [ChartQA_TEST](https://opencompass.openxlab.space/utils/VLMEval/ChartQA_TEST.tsv) + +### 参数配置 +如果要进行评测需要将要评测的数据集名称和路径传到examples/internvl2/evaluate_internvl2_8B.json +需要更改的字段有 + +- `from_pretrained` 需要改为模型的权重文件的路径,如果使用的是huggingface的权重则需要进行权重转换(参考前面的权重转换的章节),如果使用MindSpeed-MM训练出的则不需要进行权重转换。 +- `dataset_path` 需要填入上面下载的数据集文件路径。 +- `evaluation_dataset` 为评测数据集的名称可选的名称有(`ai2d_test`、`mmmu_dev_val`、`docvqa_val`、`chartqa_test`), **注意**:需要与上面的数据集路径相对应。 +- `result_output_path` 为评测结果的输出路径,**注意**:每次评测前需要将之前保存在该路径下评测文件删除。 +- `tokenizer`下面的`from_pretrained`为huggingface下载的InternVL2-8B权重路径。 + +```json + "model_id": "InternVLPipeline", + "from_pretrained": "./internvl8b_mm/release/mp_rank_00/model_optim_rng.pt", + "dataset_path": "./AI2D_TEST.tsv", + "evaluation_dataset":"ai2d_test", + "evaluation_model":"internvl2_8b", + "result_output_path":"./evaluation_outputs/", + + "tokenizer":{ + "hub_backend": "hf", + "autotokenizer_name": "AutoTokenizer", + "from_pretrained": "./InternVL2-8B", + "model_max_length": 4096, + "add_eos_token": false, + "trust_remote_code": true, + "use_fast": false + } + +``` + +examples/internvl2/evaluate_internvl2_8B.json改完后,需要将json文件的路径传入到examples/internvl2/evaluate_internvl2_8B.sh MM_MODEL字段中 + +```shell +MM_MODEL=examples/internvl2/evaluate_internvl2_8B.json +``` 
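
The bullet list above notes that files left under `result_output_path` from a previous run must be removed before each evaluation. A minimal cleanup sketch, assuming `result_output_path` is `./evaluation_outputs/` as in the sample JSON above (adjust the path to your actual configuration):

```shell
# Hypothetical pre-run cleanup: remove stale evaluation results, then recreate the output directory.
# The path is an example and must match result_output_path in evaluate_internvl2_8B.json.
rm -rf ./evaluation_outputs/
mkdir -p ./evaluation_outputs/
```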
+评测支持多卡DP推理需要更改的配置,为NPU卡数量 + +```shell +NPUS_PER_NODE=1 +``` + +### 启动评测 +启动shell开始推理 +```shell +bash examples/internvl2/evaluate_internvl2_8B.sh +``` +评测结果会输出到`result_output_path`路径中,会输出结果文件: +- *.xlsx文件,这个文件会输出每道题的预测结果和答案等详细信息。 +- *.csv文件,这个文件会输出统计准确率等数据。 \ No newline at end of file diff --git a/examples/internvl2/dot_product_attention.py b/examples/internvl2/dot_product_attention.py index 4ace9800..e38e5596 100644 --- a/examples/internvl2/dot_product_attention.py +++ b/examples/internvl2/dot_product_attention.py @@ -103,132 +103,59 @@ def dot_product_attention_forward( ): args = get_args() - if not torch.any(attention_mask): - if self.num_attention_heads_per_partition // self.num_query_groups_per_partition > 1: - key = key.repeat_interleave( - self.num_attention_heads_per_partition // self.num_query_groups_per_partition, dim=2 - ) - value = value.repeat_interleave( - self.num_attention_heads_per_partition // self.num_query_groups_per_partition, dim=2 - ) - - seq_length, batch_size, n_head, head_dim = query.shape[0], query.shape[1], query.shape[2], query.shape[3] - - query, key, value = [rearrange(x, 's b h d -> (s b) h d') for x in [query, key, value]] - - scale = 1.0 / math.sqrt( - self.hidden_size_per_attention_head) if self.scale_mask_softmax.scale is None else self.softmax_scale - - if args.context_parallel_size > 1 and args.context_parallel_algo in ['megatron_cp_algo', 'hybrid_cp_algo']: - in_hybrid_mode = False - if get_context_parallel_group_for_hybrid_ring(check_initialized=False) is not None: - in_hybrid_mode = True - - if not in_hybrid_mode: - cp_group = mpu.get_context_parallel_group() - cp_size = mpu.get_context_parallel_world_size() - rank = mpu.get_context_parallel_rank() - cp_global_ranks = mpu.get_context_parallel_global_ranks() - else: - cp_group = get_context_parallel_group_for_hybrid_ring() - cp_size = get_context_parallel_for_hybrid_ring_world_size() - rank = get_context_parallel_for_hybrid_ring_rank() - cp_global_ranks = get_context_parallel_for_hybrid_ring_global_ranks() - - cp_para = dict() - cp_para['causal'] = args.cp_attention_mask_type == 'causal' - cp_para['cp_group'] = cp_group - cp_para['cp_size'] = cp_size - cp_para['rank'] = rank - cp_para['cp_global_ranks'] = cp_global_ranks - cp_para['cp_group_for_send_recv_overlap'] = mpu.get_context_parallel_group_for_send_recv_overlap() \ - if args.use_cp_send_recv_overlap else None - cp_para['pse'] = self.pse - cp_para['pse_type'] = self.pse_type - output = ringattn_context_parallel(query, key, value, n_head, cp_para, scale, attention_mask, - self.attention_dropout.p) + seq_length, batch_size, n_head, head_dim = query.shape[0], query.shape[1], query.shape[2], query.shape[3] + + query, key, value = [x.transpose(0, 1) for x in [query, key, value]] + + scale = 1.0 / math.sqrt(self.hidden_size_per_attention_head) if self.scale_mask_softmax.scale is None else self.softmax_scale + + if args.context_parallel_size > 1 and args.context_parallel_algo in ['megatron_cp_algo', 'hybrid_cp_algo']: + in_hybrid_mode = False + if get_context_parallel_group_for_hybrid_ring(check_initialized=False) is not None: + in_hybrid_mode = True + + if not in_hybrid_mode: + cp_group = mpu.get_context_parallel_group() + cp_size = mpu.get_context_parallel_world_size() + rank = mpu.get_context_parallel_rank() + cp_global_ranks = mpu.get_context_parallel_global_ranks() else: - if args.use_fusion_attn_v2: - output = npu_fusion_attention( - query, key, value, n_head, 'SBH', - pse=self.pse, - padding_mask=None, - atten_mask=attention_mask, - 
scale=scale, - pse_type=self.pse_type, - pre_tokens=args.pre_tockens, - next_tokens=args.next_tockens, - keep_prob=1 - self.dropout_p, - inner_precise=0, - sparse_mode=args.sparse_mode - )[0] - else: - cu_seqlens = tuple( - torch.arange(seq_length, (batch_size + 1) * seq_length, step=seq_length, dtype=torch.int32).numpy().tolist()) - output = torch_npu.npu_fusion_attention( - query, key, value, head_num=n_head, input_layout="TND", - keep_prob=1. - self.attention_dropout.p, - actual_seq_qlen=cu_seqlens, actual_seq_kvlen=cu_seqlens, - scale=scale, - )[0] - output = output.reshape(batch_size, seq_length, n_head, head_dim).contiguous() - output = output.reshape(batch_size, seq_length, -1).transpose(0, 1) - - return output + cp_group = get_context_parallel_group_for_hybrid_ring() + cp_size = get_context_parallel_for_hybrid_ring_world_size() + rank = get_context_parallel_for_hybrid_ring_rank() + cp_global_ranks = get_context_parallel_for_hybrid_ring_global_ranks() + + cp_para = dict() + cp_para['causal'] = args.cp_attention_mask_type == 'causal' + cp_para['cp_group'] = cp_group + cp_para['cp_size'] = cp_size + cp_para['rank'] = rank + cp_para['cp_global_ranks'] = cp_global_ranks + cp_para['cp_group_for_send_recv_overlap'] = mpu.get_context_parallel_group_for_send_recv_overlap() \ + if args.use_cp_send_recv_overlap else None + cp_para['pse'] = self.pse + cp_para['pse_type'] = self.pse_type + output = ringattn_context_parallel(query, key, value, n_head, cp_para, scale, attention_mask, self.attention_dropout.p) else: - seq_length, batch_size, n_head, head_dim = query.shape[0], query.shape[1], query.shape[2], query.shape[3] - - query, key, value = [x.transpose(0, 1) for x in [query, key, value]] - - scale = 1.0 / math.sqrt(self.hidden_size_per_attention_head) if self.scale_mask_softmax.scale is None else self.softmax_scale - - if args.context_parallel_size > 1 and args.context_parallel_algo in ['megatron_cp_algo', 'hybrid_cp_algo']: - in_hybrid_mode = False - if get_context_parallel_group_for_hybrid_ring(check_initialized=False) is not None: - in_hybrid_mode = True - - if not in_hybrid_mode: - cp_group = mpu.get_context_parallel_group() - cp_size = mpu.get_context_parallel_world_size() - rank = mpu.get_context_parallel_rank() - cp_global_ranks = mpu.get_context_parallel_global_ranks() - else: - cp_group = get_context_parallel_group_for_hybrid_ring() - cp_size = get_context_parallel_for_hybrid_ring_world_size() - rank = get_context_parallel_for_hybrid_ring_rank() - cp_global_ranks = get_context_parallel_for_hybrid_ring_global_ranks() - - cp_para = dict() - cp_para['causal'] = args.cp_attention_mask_type == 'causal' - cp_para['cp_group'] = cp_group - cp_para['cp_size'] = cp_size - cp_para['rank'] = rank - cp_para['cp_global_ranks'] = cp_global_ranks - cp_para['cp_group_for_send_recv_overlap'] = mpu.get_context_parallel_group_for_send_recv_overlap() \ - if args.use_cp_send_recv_overlap else None - cp_para['pse'] = self.pse - cp_para['pse_type'] = self.pse_type - output = ringattn_context_parallel(query, key, value, n_head, cp_para, scale, attention_mask, self.attention_dropout.p) + if args.use_fusion_attn_v2: + output = npu_fusion_attention( + query, key, value, n_head, 'SBH', + pse=self.pse, + padding_mask=None, + atten_mask=attention_mask, + scale=scale, + pse_type=self.pse_type, + pre_tokens=args.pre_tockens, + next_tokens=args.next_tockens, + keep_prob=1 - self.dropout_p, + inner_precise=0, + sparse_mode=args.sparse_mode + )[0] else: - if args.use_fusion_attn_v2: - output = npu_fusion_attention( - 
query, key, value, n_head, 'SBH', - pse=self.pse, - padding_mask=None, - atten_mask=attention_mask, - scale=scale, - pse_type=self.pse_type, - pre_tokens=args.pre_tockens, - next_tokens=args.next_tockens, - keep_prob=1 - self.dropout_p, - inner_precise=0, - sparse_mode=args.sparse_mode - )[0] - else: - output = torch_npu.npu_fusion_attention(query, key, value, n_head, "BSND", - keep_prob=1. - self.attention_dropout.p, - scale=scale, - atten_mask=attention_mask, )[0] - output = output.transpose(0, 1).reshape(seq_length, batch_size, -1) - - return output + output = torch_npu.npu_fusion_attention(query, key, value, n_head, "BSND", + keep_prob=1. - self.attention_dropout.p, + scale=scale, + atten_mask=attention_mask, )[0] + output = output.transpose(0, 1).reshape(seq_length, batch_size, -1) + + return output diff --git a/examples/internvl2/evaluate_internvl2_8B.sh b/examples/internvl2/evaluate_internvl2_8B.sh index 277b6c2b..847b4388 100644 --- a/examples/internvl2/evaluate_internvl2_8B.sh +++ b/examples/internvl2/evaluate_internvl2_8B.sh @@ -8,7 +8,6 @@ export CPU_AFFINITY_CONF=1 export HCCL_CONNECT_TIMEOUT=7200 export HCCL_EXEC_TIMEOUT=7200 export CUDA_DEVICE_MAX_CONNECTIONS=1 -export HOST_CACHE_CAPACITY=20 export ACLNN_CACHE_LIMIT=100000 export TOKENIZERS_PARALLELISM=false export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True diff --git a/examples/internvl2/finetune_internvl2_2B.sh b/examples/internvl2/finetune_internvl2_2B.sh index bab2cff6..40b1a704 100644 --- a/examples/internvl2/finetune_internvl2_2B.sh +++ b/examples/internvl2/finetune_internvl2_2B.sh @@ -7,15 +7,14 @@ export COMBINED_ENABLE=1 export CPU_AFFINITY_CONF=1 export HCCL_CONNECT_TIMEOUT=1200 export CUDA_DEVICE_MAX_CONNECTIONS=1 -export HOST_CACHE_CAPACITY=20 export ACLNN_CACHE_LIMIT=100000 -GPUS_PER_NODE=8 +NPUS_PER_NODE=8 MASTER_ADDR=localhost MASTER_PORT=6000 NNODES=1 NODE_RANK=0 -WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) MBS=1 GRAD_ACC_STEP=16 @@ -38,7 +37,7 @@ MM_ARGS=" " DISTRIBUTED_ARGS=" - --nproc_per_node $GPUS_PER_NODE \ + --nproc_per_node $NPUS_PER_NODE \ --nnodes $NNODES \ --node_rank $NODE_RANK \ --master_addr $MASTER_ADDR \ diff --git a/examples/internvl2/finetune_internvl2_76B.sh b/examples/internvl2/finetune_internvl2_76B.sh index 0634e8ba..ffee38ad 100644 --- a/examples/internvl2/finetune_internvl2_76B.sh +++ b/examples/internvl2/finetune_internvl2_76B.sh @@ -7,10 +7,9 @@ export COMBINED_ENABLE=1 export CPU_AFFINITY_CONF=1 export HCCL_CONNECT_TIMEOUT=1200 export CUDA_DEVICE_MAX_CONNECTIONS=1 -export HOST_CACHE_CAPACITY=20 export ACLNN_CACHE_LIMIT=100000 -GPUS_PER_NODE=16 +NPUS_PER_NODE=16 MASTER_PORT=6000 HOSTFILE='./hostfile' NODEADDR=$(hostname -I | awk -F " " '{print$1}') @@ -18,7 +17,7 @@ NODE_RANK=$(awk '{ranks[$1]=(FNR-1);}END{print ranks["'$NODEADDR'"];}' $HOSTFILE NNODES=$(wc -l $HOSTFILE) MASTER_ADDR=$(head -n 1 $HOSTFILE | awk '{print $1;}') -WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) MBS=1 GRAD_ACC_STEP=128 @@ -41,7 +40,7 @@ MM_ARGS=" " DISTRIBUTED_ARGS=" - --nproc_per_node $GPUS_PER_NODE \ + --nproc_per_node $NPUS_PER_NODE \ --nnodes $NNODES \ --node_rank $NODE_RANK \ --master_addr $MASTER_ADDR \ diff --git a/examples/internvl2/finetune_internvl2_8B.sh b/examples/internvl2/finetune_internvl2_8B.sh index f39f06de..071877ce 100644 --- a/examples/internvl2/finetune_internvl2_8B.sh +++ b/examples/internvl2/finetune_internvl2_8B.sh @@ -7,15 +7,14 @@ export COMBINED_ENABLE=1 export CPU_AFFINITY_CONF=1 export 
HCCL_CONNECT_TIMEOUT=1200 export CUDA_DEVICE_MAX_CONNECTIONS=1 -export HOST_CACHE_CAPACITY=20 export ACLNN_CACHE_LIMIT=100000 -GPUS_PER_NODE=8 +NPUS_PER_NODE=8 MASTER_ADDR=localhost MASTER_PORT=6000 NNODES=1 NODE_RANK=0 -WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) MBS=1 GRAD_ACC_STEP=64 @@ -38,7 +37,7 @@ MM_ARGS=" " DISTRIBUTED_ARGS=" - --nproc_per_node $GPUS_PER_NODE \ + --nproc_per_node $NPUS_PER_NODE \ --nnodes $NNODES \ --node_rank $NODE_RANK \ --master_addr $MASTER_ADDR \ diff --git a/examples/internvl2/finetune_internvl2_8B_vpp.sh b/examples/internvl2/finetune_internvl2_8B_vpp.sh index 21c3e28f..5dc5f301 100644 --- a/examples/internvl2/finetune_internvl2_8B_vpp.sh +++ b/examples/internvl2/finetune_internvl2_8B_vpp.sh @@ -7,15 +7,14 @@ export COMBINED_ENABLE=1 export CPU_AFFINITY_CONF=1 export HCCL_CONNECT_TIMEOUT=1200 export CUDA_DEVICE_MAX_CONNECTIONS=1 -export HOST_CACHE_CAPACITY=20 export ACLNN_CACHE_LIMIT=100000 -GPUS_PER_NODE=8 +NPUS_PER_NODE=8 MASTER_ADDR=localhost MASTER_PORT=6000 NNODES=1 NODE_RANK=0 -WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) MBS=1 GRAD_ACC_STEP=64 @@ -39,7 +38,7 @@ MM_ARGS=" " DISTRIBUTED_ARGS=" - --nproc_per_node $GPUS_PER_NODE \ + --nproc_per_node $NPUS_PER_NODE \ --nnodes $NNODES \ --node_rank $NODE_RANK \ --master_addr $MASTER_ADDR \ diff --git a/examples/internvl2/inference_internvl.sh b/examples/internvl2/inference_internvl.sh index 958b6975..bd132002 100644 --- a/examples/internvl2/inference_internvl.sh +++ b/examples/internvl2/inference_internvl.sh @@ -7,15 +7,14 @@ export COMBINED_ENABLE=1 export CPU_AFFINITY_CONF=1 export HCCL_CONNECT_TIMEOUT=1200 export CUDA_DEVICE_MAX_CONNECTIONS=1 -export HOST_CACHE_CAPACITY=20 export ACLNN_CACHE_LIMIT=100000 -GPUS_PER_NODE=1 +NPUS_PER_NODE=1 MASTER_ADDR=localhost MASTER_PORT=6000 NNODES=1 NODE_RANK=0 -WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) MBS=1 GRAD_ACC_STEP=1 @@ -32,7 +31,7 @@ MM_ARGS=" " DISTRIBUTED_ARGS=" - --nproc_per_node $GPUS_PER_NODE \ + --nproc_per_node $NPUS_PER_NODE \ --nnodes $NNODES \ --node_rank $NODE_RANK \ --master_addr $MASTER_ADDR \ diff --git a/examples/internvl2/internvl2_convert_mm_to_hg.py b/examples/internvl2/internvl2_convert_mm_to_hf.py similarity index 92% rename from examples/internvl2/internvl2_convert_mm_to_hg.py rename to examples/internvl2/internvl2_convert_mm_to_hf.py index 3f6c1991..152c6763 100644 --- a/examples/internvl2/internvl2_convert_mm_to_hg.py +++ b/examples/internvl2/internvl2_convert_mm_to_hf.py @@ -1,10 +1,11 @@ import argparse -import os import json +import os from pathlib import Path +from dataclasses import dataclass + import torch from safetensors.torch import save_file -from dataclasses import dataclass @dataclass @@ -68,11 +69,9 @@ def check_pp_config(_model_config_dict=None): def merge_by_pp(pp_ckpt_file, pp_rank: int, _model_config_dict=None): - # vit: [24, 0, 0, 0] - # llm: [6, 9, 9, 8] _vit_pipeline_num_layers = _model_config_dict.vit_pipeline_num_layers _llm_pipeline_num_layers = _model_config_dict.llm_pipeline_num_layers - + vit_pp_start_index = 0 llm_pp_start_index = 0 if pp_rank > 0: @@ -132,8 +131,8 @@ def split_qkv(wqkv, hn=64, ng=8): return wq, wk, wv -def convert_mm_to_hg(_mm_state_dict, _model_config_dict=None): - _hg_state_dict = {} +def convert_mm_to_hf(_mm_state_dict, _model_config_dict=None): + _hf_state_dict = {} # check LlamaForCausalLM or InternLM2ForCausalLM architectures_key = 
"text_decoder.decoder.layers.0.self_attention.linear_qkv.weight" is_llama_for_causa_llm = True @@ -189,7 +188,7 @@ def convert_mm_to_hg(_mm_state_dict, _model_config_dict=None): new_key = new_key.replace('linear_fc2', '3') print(f'mapping {key} to {new_key}') - _hg_state_dict[new_key] = value + _hf_state_dict[new_key] = value if is_llama_for_causa_llm: for i in range(_model_config_dict.llm_num_layers): @@ -198,17 +197,17 @@ def convert_mm_to_hg(_mm_state_dict, _model_config_dict=None): v_name = f'language_model.model.layers.{i}.self_attention.wv.weight' qkv_name = f'language_model.model.layers.{i}.attention.wqkv.weight' - if qkv_name in _hg_state_dict.keys(): - wqkv = _hg_state_dict[qkv_name] + if qkv_name in _hf_state_dict.keys(): + wqkv = _hf_state_dict[qkv_name] else: raise AssertionError(f'Missing key {qkv_name}') wq, wk, wv = split_qkv(wqkv) if not (wq and wk and wv): raise ValueError("llama_for_causa_llm split qkv weight error, maybe not support right now.") - _hg_state_dict[q_name] = wq - _hg_state_dict[k_name] = wk - _hg_state_dict[v_name] = wv - _hg_state_dict.pop(qkv_name) + _hf_state_dict[q_name] = wq + _hf_state_dict[k_name] = wk + _hf_state_dict[v_name] = wv + _hf_state_dict.pop(qkv_name) print(f'merge {q_name}, {k_name}, {v_name} to {qkv_name}') # split w1 and w3 weight @@ -217,16 +216,16 @@ def convert_mm_to_hg(_mm_state_dict, _model_config_dict=None): gate_name = f'language_model.model.layers.{i}.feed_forward.w1.weight' up_name = f'language_model.model.layers.{i}.feed_forward.w3.weight' # split w1 和 w3 - if gate_and_up_name in _hg_state_dict.keys(): - gate_and_up_weight = _hg_state_dict[gate_and_up_name] + if gate_and_up_name in _hf_state_dict.keys(): + gate_and_up_weight = _hf_state_dict[gate_and_up_name] # refer to: torch.cat([gate_proj_weight, up_proj_weight], dim=0) gate_weight, up_weight = torch.split(gate_and_up_weight, gate_and_up_weight.size(0) // 2, dim=0) - _hg_state_dict[gate_name] = gate_weight - _hg_state_dict[up_name] = up_weight + _hf_state_dict[gate_name] = gate_weight + _hf_state_dict[up_name] = up_weight # remove useless weight - _hg_state_dict.pop(gate_and_up_name) + _hf_state_dict.pop(gate_and_up_name) print(f'split {gate_and_up_name} to {gate_name} and {up_name}') - return _hg_state_dict + return _hf_state_dict def split_by_index_json(_state_dict, _index_json_path): @@ -251,7 +250,7 @@ def save_by_index_json(_state_dicts, _save_dir): if __name__ == "__main__": - parser = argparse.ArgumentParser(description='mm2hg tools checkpoint utility arguments', + parser = argparse.ArgumentParser(description='mm2hf tools checkpoint utility arguments', allow_abbrev=False, conflict_handler='resolve') parser.add_argument('--model-size', type=str, required=True, @@ -260,7 +259,7 @@ if __name__ == "__main__": help='MindSpeed-MM checkpoint path for loading') parser.add_argument('--save-dir', type=str, required=True, help='HuggingFace checkpoint path for saving') - parser.add_argument('--raw-hg-dir', type=str, required=True, + parser.add_argument('--raw-hf-dir', type=str, required=True, help='original raw huggingface checkpoint path for loading') parser.add_argument('--trust-remote-code', type=str, required=True, default=False, help='Whether not to allow HuggingFace API to execute code') @@ -268,14 +267,14 @@ if __name__ == "__main__": if unrecognized_args: ValueError(f"please check unrecognized args: {unrecognized_args}") - index_json_path = os.path.join(args.raw_hg_dir, "model.safetensors.index.json") + index_json_path = os.path.join(args.raw_hf_dir, 
"model.safetensors.index.json") if not os.path.exists(index_json_path): raise ValueError(f"safetensors.index.json not in {index_json_path}") model_config_ = get_model_config(args.model_size) check_pp_config(model_config_) merge_state_dict = load_from_mm(args.load_dir, model_config_) - hg_state_dict = convert_mm_to_hg(merge_state_dict, model_config_) - state_dicts = split_by_index_json(hg_state_dict, index_json_path) + hf_state_dict = convert_mm_to_hf(merge_state_dict, model_config_) + state_dicts = split_by_index_json(hf_state_dict, index_json_path) save_by_index_json(state_dicts, args.save_dir) diff --git a/examples/internvl2/internvl_convert_to_mm_ckpt.py b/examples/internvl2/internvl2_convert_to_mm_ckpt.py similarity index 99% rename from examples/internvl2/internvl_convert_to_mm_ckpt.py rename to examples/internvl2/internvl2_convert_to_mm_ckpt.py index 40395cc3..33cdc6ea 100644 --- a/examples/internvl2/internvl_convert_to_mm_ckpt.py +++ b/examples/internvl2/internvl2_convert_to_mm_ckpt.py @@ -8,6 +8,9 @@ import torch from transformers import AutoModelForCausalLM, AutoConfig +llm_arch = '' + + def load_from_hf(load_dir, trust_remote_code): # Load Huggingface model. hf_model = AutoModelForCausalLM.from_pretrained(load_dir, device_map='cpu', trust_remote_code=trust_remote_code, @@ -128,7 +131,7 @@ def merge_qkv(wq, wk, wv, hn=64, ng=8): return qkv -def convert_hg_to_mm(_state_dict, _num_layers): +def convert_hf_to_mm(_state_dict, _num_layers): new_dict = {} for key, value in _state_dict.items(): new_key = None @@ -369,7 +372,7 @@ if __name__ == '__main__': for key, value in state_dict.items(): print(key, value.shape) print(50 * '*') - state_dict = convert_hg_to_mm(state_dict, llm_num_layers) + state_dict = convert_hf_to_mm(state_dict, llm_num_layers) pipeline_state_dicts, remains = split_model_by_pipeline(state_dict, pp_split) if len(remains) > 0: print(remains) diff --git a/examples/llava1.5/README.md b/examples/llava1.5/README.md index 71dcf184..f2ff992a 100644 --- a/examples/llava1.5/README.md +++ b/examples/llava1.5/README.md @@ -99,16 +99,13 @@ torch npu 与 CANN包参考链接:[安装包参考链接](https://support.huaw pip install torch_npu-2.1.0*-cp310-cp310m-linux_aarch64.whl # apex for Ascend 参考 https://gitee.com/ascend/apex - pip install apex-0.1_ascend*-cp310-cp310m-linux_aarch64.whl - - # 将shell脚本中的环境变量路径修改为真实路径,下面为参考路径 - source /usr/local/Ascend/ascend-toolkit/set_env.sh + pip install apex-0.1_ascend*-cp310-cp310m-linux_aarch64.whl # 安装加速库 git clone https://gitee.com/ascend/MindSpeed.git cd MindSpeed # checkout commit from MindSpeed core_r0.6.0 - git checkout 3da17d56 + git checkout ab39de78be23e88e2c8b0d25edf6135940990c02 pip install -r requirements.txt pip3 install -e . cd .. @@ -117,24 +114,6 @@ torch npu 与 CANN包参考链接:[安装包参考链接](https://support.huaw pip install -e . ``` -**注意事项:** - - 需要修改 mindspeed/core/transformer/dot_product_attention.py的65行,修改如下: - -```python -def dot_product_attention_forward_wrapper(fn): - @wraps(fn) - def wrapper(self, query, key, value, attention_mask, attn_mask_type, packed_seq_params): - # 注释下一行 - # attention_mask = get_attention_mask() - if get_args().use_flash_attn: - return dot_product_attention_forward(self, query, key, value, attention_mask, attn_mask_type, packed_seq_params) - return fn(self, query, key, value, attention_mask, attn_mask_type, packed_seq_params) - - return wrapper -``` - ---- @@ -152,90 +131,36 @@ def dot_product_attention_forward_wrapper(fn): -#### 2. 权重转换 +#### 2. 
权重转换(当前依赖openai-clip库,正在规划重构) MindSpeeed-MM修改了部分原始网络的结构名称,因此需要使用如下脚本代码对下载的预训练权重进行转换。 当前训练只使用了ViT-L-14-336px和lmsys/vicuna-7b-v1.5两个模型,以下介绍这两个模型从开源仓转换成MindSpeeed-MM所需权重的方法: - ViT-L-14-336px权重转换 - 参考 NVIDIA/Megatron-LM中[Vision model](https://github.com/NVIDIA/Megatron-LM/blob/main/examples/multimodal/README.md#vision-model) , - 执行如下命令: - - ``` - python examples/multimodal/clip_converter.py --download-root /some/download/folder --output /some/output/folder --tensor-parallel-size 1 --use-te - ``` - - 如果执行环境连接不到外网下载ViT-L-14-336px模型,建议手动下载,再在clip_converter.py中将ViT-L-14-336px路径修改成本地路径 + 脚本参考 NVIDIA/Megatron-LM中[Vision model](https://github.com/NVIDIA/Megatron-LM/blob/core_r0.8.0/examples/multimodal/README.md#vision-model) ,将[ViT-L-14-336px](https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt)权重下载到本地后, +  执行如下命令: + ```bash + # 安装依赖(加载原始权重需要依赖openai-clip库) +   pip install git+https://github.com/openai/CLIP.git - ``` - model, _ = clip.load("{dir_to_model}/ViT-L-14-336px.pt", device=device, download_root="") +   python examples/llava1.5/clip_converter.py \ + --download-root {dir_to_model}/ViT-L-14-336px.pt \ + --output {target_dir} ``` - 其中{dir_to_model}为模型所在的路径。 - 转换的结果在: /some/output/folder/iter_0000001/mp_rank_00/model_optim_rng.pt - - 对于转换后的结果,需要再执行如下转换,其中{target_dir}为最终的权重文件保存路径: - - ```python - before = torch.load("/some/output/folder/iter_0000001/mp_rank_00/model_optim_rng.pt")["model"] - torch.save(before, "{target_dir}/converted_clip.pt") - ``` + 其中{dir_to_model}为下载模型权重所在的路径,转换后权重将保存在{target_dir}/converted_clip.pt。 - lmsys/vicuna-7b-v1.5权重转换 - 参考[ModelLink](https://gitee.com/ascend/ModelLink/blob/master/examples/README.md#21-huggingface%E6%9D%83%E9%87%8D%E8%BD%AC%E6%8D%A2%E5%88%B0megatron-lm%E6%A0%BC%E5%BC%8F)中语言模型权重转换的脚本: - + 下载权重后执行如下命令: ```shell - source {cann_dir}/ascend-toolkit/set_env.sh - HF_FORMAT_DIR="{dir_to_model}/vicuna-7b-v1.5" - MEGATRON_FORMAT_DIR="{target_dir}" - TOKENIZER_MODEL="{dir_to_model}/vicuna-7b-v1.5/tokenizer.model" - python tools/checkpoint/convert_ckpt.py \ - --model-type GPT \ - --loader llama2_hf \ - --saver megatron \ - --target-tensor-parallel-size 1 \ - --target-pipeline-parallel-size 1 \ - --load-dir ${HF_FORMAT_DIR} \ - --save-dir ${MEGATRON_FORMAT_DIR} \ - --tokenizer-model ${TOKENIZER_MODEL} \ - --params-dtype bf16 + python examples/llava1.5/vicuna_converter.py \ + --load-dir {dir_to_model}/vicuna-7b-v1.5 \ + --save-dir {target_dir} \ + --trust-remote-code True # 为保证代码安全,配置trust_remote_code默认为False,用户需要设置为True,并且确保自己下载的模型和数据的安全性 ``` - - 其中: {dir_to_model}为vicuna-7b-v1.5所在路径,{target_dir}为转换结果文件路径, {cann_dir}为cann包安装路径。转换的结果在:{target_dir}/iter_0000001/mp_rank_00/model_optim_rng.pt。 - -由于MindSpeed-MM中模型变量名称跟转换结果有差异,需要再做一次适配: - -- 在megatron同级目录,创建convert.py脚本,将如下代码复制到convert.py中, -- 修改{target_dir}为上一步model_optim_rng.pt所在路径, -- 修改{dir_to_save_file}为结果文件所在路径, -- 执行命令:python convert.py - - ```python - import torch - def convert_param(): - ckp = torch.load("{target_dir}/model_optim_rng.pt")["model"]["language_model"] - target_ckp = {} - target_ckp["embedding.word_embeddings.weight"] = ckp["embedding"]["word_embeddings"]["weight"] - target_ckp["output_layer.weight"] = ckp["output_layer"]["weight"] - for encode_key in ckp["encoder"].keys(): - if ckp["encoder"][encode_key] is not None: - targetkey = encode_key.replace("input_norm", "input_layernorm") - targetkey = targetkey.replace(".dense.", ".linear_proj.") - targetkey = targetkey.replace("query_key_value", "linear_qkv") - targetkey = 
targetkey.replace("post_attention_norm", "pre_mlp_layernorm") - targetkey = targetkey.replace("dense_h_to_4h", "linear_fc1") - targetkey = targetkey.replace("dense_4h_to_h", "linear_fc2") - targetkey = targetkey.replace("final_norm", "final_layernorm") - targetkey = "decoder." + targetkey - target_ckp[targetkey] = ckp["encoder"][encode_key] - torch.save(target_ckp, "{dir_to_save_file}/converted_vicuna.pt") - - if __name__ == "__main__": - convert_param() - ``` - ---- + 其中{dir_to_model}为下载模型权重所在的路径,转换后权重将保存在{target_dir}/converted_vicuna.pt。 + @@ -299,6 +224,26 @@ MindSpeeed-MM修改了部分原始网络的结构名称,因此需要使用如 ... } ``` +根据实际情况修改`model.json`中的权重路径为转换后权重,无需预训练权重则传入null。 +```json +{ +    ... +    "text_decoder": { +      ... +      "ckpt_path": "//converted_vicuna.pt" +    }, +    "image_encoder": { +      "vision_encoder":{ +        ... +        "ckpt_path": "//converted_clip.pt" +      }, +      "vision_projector":{ +        ... +        "ckpt_path": null +      } +    } +} +``` 【模型保存加载配置】 @@ -350,12 +295,12 @@ $save_dir ```shell # 根据实际情况修改 ascend-toolkit 路径 source /usr/local/Ascend/ascend-toolkit/set_env.sh - GPUS_PER_NODE=8 + NPUS_PER_NODE=8 MASTER_ADDR=locahost MASTER_PORT=29501 NNODES=1 NODE_RANK=0 - WORLD_SIZE=$(($GPUS_PER_NODE * $NNODES)) + WORLD_SIZE=$(($NPUS_PER_NODE * $NNODES)) ``` diff --git a/examples/llava1.5/clip_converter.py b/examples/llava1.5/clip_converter.py new file mode 100644 index 00000000..db5caaf8 --- /dev/null +++ b/examples/llava1.5/clip_converter.py @@ -0,0 +1,147 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +import argparse +import os + +import clip +import torch + + +def convert(download_root, output_path, tensor_parallel_size, use_te_layernorm_linear): + device = "cpu" + + model, _ = clip.load(download_root, device=device) + + state_dict = model.state_dict() + new_state_dicts = [{"model": dict()} for _ in range(tensor_parallel_size)] + + # Indices from mapping pytorch multihead attention to megatron. + kv_channels = 64 + hidden_dim = 1024 + num_heads = 16 + indices = [] + for i in range(num_heads): + lb = i * kv_channels + ub = (i + 1) * kv_channels + indices.append(torch.arange(lb, ub, dtype=torch.int)) + indices.append(torch.arange(hidden_dim + lb, hidden_dim + ub, dtype=torch.int)) + indices.append(torch.arange(2 * hidden_dim + lb, 2 * hidden_dim + ub, dtype=torch.int)) + + indices = torch.cat(indices) + + for name, tensor in state_dict.items(): + # Skip text model. + if "visual" not in name: + continue + + # Skip final layers not used in our model. + if name == "visual.proj" or "ln_post" in name: + continue + + # Map parameter names to ones used in megatron. + new_name = "" + new_tensor = tensor + if new_tensor.dtype == torch.float16: + new_tensor = new_tensor.to(torch.float32) + + # This is used for chunking some tensors to target tensor parallel size. + chunk_dim = None + + if "class_embedding" in name: + new_name = "class_token" + # Our model uses class token that is expanded to input dimensions already. 
+ new_tensor = new_tensor.expand(1, 1, -1) + elif "positional_embedding" in name: + new_name = "position_embeddings.weight" + elif "conv1" in name: + new_name = "conv1.weight" + elif "ln_pre.weight" in name: + new_name = "ln_pre.weight" + elif "ln_pre.bias" in name: + new_name = "ln_pre.bias" + elif "transformer.resblocks" in name: + layer_idx = name.split(".")[3] + base = f"decoder.layers.{layer_idx}" + + if "attn.in_proj_weight" in name: + new_name = f"{base}.self_attention.linear_qkv.weight" + new_tensor = new_tensor[indices] + chunk_dim = 0 + elif "attn.in_proj_bias" in name: + new_name = f"{base}.self_attention.linear_qkv.bias" + new_tensor = new_tensor[indices] + chunk_dim = 0 + elif "attn.out_proj.weight" in name: + new_name = f"{base}.self_attention.linear_proj.weight" + chunk_dim = 1 + elif "attn.out_proj.bias" in name: + new_name = f"{base}.self_attention.linear_proj.bias" + elif "ln_1.weight" in name: + new_name = f"{base}.input_layernorm.weight" + if use_te_layernorm_linear: + new_name = f"{base}.self_attention.linear_qkv.layer_norm_weight" + elif "ln_1.bias" in name: + new_name = f"{base}.input_layernorm.bias" + if use_te_layernorm_linear: + new_name = f"{base}.self_attention.linear_qkv.layer_norm_bias" + elif "mlp.c_fc.weight" in name: + new_name = f"{base}.mlp.linear_fc1.weight" + chunk_dim = 0 + elif "mlp.c_fc.bias" in name: + new_name = f"{base}.mlp.linear_fc1.bias" + chunk_dim = 0 + elif "mlp.c_proj.weight" in name: + new_name = f"{base}.mlp.linear_fc2.weight" + chunk_dim = 1 + elif "mlp.c_proj.bias" in name: + new_name = f"{base}.mlp.linear_fc2.bias" + elif "ln_2.weight" in name: + new_name = f"{base}.pre_mlp_layernorm.weight" + if use_te_layernorm_linear: + new_name = f"{base}.mlp.linear_fc1.layer_norm_weight" + elif "ln_2.bias" in name: + new_name = f"{base}.pre_mlp_layernorm.bias" + if use_te_layernorm_linear: + new_name = f"{base}.mlp.linear_fc1.layer_norm_bias" + + if new_name == "": + raise AssertionError(f"unexpected layer name {name}") + + if chunk_dim is None: + new_tensors = [new_tensor for _ in range(tensor_parallel_size)] + else: + new_tensors = torch.chunk(new_tensor, tensor_parallel_size, dim=chunk_dim) + + for i in range(tensor_parallel_size): + # chunk() creates a view of a bigger tensor. clone() is used here to avoid excessive storage. + new_state_dicts[i]["model"][new_name] = new_tensors[i].clone() + + output_path_pt = os.path.join(output_path, "converted_clip.pt") + torch.save(new_state_dicts[0]["model"], output_path_pt) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description=""" +Convert OpenAI CLIP VIT weights to megatron format. 
+ + +Example usage: +python examples/llava1.5/clip_converter.py \ + --download-root {dir_to_model}/ViT-L-14-336px.pt \ + --output {target_dir} +""", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + + parser.add_argument( + "--download-root", type=str, required=True, help="Download folder for OpenAI CLIP weights", + ) + parser.add_argument( + "--output", type=str, required=True, help="output directory for megatron state dict file(s)" + ) + + args = parser.parse_args() + + convert(args.download_root, args.output, 1, False) + + print("all weights have been converted.") \ No newline at end of file diff --git a/examples/llava1.5/evaluate_llava1_5.sh b/examples/llava1.5/evaluate_llava1_5.sh index c85479d7..aa5ea277 100644 --- a/examples/llava1.5/evaluate_llava1_5.sh +++ b/examples/llava1.5/evaluate_llava1_5.sh @@ -14,7 +14,7 @@ MASTER_ADDR=localhost MASTER_PORT=29501 NNODES=1 NODE_RANK=0 -WORLD_SIZE=$((NPUS_PER_NODE*$NNODES)) +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) TP=1 PP=1 diff --git a/examples/llava1.5/inference_llava1_5.sh b/examples/llava1.5/inference_llava1_5.sh index e225e4a5..e8d183d9 100644 --- a/examples/llava1.5/inference_llava1_5.sh +++ b/examples/llava1.5/inference_llava1_5.sh @@ -8,12 +8,12 @@ export COMBINED_ENABLE=1 export CPU_AFFINITY_CONF=1 export HCCL_CONNECT_TIMEOUT=1200 -GPUS_PER_NODE=1 +NPUS_PER_NODE=1 MASTER_ADDR=localhost MASTER_PORT=29501 NNODES=1 NODE_RANK=0 -WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) TP=1 PP=1 @@ -30,7 +30,7 @@ MM_MODEL="examples/llava1.5/inference_llava.json" DISTRIBUTED_ARGS=" - --nproc_per_node $GPUS_PER_NODE \ + --nproc_per_node $NPUS_PER_NODE \ --nnodes $NNODES \ --node_rank $NODE_RANK \ --master_addr $MASTER_ADDR \ diff --git a/examples/llava1.5/pretrain_llava1_5.sh b/examples/llava1.5/pretrain_llava1_5.sh index fa5679f0..58276e71 100644 --- a/examples/llava1.5/pretrain_llava1_5.sh +++ b/examples/llava1.5/pretrain_llava1_5.sh @@ -8,17 +8,17 @@ export COMBINED_ENABLE=1 export CPU_AFFINITY_CONF=1 export HCCL_CONNECT_TIMEOUT=1200 -GPUS_PER_NODE=8 +NPUS_PER_NODE=8 MASTER_ADDR=localhost MASTER_PORT=29501 NNODES=1 NODE_RANK=0 -WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) TP=1 PP=1 CP=1 -MBS=1 +MBS=8 GBS=$(($WORLD_SIZE*$MBS/$CP)) MM_DATA="./examples/llava1.5/data.json" @@ -28,7 +28,7 @@ LOAD_PATH="save_dir" SAVE_PATH="save_dir" DISTRIBUTED_ARGS=" - --nproc_per_node $GPUS_PER_NODE \ + --nproc_per_node $NPUS_PER_NODE \ --nnodes $NNODES \ --node_rank $NODE_RANK \ --master_addr $MASTER_ADDR \ @@ -91,5 +91,5 @@ torchrun $DISTRIBUTED_ARGS \ $GPT_ARGS \ $MM_ARGS \ $OUTPUT_ARGS \ - --distributed-backend nccl >> logs/train_${logfile}.log 2>&1 + --distributed-backend nccl | tee logs/train_${logfile}.log 2>&1 chmod 440 logs/train_${logfile}.log \ No newline at end of file diff --git a/examples/llava1.5/vicuna_converter.py b/examples/llava1.5/vicuna_converter.py new file mode 100644 index 00000000..ab95b13e --- /dev/null +++ b/examples/llava1.5/vicuna_converter.py @@ -0,0 +1,135 @@ +import argparse +import os + +import torch +from transformers import AutoModelForCausalLM, AutoConfig + + +def load_from_hf(load_dir, trust_remote_code): + # Load Huggingface model. 
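# A quick round-trip sketch (toy shapes, illustrative only) of the gate/up
# handling in this patch: convert_hg_to_mm further down in this file stacks the
# HuggingFace gate_proj and up_proj into one Megatron-style linear_fc1, and the
# mm-to-hf tooling earlier in the patch splits that fused weight back in half.
import torch

ffn_hidden, hidden = 6, 4  # toy sizes, not vicuna-7b dimensions
gate_proj = torch.randn(ffn_hidden, hidden)
up_proj = torch.randn(ffn_hidden, hidden)

linear_fc1 = torch.cat([gate_proj, up_proj], dim=0)                           # HF -> MindSpeed-MM
gate_back, up_back = torch.split(linear_fc1, linear_fc1.size(0) // 2, dim=0)  # MindSpeed-MM -> HF

assert torch.equal(gate_back, gate_proj) and torch.equal(up_back, up_proj)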
+ hf_model = AutoModelForCausalLM.from_pretrained(load_dir, device_map='cpu', trust_remote_code=trust_remote_code, + torch_dtype=torch.bfloat16, local_files_only=True) + print(hf_model) + config = AutoConfig.from_pretrained(load_dir, trust_remote_code=trust_remote_code) + + return hf_model, config + + +def merge_qkv(wq, wk, wv, ng=32): + hq, h = wq.shape + hkv = wk.shape[0] + dq = hq // ng + dkv = hkv // ng + d = dq + 2 * dkv + qkv = torch.zeros([hq + hkv * 2, h], dtype=wq.dtype) + for j in range(ng): + qkv[j * d : j * d + dq, :] = wq[j * dq : (j + 1) * dq, :] + qkv[j * d + dq : j * d + dq + dkv, :] = wk[j * dkv : (j + 1) * dkv, :] + qkv[j * d + dq + dkv : j * d + dq + dkv * 2, :] = wv[j * dkv : (j + 1) * dkv, :] + + return qkv + + +def convert_hg_to_mm(_state_dict, _model_config): + _num_layers = _model_config.num_hidden_layers + _num_query_groups = _model_config.num_key_value_heads + new_dict = {} + for key, value in _state_dict.items(): + new_key = key + # 权重映射 + new_key = new_key.replace('model.embed_tokens', 'embedding.word_embeddings') + new_key = new_key.replace('model.layers', 'decoder.layers') + new_key = new_key.replace('self_attn.q_proj', 'self_attention.wq') + new_key = new_key.replace('self_attn.k_proj', 'self_attention.wk') + new_key = new_key.replace('self_attn.v_proj', 'self_attention.wv') + new_key = new_key.replace('self_attn.o_proj', 'self_attention.linear_proj') + new_key = new_key.replace('gate_proj', 'linear_fc1_gate') + new_key = new_key.replace('up_proj', 'linear_fc1_up') + new_key = new_key.replace('down_proj', 'linear_fc2') + new_key = new_key.replace('post_attention_layernorm', 'pre_mlp_layernorm') + new_key = new_key.replace('model.norm', 'decoder.final_layernorm') + new_key = new_key.replace('lm_head', 'output_layer') + + # 打印映射过程 + print(f'mapping {key} to {new_key}') + new_dict[new_key] = value + + # qkv权重交织合并 + for i in range(_num_layers): + q_name = f'decoder.layers.{i}.self_attention.wq.weight' + k_name = f'decoder.layers.{i}.self_attention.wk.weight' + v_name = f'decoder.layers.{i}.self_attention.wv.weight' + qkv_name = f'decoder.layers.{i}.self_attention.linear_qkv.weight' + + if q_name in new_dict.keys(): + wq = new_dict[q_name] + else: + raise AssertionError(f'Missing key {q_name}') + if k_name in new_dict.keys(): + wk = new_dict[k_name] + else: + raise AssertionError(f'Missing key {k_name}') + if v_name in new_dict.keys(): + wv = new_dict[v_name] + else: + raise AssertionError(f'Missing key {v_name}') + wqkv = merge_qkv(wq, wk, wv, _num_query_groups) + new_dict[qkv_name] = wqkv + new_dict.pop(q_name) + new_dict.pop(k_name) + new_dict.pop(v_name) + + print(f'merge {q_name}, {k_name}, {v_name} to {qkv_name}') + + # 合并mlp的gate和up权重 + for i in range(_num_layers): + gate_name = f'decoder.layers.{i}.mlp.linear_fc1_gate.weight' + up_name = f'decoder.layers.{i}.mlp.linear_fc1_up.weight' + fc1_name = f'decoder.layers.{i}.mlp.linear_fc1.weight' + + # 合并 w1 和 w3 + if gate_name in new_dict.keys(): + gate_proj_weight = new_dict[gate_name] + if up_name in new_dict.keys(): + up_proj_weight = new_dict[up_name] + linear_fc1 = torch.cat([gate_proj_weight, up_proj_weight], dim=0) + new_dict[fc1_name] = linear_fc1 + + # 移除合并前的权重 + new_dict.pop(gate_name) + new_dict.pop(up_name) + + print(f'merge {gate_name} and {up_name} to {fc1_name}') + + return new_dict + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Megatron Checkpoint Utility Arguments', + allow_abbrev=False, + conflict_handler='resolve') + parser.add_argument('--load-dir', type=str, 
required=True, + help='HuggingFace weight path for loading') + parser.add_argument('--save-dir', type=str, required=True, + help='MindSpeed-MM weight path for saving') + parser.add_argument('--trust-remote-code', type=str, required=True, default=False, + help='Whether or not to allow HuggingFace API to execute code') + args, unrecognized_args = parser.parse_known_args() + if unrecognized_args: + print(f"Unrecognized Args: {unrecognized_args}") + + hf_model, model_config = load_from_hf(args.load_dir, args.trust_remote_code) + state_dict = hf_model.state_dict() + print(50 * '*') + print('origin state_dict:') + for key, value in state_dict.items(): + print(key, value.shape) + print(50 * '*') + new_state_dict = convert_hg_to_mm(state_dict, model_config) + print('new state_dict:') + for key, value in new_state_dict.items(): + print(key, value.shape) + print(50 * '*') + output_path = os.path.join(args.save_dir, 'converted_vicuna.pt') + torch.save(new_state_dict, output_path) + print('all weights have been converted.') \ No newline at end of file diff --git a/examples/qwen2vl/README.md b/examples/qwen2vl/README.md index 954ee17b..0994bb20 100644 --- a/examples/qwen2vl/README.md +++ b/examples/qwen2vl/README.md @@ -49,7 +49,7 @@ #### 1. 仓库拉取 ```shell -git clone https://gitee.com/ascend/MindSpeed-MM.git +git clone https://gitee.com/ascend/MindSpeed-MM.git git clone https://github.com/NVIDIA/Megatron-LM.git cd Megatron-LM git checkout core_r0.6.0 @@ -61,8 +61,6 @@ mkdir data mkdir ckpt ``` - - #### 2. 环境搭建 torch npu 与 CANN包参考链接:[安装包参考链接](https://support.huawei.com/enterprise/zh/ascend-computing/cann-pid-251168373/software) @@ -74,7 +72,7 @@ conda activate test # 安装 torch 和 torch_npu,注意要选择对应python版本、x86或arm的torch、torch_npu及apex包 # 下载路径参考 https://www.hiascend.com/document/detail/zh/Pytorch/60RC3/configandinstg/instg/insg_0001.html -pip install torch-2.1.0-cp310-cp310m-manylinux2014_aarch64.whl +pip install torch-2.1.0-cp310-cp310m-manylinux2014_aarch64.whl pip install torch_npu-2.1.0*-cp310-cp310m-linux_aarch64.whl # apex for Ascend 参考 https://gitee.com/ascend/apex @@ -85,7 +83,7 @@ git clone https://gitee.com/ascend/MindSpeed.git cd MindSpeed # checkout commit from MindSpeed core_r0.6.0 git checkout ab39de78be23e88e2c8b0d25edf6135940990c02 -pip install -r requirements.txt +pip install -r requirements.txt pip3 install -e . cd .. 
# 替换MindSpeed中的文件 @@ -119,53 +117,49 @@ MindSpeed-MM修改了部分原始网络的结构名称,使用examples/qwen2vl/ 修改qwen2vl_convert_to_mm_ckpt.py中的如下内容,与实际保持一致: ```python -hg_ckpt_dir = 'ckpt/hf_path/Qwen2-VL-72B-Instruct' # huggingface权重目录 +hf_ckpt_dir = 'ckpt/hf_path/Qwen2-VL-72B-Instruct' # huggingface权重目录 mm_save_dir = 'ckpt/mm_path/Qwen2-VL-72B-Instruct' # 转换后保存目录 -pp_size = 16 # 切分的PPstage数量,注意要和finetune脚本中配置的PP一致 +model_size = "72B" # 根据需要转换的模型,指定配置( 2B 7B 72B ) +#model parameters +model_config = MODEL_CONFIG_DICT[model_size] -llm_num_layers = 80 #LLM的总层数 +#PP parameters: 72B +pp_size = 16 # 切分的PPstage数量,注意要和finetune脚本中配置的PP一致 llm_pipeline_num_layers = [4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 4] # LLM在每个卡上切分的层数,和为 llm_num_layers,注意要和model.json中配置的pipeline_num_layers一致 - -vit_num_layers = 32 # vit的总层数 vit_pipeline_num_layers = [32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] # vit在每个卡上切分的层数,和为 vit_num_layers,注意要和model.json中配置的pipeline_num_layers一致 -vit_hidden_size = 1280 # vit的隐藏层size -vit_attention_heads_num = 16 # vit的注意力heads数 ``` 以Qwen2VL-7B为例 修改qwen2vl_convert_to_mm_ckpt.py中的如下内容,与实际保持一致: ```python -hg_ckpt_dir = 'ckpt/hf_path/Qwen2-VL-7B-Instruct' # huggingface权重目录 -mm_save_dir = 'ckpt/mm_path/Qwen2-VL-7B-Instruct' # 转换后保存目录 -pp_size = 4 # 切分的PPstage数量 - -llm_num_layers = 28 #LLM的总层数 -llm_pipeline_num_layers = [1, 6, 11, 10] # LLM在每个卡上切分的层数,和为 llm_num_layers +hf_ckpt_dir = "ckpt/hf_path/Qwen2-VL-7B-Instruct" #hf原始的权重保存路径 +mm_save_dir = 'ckpt/mm_path/Qwen2-VL-7B-Instruct' #转换后的权重保存路径 +model_size = "7B" # 根据需要转换的模型,指定配置( 2B 7B 72B ) +#model parameters +model_config = MODEL_CONFIG_DICT[model_size] -vit_num_layers = 32 # vit的总层数 -vit_pipeline_num_layers = [32, 0, 0, 0] # vit在每个卡上切分的层数,和为 vit_num_layers - -vit_hidden_size = 1280 # vit的隐藏层size -vit_attention_heads_num = 16 # vit的注意力heads数 +#PP parameters: 7B +pp_size = 4 +vit_pipeline_num_layers = [32, 0, 0, 0] # LLM在每个卡上切分的层数,和为llm_num_layers,注意要和model.json中配置的pipeline_num_layers一致 +llm_pipeline_num_layers = [1, 6, 11, 10] # vit在每个卡上切分的层数,和为vit_num_layers,注意要和model.json中配置的pipeline_num_layers一致 ``` 以Qwen2VL-2B为例 修改qwen2vl_convert_to_mm_ckpt.py中的如下内容,与实际保持一致: ```python -hg_ckpt_dir = 'ckpt/hf_path/Qwen2-VL-2B-Instruct' # huggingface权重目录 +hf_ckpt_dir = 'ckpt/hf_path/Qwen2-VL-2B-Instruct' # huggingface权重目录 mm_save_dir = 'ckpt/mm_path/Qwen2-VL-2B-Instruct' # 转换后保存目录 -pp_size = 1 # 2B不需要切分PP - -llm_num_layers = 28 #LLM的总层数 -llm_pipeline_num_layers = [28] # LLM在每个卡上切分的层数,和为llm_num_layers +model_size = "2B" # 根据需要转换的模型,指定配置( 2B 7B 72B ) +#model parameters +model_config = MODEL_CONFIG_DICT[model_size] -vit_num_layers = 32 # vit的总层数 -vit_pipeline_num_layers = [32] # vit在每个卡上切分的层数,和为vit_num_layers +#PP parameters: 2B +pp_size = 1 # 2B不需要切分PP +llm_pipeline_num_layers = [28] # LLM在每个卡上切分的层数,和为llm_num_layers,注意要和model.json中配置的pipeline_num_layers一致 +vit_pipeline_num_layers = [32] # vit在每个卡上切分的层数,和为vit_num_layers,注意要和model.json中配置的pipeline_num_layers一致 -vit_hidden_size = 1280 # vit的隐藏层size -vit_attention_heads_num = 16 # vit的注意力heads数 ``` 启动脚本 @@ -199,7 +193,7 @@ LOAD_PATH="ckpt/Qwen2-VL-7B-Instruct" ├── data ├── COCO2017 ├── train2017 - + ├── llava_instruct_150k.json ├── mllm_format_llava_instruct_data.json ... 
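The pp_size, vit_pipeline_num_layers and llm_pipeline_num_layers settings shown above must line up with the pipeline configuration in model.json: one entry per pipeline stage, with the entries summing to the full ViT and LLM depth. A minimal sketch of that bookkeeping, assuming the 7B split from this README, mirrors what check_pp_config and merge_pp_index enforce:

```python
# Sketch of the per-stage layer plan for the assumed 7B split.
vit_num_layers, llm_num_layers, pp_size = 32, 28, 4
vit_pipeline_num_layers = [32, 0, 0, 0]   # ViT layers held by each pipeline stage
llm_pipeline_num_layers = [1, 6, 11, 10]  # LLM layers held by each pipeline stage

assert len(vit_pipeline_num_layers) == pp_size and len(llm_pipeline_num_layers) == pp_size
assert sum(vit_pipeline_num_layers) == vit_num_layers
assert sum(llm_pipeline_num_layers) == llm_num_layers

for rank, (vit_n, llm_n) in enumerate(zip(vit_pipeline_num_layers, llm_pipeline_num_layers)):
    print(f"pp rank {rank}: {vit_n} ViT layers + {llm_n} LLM layers")
```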
@@ -212,6 +206,32 @@ dataset_param->basic_parameters->dataset 同时注意`data.json`中`dataset_param->basic_parameters->max_samples`的配置,会限制数据只读`max_samples`条,这样可以快速验证功能。如果正式训练时,可以把该参数去掉则读取全部的数据。 +#### 2.纯文本或有图无图混合训练数据(以LLaVA-Instruct-150K为例): +现在本框架已经支持纯文本/混合数据(有图像和无图像数据混合训练)。 + +在数据构造时,对于包含图片的数据,需要保留`image`这个键值。 +```python +{ + "id": your_id, + "image": your_image_path, + "conversations": [ + {"from": "human", "value": your_query}, + {"from": "gpt", "value": your_response}, + ], +} +``` + +在数据构造时,对于纯文本数据,可以去除`image`这个键值。 +```python +{ + "id": your_id, + "conversations": [ + {"from": "human", "value": your_query}, + {"from": "gpt", "value": your_response}, + ], +} +``` + ## 微调 @@ -262,7 +282,7 @@ dataset_param->basic_parameters->dataset ```shell ... # 加载路径 -LOAD_PATH="ckpt/Qwen2-VL-7B-Instruct" +LOAD_PATH="ckpt/mm_path/Qwen2-VL-7B-Instruct" # 保存路径 SAVE_PATH="save_dir" ... @@ -297,13 +317,13 @@ $save_dir ```shell # 根据实际情况修改 ascend-toolkit 路径 -source /usr/local/Ascend/ascend-toolkit/set_env.sh -GPUS_PER_NODE=8 +source /usr/local/Ascend/ascend-toolkit/set_env.sh +NPUS_PER_NODE=8 MASTER_ADDR=locahost MASTER_PORT=29501 NNODES=1 NODE_RANK=0 -WORLD_SIZE=$(($GPUS_PER_NODE * $NNODES)) +WORLD_SIZE=$(($NPUS_PER_NODE * $NNODES)) ``` @@ -313,20 +333,10 @@ WORLD_SIZE=$(($GPUS_PER_NODE * $NNODES)) 以Qwen2VL-7B为例,启动微调训练任务。 ```shell -bash examples/qawen2vl/finetune_qwen2vl_7b.sh +bash examples/qwen2vl/finetune_qwen2vl_7b.sh ``` - -## LoRA - -LoRA为框架通用能力,如需在基线脚本上增加LoRA能力请参考LoRA特性文档[docs/features/lora_finetune.md](https://gitee.com/ascend/MindSpeed-MM/blob/master/docs/features/lora_finetune.md) - - - - - - ## 推理 #### 1、准备工作(以微调环境为基础,包括环境安装、权重下载及转换-目前支持PP切分的推理) @@ -369,19 +379,54 @@ model_path = "Qwen2-VL-7B-Instruct" # hf原仓目录 修改qwen2vl_convert_to_hf.py中的如下内容,与qwen2vl_convert_to_mm_ckpt.py保持一致: ```python pp_size = 4 -vit_num_layers = 32 vit_pipeline_num_layers = [32, 0, 0, 0] -llm_num_layers = 28 llm_pipeline_num_layers = [1, 6, 11, 10] +``` +在qwen2vl_convert_to_hf.py中根据模型选择模型配置 +```python +model_size = "7B" # 根据需要转换的模型,指定配置( 2B 7B 72B ) +#model parameters +model_config = MODEL_CONFIG_DICT[model_size] +``` -vit_hidden_size = 1280 -vit_attention_heads_num = 16 +#### 3.执行转换脚本 +```bash +python examples/qwen2vl/qwen2vl_convert_to_hf.py ``` +## 训练后重新切分权重(pp切分) + +权重下载及转换部分会把权重进行pp切分,在微调后,如果需要对权重重新进行pp切分,可使用examples/qwen2vl/qwen2vl_convert_pp_to_pp.py脚本对微调后的权重进行切分 + +#### 1.修改路径 +修改qwen2vl_convert_pp_to_pp.py中的如下内容,与实际保持一致: +```python +mm_save_dir = "save_dir" # 微调后保存的权重目录 +new_save_dir = "new_pp_save_dir" # 希望重新pp切分后保存的目录 +``` + +#### 2.修改配置 +修改qwen2vl_convert_to_hf.py中的如下内容,与qwen2vl_convert_to_mm_ckpt.py保持一致: +```python +vit_num_layers = 32 +llm_num_layers = 28 +``` + +```python +old_pp_size = 4 +old_vit_pipeline_num_layers = [32, 0, 0, 0] +old_llm_pipeline_num_layers = [1, 6, 11, 10] +``` +修改qwen2vl_convert_to_hf.py中的如下内容,使之与期望的切分配置一致 +```python +new_pp_size = 2 +new_vit_pipeline_num_layers = [32, 0] +new_llm_pipeline_num_layers = [14, 14] +``` #### 3.执行转换脚本 ```bash -python examples/qwen2vl/qwen2vl_convert_to_hf.py +python examples/qwen2vl/qwen2vl_convert_pp_to_pp.py ``` @@ -446,4 +491,4 @@ bash examples/qwen2vl/evaluate_qwen2vl_7b.sh ## 注意事项 1. 在使用流水线并行策略进行多机训练可能会出现卡住现象,可参考[此处](https://gitee.com/ascend/MindSpeed/pulls/1627/files)修改。 2. 在 `finetune_xx.sh`里,与模型结构相关的参数并不生效,以`examples/qwen2vl/model_xb.json`里同名参数配置为准,非模型结构的训练相关参数在 `finetune_xx.sh`修改。 - +3. 
LoRA为框架通用能力,当前功能已支持,可参考[LoRA特性文档](https://gitee.com/ascend/MindSpeed-MM/blob/master/docs/features/lora_finetune.md)。 diff --git a/examples/qwen2vl/data_2b.json b/examples/qwen2vl/data_2b.json index cb0efe1f..0bd5a7ec 100644 --- a/examples/qwen2vl/data_2b.json +++ b/examples/qwen2vl/data_2b.json @@ -43,6 +43,8 @@ "collate_param": { "model_name": "qwen2vl", "ignore_pad_token_for_loss": true - } + }, + "pin_memory": true, + "num_workers": 8 } } \ No newline at end of file diff --git a/examples/qwen2vl/data_72b.json b/examples/qwen2vl/data_72b.json index 23020cb8..5be260f1 100644 --- a/examples/qwen2vl/data_72b.json +++ b/examples/qwen2vl/data_72b.json @@ -43,6 +43,8 @@ "collate_param": { "model_name": "qwen2vl", "ignore_pad_token_for_loss": true - } + }, + "pin_memory": true, + "num_workers": 8 } } \ No newline at end of file diff --git a/examples/qwen2vl/data_7b.json b/examples/qwen2vl/data_7b.json index 09cbfd5c..bdf383f6 100644 --- a/examples/qwen2vl/data_7b.json +++ b/examples/qwen2vl/data_7b.json @@ -25,7 +25,7 @@ }, "attr": { "system": null, - "images": null, + "images": "images", "videos": null, "messages": "messages", "role_tag": "role", @@ -43,6 +43,8 @@ "collate_param": { "model_name": "qwen2vl", "ignore_pad_token_for_loss": true - } + }, + "pin_memory": true, + "num_workers": 8 } } \ No newline at end of file diff --git a/examples/qwen2vl/dot_product_attention.py b/examples/qwen2vl/dot_product_attention.py index e6dc939b..b6e9d335 100644 --- a/examples/qwen2vl/dot_product_attention.py +++ b/examples/qwen2vl/dot_product_attention.py @@ -129,8 +129,8 @@ def dot_product_attention_forward( query, key, value, indices_q, cu_seq_lens, max_seq_lens = _unpad_input( query, key, value, attention_mask, seq_length ) - attention_mask_npu = torch.from_numpy( - np.triu(np.ones([max_seq_lens, max_seq_lens]), k=1)).bool().to(torch.cuda.current_device()) + attention_mask_npu = torch.triu( + torch.ones([max_seq_lens, max_seq_lens], dtype=torch.bool, device=query.device), diagonal=1) attn_output_unpad = torch_npu.npu_fusion_attention( query, key, value, n_head, pse=None, @@ -149,8 +149,8 @@ def dot_product_attention_forward( query = query.transpose(0, 1).contiguous() key = key.transpose(0, 1).contiguous() value = value.transpose(0, 1).contiguous() - attention_mask_npu = torch.from_numpy( - np.triu(np.ones([query.shape[1], key.shape[1]]), k=1)).bool().to(torch.cuda.current_device()) + attention_mask_npu = torch.triu( + torch.ones([query.shape[1], key.shape[1]], dtype=torch.bool, device=query.device), diagonal=1) attn_output = torch_npu.npu_fusion_attention( query, key, value, n_head, 'BSND', keep_prob=1.0, @@ -158,7 +158,7 @@ def dot_product_attention_forward( atten_mask=attention_mask_npu)[0] attn_output = rearrange(attn_output, 'b s h d -> s b (h d)', s=seq_length, b=bsz) return attn_output - + def _unpad_input( query_state: torch.Tensor, diff --git a/examples/qwen2vl/evaluate_qwen2vl_7b.sh b/examples/qwen2vl/evaluate_qwen2vl_7b.sh index fde5aa1c..d05ad470 100644 --- a/examples/qwen2vl/evaluate_qwen2vl_7b.sh +++ b/examples/qwen2vl/evaluate_qwen2vl_7b.sh @@ -13,7 +13,6 @@ export HCCL_CONNECT_TIMEOUT=7200 export HCCL_EXEC_TIMEOUT=7200 export NPU_ASD_ENABLE=0 export ASCEND_LAUNCH_BLOCKING=0 -export HOST_CACHE_CAPACITY=20 export ACLNN_CACHE_LIMIT=100000 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True export TOKENIZERS_PARALLELISM=false diff --git a/examples/qwen2vl/finetune_qwen2vl_2b.sh b/examples/qwen2vl/finetune_qwen2vl_2b.sh index 0b561960..dc425f85 100644 --- 
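The dot_product_attention.py hunk above replaces the NumPy-built upper-triangular mask with torch.triu created directly on the query's device, avoiding a host-side detour. A small CPU-only check (assuming only torch and numpy are available) that the two constructions yield the same boolean causal mask:

```python
import numpy as np
import torch

seq_len = 8
old_mask = torch.from_numpy(np.triu(np.ones([seq_len, seq_len]), k=1)).bool()
new_mask = torch.triu(torch.ones([seq_len, seq_len], dtype=torch.bool), diagonal=1)
assert torch.equal(old_mask, new_mask)
```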
a/examples/qwen2vl/finetune_qwen2vl_2b.sh +++ b/examples/qwen2vl/finetune_qwen2vl_2b.sh @@ -10,7 +10,6 @@ export CPU_AFFINITY_CONF=1 export HCCL_CONNECT_TIMEOUT=1200 export NPU_ASD_ENABLE=0 export ASCEND_LAUNCH_BLOCKING=0 -export HOST_CACHE_CAPACITY=20 export ACLNN_CACHE_LIMIT=100000 NPUS_PER_NODE=8 @@ -24,7 +23,7 @@ WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) MM_DATA="./examples/qwen2vl/data_2b.json" MM_MODEL="./examples/qwen2vl/model_2b.json" MM_TOOL="./mindspeed_mm/tools/tools.json" -LOAD_PATH="ckpt/Qwen2-VL-7B-Instruct" +LOAD_PATH="ckpt/mm_path/Qwen2-VL-2B-Instruct" SAVE_PATH="save_dir" TP=1 diff --git a/examples/qwen2vl/finetune_qwen2vl_72b.sh b/examples/qwen2vl/finetune_qwen2vl_72b.sh index 99f7c759..ecbf1fe4 100644 --- a/examples/qwen2vl/finetune_qwen2vl_72b.sh +++ b/examples/qwen2vl/finetune_qwen2vl_72b.sh @@ -10,7 +10,6 @@ export CPU_AFFINITY_CONF=2 export HCCL_CONNECT_TIMEOUT=1200 export NPU_ASD_ENABLE=0 export ASCEND_LAUNCH_BLOCKING=0 -export HOST_CACHE_CAPACITY=20 export ACLNN_CACHE_LIMIT=100000 export MULTI_STREAM_MEMORY_REUSE=2 export PYTORCH_NPU_ALLOC_CONF="expandable_segments:True" @@ -34,7 +33,7 @@ MM_DATA="./examples/qwen2vl/data_72b.json" MM_MODEL="./examples/qwen2vl/model_72b.json" MM_TOOL="./mindspeed_mm/tools/tools.json" # 需要先根据readme把huggingface格式模型转换为mm格式 -LOAD_PATH="ckpt/Qwen2-VL-72B-Instruct" +LOAD_PATH="ckpt/mm_path/Qwen2-VL-72B-Instruct" SAVE_PATH="save_dir" TP=1 diff --git a/examples/qwen2vl/finetune_qwen2vl_7b.sh b/examples/qwen2vl/finetune_qwen2vl_7b.sh index a271be79..5435a79e 100644 --- a/examples/qwen2vl/finetune_qwen2vl_7b.sh +++ b/examples/qwen2vl/finetune_qwen2vl_7b.sh @@ -10,7 +10,6 @@ export CPU_AFFINITY_CONF=1 export HCCL_CONNECT_TIMEOUT=1200 export NPU_ASD_ENABLE=0 export ASCEND_LAUNCH_BLOCKING=0 -export HOST_CACHE_CAPACITY=20 export ACLNN_CACHE_LIMIT=100000 NPUS_PER_NODE=8 @@ -24,10 +23,8 @@ WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) MM_DATA="./examples/qwen2vl/data_7b.json" MM_MODEL="./examples/qwen2vl/model_7b.json" MM_TOOL="./mindspeed_mm/tools/tools.json" -LOAD_PATH="./ckpt/mm_path/Qwen2-VL-7B-Instruct" -# timestamp=$(date +"%Y%m%d_%H%M%S") -# SAVE_PATH=$("save_dir/" + $timestamp) -SAVE_PATH="save_dir/" +LOAD_PATH="ckpt/mm_path/Qwen2-VL-7B-Instruct" +SAVE_PATH="save_dir" TP=1 PP=4 @@ -70,7 +67,7 @@ GPT_ARGS=" --lr 1.0e-5 \ --lr-decay-style cosine \ --weight-decay 0 \ - --train-iters 100 \ + --train-iters 10000 \ --lr-warmup-fraction 0.1 \ --clip-grad 0.0 \ --adam-beta1 0.9 \ @@ -97,9 +94,9 @@ MM_ARGS=" OUTPUT_ARGS=" --log-interval 1 \ - --save-interval 100 \ - --eval-interval 100 \ - --eval-iters 100 \ + --save-interval 10000 \ + --eval-interval 10000 \ + --eval-iters 5000 \ --save $SAVE_PATH \ " logfile=$(date +%Y%m%d)_$(date +%H%M%S) diff --git a/examples/qwen2vl/inference_qwen2vl_2b.json b/examples/qwen2vl/inference_qwen2vl_2b.json new file mode 100644 index 00000000..2c239680 --- /dev/null +++ b/examples/qwen2vl/inference_qwen2vl_2b.json @@ -0,0 +1,159 @@ +{ + "pipeline_class": "Qwen2VlPipeline", + "img_context_token_id": 151655, + "image_encoder": { + "vision_encoder": { + "model_id": "qwen2vit", + "num_layers": 32, + "hidden_size": 1280, + "ffn_hidden_size": 5120, + "llm_hidden_size": 1536, + "num_attention_heads": 16, + "hidden_dropout": 0.0, + "attention_dropout": 0.0, + "in_channels": 3, + "patch_size": 14, + "spatial_merge_size": 2, + "temporal_patch_size": 2, + "layernorm_epsilon": 1e-06, + "normalization": "LayerNorm", + "fp16": false, + "bf16": true, + "params_dtype": "bf16", + "activation_func": "quick_gelu", + "freeze": true, + 
"use_fused_rotary_pos_emb": true, + "post_layer_norm": false, + "pipeline_num_layers": [32] + }, + "vision_projector": { + "model_id": "lnmlp", + "num_layers": 1, + "num_attention_heads": 1, + "gated_linear_unit": false, + "bias_activation_fusion": false, + "add_bias_linear": true, + "input_size": 1280, + "hidden_size": 1536, + "ffn_hidden_size": 5120, + "activation_func": "gelu", + "bf16": true, + "params_dtype": "bf16", + "freeze": true + } + }, + "text_decoder": { + "model_id": "qwen2lm", + "num_layers": 28, + "pipeline_num_layers": [28], + "hidden_size": 1536, + "ffn_hidden_size": 8960, + "num_attention_heads": 12, + "seq_length": 1024, + "max_position_embeddings": 32768, + "vocab_size": 151936, + "rope_theta": 1000000.0, + "untie_embeddings_and_output_weights": false, + "disable_bias_linear": true, + "attention_dropout": 0.0, + "init_method_std": 0.01, + "hidden_dropout": 0.0, + "position_embedding_type": "mrope", + "normalization": "RMSNorm", + "activation_func": "silu", + "use_fused_rotary_pos_emb": true, + "attention_softmax_in_fp32": true, + "params_dtype": "bf16", + "bf16": true, + "parallel_output": true, + "group_query_attention": true, + "num_query_groups": 2, + "mrope_section": [16, 24, 24], + "rope_scaling": null, + "gated_linear_unit": true, + "layernorm_epsilon": 1e-06, + "add_bias_linear":false, + "add_qkv_bias": true, + "sequence_parallel": false, + "tokenizer_type": "PretrainedFromHF", + "is_encoder_decoder": false + }, + "text_encoder": null, + "video_encoder": null, + "dtype": "bf16", + "device": "npu", + "tokenizer": { + "hub_backend": "hf", + "autotokenizer_name": "AutoTokenizer", + "from_pretrained": "ckpt/hf_path/Qwen2-VL-2B-Instruct", + "local_files_only":false + }, + "generation_config": { + "bos_token_id": 151643, + "do_sample": true, + "output_attentions": false, + "output_hidden_states": false, + "max_length": 20, + "min_length": 0, + "min_new_tokens": null, + "constraints": null, + "prompt_lookup_num_tokens": null, + "guidance_scale": null, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "diversity_penalty": 0.0, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "encoder_repetition_penalty": 1.0, + "epsilon_cutoff": 0.0, + "eta_cutoff": 0.0, + "exponential_decay_length_penalty": null, + "forced_bos_token_id": null, + "forced_decoder_ids": null, + "forced_eos_token_id": null, + "length_penalty": 1.0, + "low_memory": null, + "max_time": null, + "no_repeat_ngram_size": 0, + "num_assistant_tokens": 5, + "num_assistant_tokens_schedule": "heuristic", + "num_beam_groups": 1, + "num_return_groups": 1, + "num_return_sequences": 1, + "output_scores": false, + "output_logits": null, + "penalty_alpha": null, + "remove_invalid_values": false, + "repetition_penalty": 1.0, + "return_dict_in_generate": false, + "sequence_bias": null, + "suppress_tokens": null, + "typical_p": 1.0, + "force_words_ids": null, + "num_beams": 1, + "renormalize_logits": false, + "use_cache": true, + "eos_token_id": [ + 151645, + 151643 + ], + "max_new_tokens": 256, + "pad_token_id": 151643, + "temperature": 0.01, + "top_k": 1, + "top_p": 0.001, + "dola_layers": null, + "cache_implementation": null, + "cache_config": null, + "return_legacy_cache": null, + "min_p": null, + "token_healing": false, + "watermarking_config": null, + "decoder_start_token_id": null, + "max_matching_ngram_size": null, + "stop_strings": null + }, + "image_processer_path": "ckpt/hf_path/Qwen2-VL-2B-Instruct/preprocessor_config.json", + "image_path": "examples/qwen2vl/demo.jpeg", + "prompts": 
"Describe this image and keep it within 100 words." +} diff --git a/examples/qwen2vl/inference_qwen2vl_2b.sh b/examples/qwen2vl/inference_qwen2vl_2b.sh new file mode 100644 index 00000000..31b90a02 --- /dev/null +++ b/examples/qwen2vl/inference_qwen2vl_2b.sh @@ -0,0 +1,87 @@ +#!/bin/bash +source /usr/local/Ascend/ascend-toolkit/set_env.sh + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export ASCEND_SLOG_PRINT_TO_STDOUT=0 +export ASCEND_GLOBAL_LOG_LEVEL=3 +export TASK_QUEUE_ENABLE=2 +export COMBINED_ENABLE=1 +export CPU_AFFINITY_CONF=1 +export HCCL_CONNECT_TIMEOUT=1200 +export NPU_ASD_ENABLE=0 +export ACLNN_CACHE_LIMIT=100000 +export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True + +NPUS_PER_NODE=1 +MASTER_ADDR=localhost +MASTER_PORT=6000 +NNODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) + +MM_MODEL="./examples/qwen2vl/inference_qwen2vl_2b.json" +LOAD_PATH="ckpt/mm_path/Qwen2-VL-2B-Instruct" + +TP=1 +PP=1 +CP=1 +SEQ_LEN=1024 +MBS=1 +GRAD_ACC_STEP=1 +DP=$(($WORLD_SIZE/$TP/$PP/$CP)) +GBS=$(($MBS*$GRAD_ACC_STEP*$DP)) + +DISTRIBUTED_ARGS=" + --nproc_per_node $NPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT +" + +GPT_ARGS=" + --use-mcore-models \ + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --micro-batch-size ${MBS} \ + --global-batch-size ${GBS} \ + --num-layers 1 \ + --hidden-size 1 \ + --ffn-hidden-size 1 \ + --num-attention-heads 1 \ + --tokenizer-type NullTokenizer \ + --vocab-size 1 \ + --seq-length 1 \ + --max-position-embeddings 1 \ + --make-vocab-size-divisible-by 1 \ + --init-method-std 0.01 \ + --normalization RMSNorm \ + --use-fused-rmsnorm \ + --swiglu \ + --use-fused-swiglu \ + --seed 42 \ + --bf16 \ + --load $LOAD_PATH \ + --variable-seq-lengths \ + --enable-one-logger \ + --use-flash-attn \ + --no-load-optim \ + --no-load-rng +" + +MM_ARGS=" + --mm-model $MM_MODEL +" + +OUTPUT_ARGS=" + --log-interval 1 \ + --save-interval 10000 \ + --eval-interval 10000 \ + --eval-iters 5000 \ +" + +torchrun $DISTRIBUTED_ARGS inference_vlm.py \ + $GPT_ARGS \ + $MM_ARGS \ + $OUTPUT_ARGS \ + --distributed-backend nccl diff --git a/examples/qwen2vl/inference_qwen2vl_72b.json b/examples/qwen2vl/inference_qwen2vl_72b.json new file mode 100644 index 00000000..66400587 --- /dev/null +++ b/examples/qwen2vl/inference_qwen2vl_72b.json @@ -0,0 +1,158 @@ +{ + "pipeline_class": "Qwen2VlPipeline", + "img_context_token_id": 151655, + "image_encoder": { + "vision_encoder": { + "model_id": "qwen2vit", + "num_layers": 32, + "hidden_size": 1280, + "ffn_hidden_size": 5120, + "llm_hidden_size": 8192, + "num_attention_heads": 16, + "hidden_dropout": 0.0, + "attention_dropout": 0.0, + "in_channels": 3, + "patch_size": 14, + "spatial_merge_size": 2, + "temporal_patch_size": 2, + "layernorm_epsilon": 1e-06, + "normalization": "LayerNorm", + "fp16": false, + "bf16": true, + "params_dtype": "bf16", + "activation_func": "quick_gelu", + "freeze": true, + "use_fused_rotary_pos_emb": true, + "post_layer_norm": false, + "pipeline_num_layers": [32, 0, 0, 0, 0, 0, 0, 0] + }, + "vision_projector": { + "model_id": "lnmlp", + "num_layers": 1, + "num_attention_heads": 1, + "gated_linear_unit": false, + "bias_activation_fusion": false, + "add_bias_linear": true, + "input_size": 1280, + "hidden_size": 8192, + "ffn_hidden_size": 5120, + "activation_func": "gelu", + "bf16": true, + "params_dtype": "bf16", + "freeze": true + } + }, + "text_decoder": { + "model_id": "qwen2lm", + "num_layers": 80, + "pipeline_num_layers": 
[8, 10, 10, 10, 10, 12, 12, 8], + "hidden_size": 8192, + "ffn_hidden_size": 29568, + "num_attention_heads": 64, + "max_position_embeddings": 32768, + "vocab_size": 152064, + "rope_theta": 1000000.0, + "untie_embeddings_and_output_weights": true, + "disable_bias_linear": true, + "attention_dropout": 0.0, + "init_method_std": 0.01, + "hidden_dropout": 0.0, + "position_embedding_type": "mrope", + "normalization": "RMSNorm", + "activation_func": "silu", + "use_fused_rotary_pos_emb": true, + "attention_softmax_in_fp32": true, + "params_dtype": "bf16", + "bf16": true, + "parallel_output": true, + "group_query_attention": true, + "num_query_groups": 8, + "mrope_section": [16, 24, 24], + "rope_scaling": null, + "gated_linear_unit": true, + "layernorm_epsilon": 1e-06, + "add_bias_linear":false, + "add_qkv_bias": true, + "sequence_parallel": false, + "tokenizer_type": "PretrainedFromHF", + "is_encoder_decoder": false + }, + "text_encoder": null, + "video_encoder": null, + "dtype": "bf16", + "device": "npu", + "tokenizer": { + "hub_backend": "hf", + "autotokenizer_name": "AutoTokenizer", + "from_pretrained": "ckpt/hf_path/Qwen2-VL-72B-Instruct", + "local_files_only":false + }, + "generation_config": { + "bos_token_id": 151643, + "do_sample": true, + "output_attentions": false, + "output_hidden_states": false, + "max_length": 20, + "min_length": 0, + "min_new_tokens": null, + "constraints": null, + "prompt_lookup_num_tokens": null, + "guidance_scale": null, + "bad_words_ids": null, + "begin_suppress_tokens": null, + "diversity_penalty": 0.0, + "early_stopping": false, + "encoder_no_repeat_ngram_size": 0, + "encoder_repetition_penalty": 1.0, + "epsilon_cutoff": 0.0, + "eta_cutoff": 0.0, + "exponential_decay_length_penalty": null, + "forced_bos_token_id": null, + "forced_decoder_ids": null, + "forced_eos_token_id": null, + "length_penalty": 1.0, + "low_memory": null, + "max_time": null, + "no_repeat_ngram_size": 0, + "num_assistant_tokens": 5, + "num_assistant_tokens_schedule": "heuristic", + "num_beam_groups": 1, + "num_return_groups": 1, + "num_return_sequences": 1, + "output_scores": false, + "output_logits": null, + "penalty_alpha": null, + "remove_invalid_values": false, + "repetition_penalty": 1.05, + "return_dict_in_generate": false, + "sequence_bias": null, + "suppress_tokens": null, + "typical_p": 1.0, + "force_words_ids": null, + "num_beams": 1, + "renormalize_logits": false, + "use_cache": true, + "eos_token_id": [ + 151645, + 151643 + ], + "max_new_tokens": 256, + "pad_token_id": 151643, + "temperature": 0.01, + "top_k": 1, + "top_p": 0.001, + "dola_layers": null, + "cache_implementation": null, + "cache_config": null, + "return_legacy_cache": null, + "min_p": null, + "token_healing": false, + "watermarking_config": null, + "decoder_start_token_id": null, + "max_matching_ngram_size": null, + "stop_strings": null + }, + "image_processer_path": "ckpt/hf_path/Qwen2-VL-72B-Instruct/preprocessor_config.json", + "image_path": "examples/qwen2vl/demo.jpeg", + "prompts": "Describe this image and keep it within 100 words." 
+} diff --git a/examples/qwen2vl/inference_qwen2vl_72b.sh b/examples/qwen2vl/inference_qwen2vl_72b.sh new file mode 100644 index 00000000..db19254a --- /dev/null +++ b/examples/qwen2vl/inference_qwen2vl_72b.sh @@ -0,0 +1,86 @@ +#!/bin/bash +source /usr/local/Ascend/ascend-toolkit/set_env.sh + +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export ASCEND_SLOG_PRINT_TO_STDOUT=0 +export ASCEND_GLOBAL_LOG_LEVEL=3 +export TASK_QUEUE_ENABLE=2 +export COMBINED_ENABLE=1 +export CPU_AFFINITY_CONF=1 +export HCCL_CONNECT_TIMEOUT=1200 +export NPU_ASD_ENABLE=0 +export ACLNN_CACHE_LIMIT=100000 +export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True + +NPUS_PER_NODE=8 +MASTER_ADDR=localhost +MASTER_PORT=6000 +NNODES=1 +NODE_RANK=0 +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) + +MM_MODEL="./examples/qwen2vl/inference_qwen2vl_72b.json" +LOAD_PATH="ckpt/mm_path/Qwen2-VL-72B-Instruct" + +TP=1 +PP=8 +CP=1 +MBS=1 +GRAD_ACC_STEP=1 +DP=$(($WORLD_SIZE/$TP/$PP/$CP)) +GBS=$(($MBS*$GRAD_ACC_STEP*$DP)) + +DISTRIBUTED_ARGS=" + --nproc_per_node $NPUS_PER_NODE \ + --nnodes $NNODES \ + --node_rank $NODE_RANK \ + --master_addr $MASTER_ADDR \ + --master_port $MASTER_PORT +" + +GPT_ARGS=" + --use-mcore-models \ + --tensor-model-parallel-size ${TP} \ + --pipeline-model-parallel-size ${PP} \ + --micro-batch-size ${MBS} \ + --global-batch-size ${GBS} \ + --num-layers 1 \ + --hidden-size 1 \ + --ffn-hidden-size 1 \ + --num-attention-heads 1 \ + --tokenizer-type NullTokenizer \ + --vocab-size 1 \ + --seq-length 1 \ + --max-position-embeddings 1 \ + --make-vocab-size-divisible-by 1 \ + --init-method-std 0.01 \ + --normalization RMSNorm \ + --use-fused-rmsnorm \ + --swiglu \ + --use-fused-swiglu \ + --seed 42 \ + --bf16 \ + --load $LOAD_PATH \ + --variable-seq-lengths \ + --enable-one-logger \ + --use-flash-attn \ + --no-load-optim \ + --no-load-rng +" + +MM_ARGS=" + --mm-model $MM_MODEL +" + +OUTPUT_ARGS=" + --log-interval 1 \ + --save-interval 10000 \ + --eval-interval 10000 \ + --eval-iters 5000 \ +" + +torchrun $DISTRIBUTED_ARGS inference_vlm.py \ + $GPT_ARGS \ + $MM_ARGS \ + $OUTPUT_ARGS \ + --distributed-backend nccl diff --git a/examples/qwen2vl/inference_qwen2vl_7b.json b/examples/qwen2vl/inference_qwen2vl_7b.json index a0120a93..b0126f4f 100644 --- a/examples/qwen2vl/inference_qwen2vl_7b.json +++ b/examples/qwen2vl/inference_qwen2vl_7b.json @@ -154,7 +154,6 @@ "stop_strings": null }, "image_processer_path": "ckpt/hf_path/Qwen2-VL-7B-Instruct/preprocessor_config.json", - "image_path": "", - "prompts": "Describe this image and keep it within 100 words.", - "temperature": 0 + "image_path": "examples/qwen2vl/demo.jpeg", + "prompts": "Describe this image and keep it within 100 words." 
} diff --git a/examples/qwen2vl/inference_qwen2vl_7b.sh b/examples/qwen2vl/inference_qwen2vl_7b.sh index f5b64867..f508a4ff 100644 --- a/examples/qwen2vl/inference_qwen2vl_7b.sh +++ b/examples/qwen2vl/inference_qwen2vl_7b.sh @@ -1,8 +1,6 @@ #!/bin/bash source /usr/local/Ascend/ascend-toolkit/set_env.sh -# 通过此配置选择使用的NPU卡 -# export ASCEND_RT_VISIBLE_DEVICES=1 export CUDA_DEVICE_MAX_CONNECTIONS=1 export ASCEND_SLOG_PRINT_TO_STDOUT=0 export ASCEND_GLOBAL_LOG_LEVEL=3 @@ -11,12 +9,10 @@ export COMBINED_ENABLE=1 export CPU_AFFINITY_CONF=1 export HCCL_CONNECT_TIMEOUT=1200 export NPU_ASD_ENABLE=0 -export ASCEND_LAUNCH_BLOCKING=0 -export HOST_CACHE_CAPACITY=20 export ACLNN_CACHE_LIMIT=100000 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True -NPUS_PER_NODE=4 +NPUS_PER_NODE=1 MASTER_ADDR=localhost MASTER_PORT=6000 NNODES=1 @@ -24,10 +20,10 @@ NODE_RANK=0 WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) MM_MODEL="./examples/qwen2vl/inference_qwen2vl_7b.json" -LOAD_PATH="/home/ma-user/work/MindSpeed-MM/save_dir/" +LOAD_PATH="ckpt/mm_path/Qwen2-VL-7B-Instruct" TP=1 -PP=4 +PP=1 CP=1 SEQ_LEN=1024 MBS=1 diff --git a/examples/qwen2vl/llava_instruct_2_mllm_demo_format.py b/examples/qwen2vl/llava_instruct_2_mllm_demo_format.py index 8c3b2d7b..bc30de4b 100644 --- a/examples/qwen2vl/llava_instruct_2_mllm_demo_format.py +++ b/examples/qwen2vl/llava_instruct_2_mllm_demo_format.py @@ -2,9 +2,7 @@ import json import os import stat -# llava_json_path = "./data/llava_instruct_150k_wo_img.json" -# llava_json_path = "./data/ip.json" -llava_json_path = "./data/full_data.json" +llava_json_path = "./data/llava_instruct_150k.json" mllm_format_json_path = "./data/mllm_format_llava_instruct_data.json" with open(llava_json_path, "r") as f: @@ -12,16 +10,15 @@ with open(llava_json_path, "r") as f: mllm_format_llava_instruct_data = [] for item in info_json: - # img_path = os.path.join("./data/COCO2017/train2017", item["image"]) - # img_path = os.path.join("./data/dummy", item["image"]) - img_path = os.path.join("./data", item["image"]) - if not os.path.exists(img_path): - continue - if not img_path.endswith(".jpg") and not img_path.endswith(".png"): + if item.get('image', None): new_item = { "images": [], "messages": [] } + img_path = os.path.join("./data/COCO2017/train2017", item["image"]) + print(f"img_path: {img_path}") + if not os.path.exists(img_path): + continue else: new_item = { "images": [img_path], @@ -39,7 +36,7 @@ for item in info_json: output_json = json.dumps(mllm_format_llava_instruct_data) if os.path.exists(mllm_format_json_path): - os.remove(mllm_format_json_path) + print(f"{mllm_format_json_path} already exists, please rename it or remove it") with os.fdopen(os.open(mllm_format_json_path, os.O_WRONLY | os.O_CREAT | os.O_EXCL, stat.S_IWUSR | stat.S_IRUSR), "w") as f: f.write(output_json) print(f"finish converting dataset into {mllm_format_json_path}") diff --git a/examples/qwen2vl/model_2b.json b/examples/qwen2vl/model_2b.json index e855d404..2433a4dd 100644 --- a/examples/qwen2vl/model_2b.json +++ b/examples/qwen2vl/model_2b.json @@ -24,7 +24,7 @@ "freeze": true, "use_fused_rotary_pos_emb": true, "post_layer_norm": false, - "pipeline_num_layers": [32, 0, 0, 0] + "pipeline_num_layers": [32] }, "vision_projector": { "model_id": "lnmlp", @@ -45,7 +45,7 @@ "text_decoder": { "model_id": "qwen2lm", "num_layers": 28, - "pipeline_num_layers": [1, 6, 11, 10], + "pipeline_num_layers": [28], "hidden_size": 1536, "ffn_hidden_size": 8960, "num_attention_heads": 12, diff --git a/examples/qwen2vl/qwen2vl_convert_pp_to_pp.py 
b/examples/qwen2vl/qwen2vl_convert_pp_to_pp.py new file mode 100644 index 00000000..94412980 --- /dev/null +++ b/examples/qwen2vl/qwen2vl_convert_pp_to_pp.py @@ -0,0 +1,33 @@ +from qwen2vl_convert_to_hf import load_from_mm, check_pp_config +from qwen2vl_convert_to_mm_ckpt import split_model_by_pipeline, save_by_pp, merge_pp_index + +if __name__ == "__main__": + mm_save_dir = "save_dir" # 微调后保存的权重目录 + new_save_dir = "new_pp_save_dir" # 希望重新pp切分后保存的目录 + + vit_num_layers = 32 + llm_num_layers = 28 + + old_pp_size = 4 + old_vit_pipeline_num_layers = [32, 0, 0, 0] + old_llm_pipeline_num_layers = [1, 6, 11, 10] + + new_pp_size = 2 + new_vit_pipeline_num_layers = [32, 0] + new_llm_pipeline_num_layers = [14, 14] + + check_pp_config(old_pp_size, vit_num_layers, old_vit_pipeline_num_layers, llm_num_layers, + old_llm_pipeline_num_layers) + check_pp_config(new_pp_size, vit_num_layers, new_vit_pipeline_num_layers, llm_num_layers, + new_llm_pipeline_num_layers) + state_dict = load_from_mm(mm_save_dir, old_vit_pipeline_num_layers, old_llm_pipeline_num_layers) + pp_split = merge_pp_index(new_pp_size, vit_num_layers, new_vit_pipeline_num_layers, llm_num_layers, + new_llm_pipeline_num_layers) + state_dicts, _ = split_model_by_pipeline(state_dict, pp_split) + + for rank, pipeline_state_dict in enumerate(state_dicts): + print(20 * '#', f'stage {rank}', 20 * '#') + for key, value in pipeline_state_dict.items(): + if value is not None: + print(key, value.shape) + save_by_pp(state_dicts, new_save_dir, _exists_ok=True) diff --git a/examples/qwen2vl/qwen2vl_convert_to_hf.py b/examples/qwen2vl/qwen2vl_convert_to_hf.py index 55bace3c..29a17f49 100644 --- a/examples/qwen2vl/qwen2vl_convert_to_hf.py +++ b/examples/qwen2vl/qwen2vl_convert_to_hf.py @@ -7,6 +7,39 @@ import mindspeed.megatron_adaptor # noqa import torch from safetensors.torch import save_file +MODEL_CONFIG_DICT = { + '2B': { + 'model_size': '2B', + 'vit_hidden_size': 1280, + 'vit_num_attention_heads': 16, + 'vit_num_layers': 32, + 'llm_hidden_size': 1536, + 'llm_num_query_groups': 2, + 'llm_num_attention_heads': 12, + 'llm_num_layers': 28, + }, + '7B': { + 'model_size': '7B', + 'vit_hidden_size': 1280, + 'vit_num_attention_heads': 16, + 'vit_num_layers': 32, + 'llm_hidden_size': 3584, + 'llm_num_query_groups': 4, + 'llm_num_attention_heads': 28, + 'llm_num_layers': 28, + }, + '72B': { + 'model_size': '72B', + 'vit_hidden_size': 1280, + 'vit_num_attention_heads': 16, + 'vit_num_layers': 32, + 'llm_hidden_size': 8192, + 'llm_num_query_groups': 8, + 'llm_num_attention_heads': 64, + 'llm_num_layers': 80, + } +} + def rename_pp_parameter(param_name: str, model_dir: Path, vit_pp_list: list[int], llm_pp_list: list[int]) -> str: index = int(model_dir.parent.stem.split('_')[-1]) @@ -29,14 +62,14 @@ def rename_pp_parameter(param_name: str, model_dir: Path, vit_pp_list: list[int] return param_name -def load_from_mm(_load_dir, vit_pp_list, llm_pp_list): +def load_from_mm(_load_dir: str, vit_pp_list: list[int], llm_pp_list: list[int]) -> dict: LATEST_TXT = "latest_checkpointed_iteration.txt" mm_save_dir = Path(_load_dir) save_iteration = mm_save_dir.joinpath(LATEST_TXT).read_text() - save_iter_dir = mm_save_dir.joinpath(f"iter_{int(save_iteration):07}") + save_dir = mm_save_dir.joinpath(f"iter_{int(save_iteration):07}" if save_iteration != "release" else save_iteration) state_dict = {} - print(str(save_iter_dir).center(100, "=")) - for pt_path in save_iter_dir.glob("*/*.pt"): + print(str(save_dir).center(100, "=")) + for pt_path in save_dir.glob("*/*.pt"): 
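# A toy check of the grouped-query split that convert_mm_to_hf performs below.
# For the assumed 7B config (hidden 3584, 28 heads, 4 query groups) each fused
# linear_qkv group holds 896 query rows followed by 128 key and 128 value rows,
# matching the previously hard-coded [896(q) + 128(k) + 128(v)] * 4 layout.
import torch

hidden, heads, groups = 3584, 28, 4
head_dim = hidden // heads            # 128
q_size = head_dim * heads // groups   # 896
kv_size = head_dim                    # 128

fused_qkv = torch.randn(groups * (q_size + 2 * kv_size), hidden)

q_parts, k_parts, v_parts = [], [], []
for chunk in torch.chunk(fused_qkv, groups, dim=0):
    q, k, v = torch.split(chunk, [q_size, kv_size, kv_size], dim=0)
    q_parts.append(q)
    k_parts.append(k)
    v_parts.append(v)

q_proj = torch.cat(q_parts, dim=0)  # (3584, 3584) -> self_attn.q_proj
k_proj = torch.cat(k_parts, dim=0)  # (512, 3584)  -> self_attn.k_proj
v_proj = torch.cat(v_parts, dim=0)  # (512, 3584)  -> self_attn.v_proj
print(q_proj.shape, k_proj.shape, v_proj.shape)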
print(str(pt_path).center(100, '_')) state_dict.update( {rename_pp_parameter(param, pt_path, vit_pp_list, llm_pp_list): tensor @@ -47,10 +80,23 @@ def load_from_mm(_load_dir, vit_pp_list, llm_pp_list): return state_dict -def convert_mm_to_hf(_state_dict, _vit_hidden_size, _vit_attention_heads_num): - hiddensize_per_head = _vit_hidden_size // _vit_attention_heads_num +def convert_mm_to_hf(_state_dict: dict, _model_config: dict) -> dict: + vit_hidden_size = _model_config['vit_hidden_size'] + vit_num_attention_heads = _model_config['vit_num_attention_heads'] + llm_hidden_size = _model_config['llm_hidden_size'] + llm_num_attention_heads = _model_config['llm_num_attention_heads'] + llm_num_query_groups = _model_config['llm_num_query_groups'] + + vit_head_hidden_size = vit_hidden_size // vit_num_attention_heads + llm_head_hidden_size = llm_hidden_size // llm_num_attention_heads + q_size = llm_head_hidden_size * llm_num_attention_heads // llm_num_query_groups + k_size = llm_head_hidden_size * llm_num_query_groups // llm_num_query_groups + v_size = llm_head_hidden_size * llm_num_query_groups // llm_num_query_groups + new_params = {} for key, value in _state_dict.items(): + if value is None: + continue new_key = None # image_encoder 权重转换部分 if key.startswith('image_encoder'): @@ -69,16 +115,16 @@ def convert_mm_to_hf(_state_dict, _vit_hidden_size, _vit_attention_heads_num): if 'qkv.weight' in new_key: res = value * 0 i = 0 - for j in range(_vit_attention_heads_num): - q_part = value[i * hiddensize_per_head: (i + 1) * hiddensize_per_head, :] - res[hiddensize_per_head * j: hiddensize_per_head * (j + 1), :] = q_part + for j in range(vit_num_attention_heads): + q_part = value[i * vit_head_hidden_size: (i + 1) * vit_head_hidden_size, :] + res[vit_head_hidden_size * j: vit_head_hidden_size * (j + 1), :] = q_part - k_part = value[(i + 1) * hiddensize_per_head: (i + 2) * hiddensize_per_head, :] - res[_vit_hidden_size + hiddensize_per_head * j: _vit_hidden_size + hiddensize_per_head * (j + 1), + k_part = value[(i + 1) * vit_head_hidden_size: (i + 2) * vit_head_hidden_size, :] + res[vit_hidden_size + vit_head_hidden_size * j: vit_hidden_size + vit_head_hidden_size * (j + 1), :] = k_part - v_part = value[(i + 2) * hiddensize_per_head: (i + 3) * hiddensize_per_head, :] - res[_vit_hidden_size * 2 + hiddensize_per_head * j: _vit_hidden_size * 2 + hiddensize_per_head * ( + v_part = value[(i + 2) * vit_head_hidden_size: (i + 3) * vit_head_hidden_size, :] + res[vit_hidden_size * 2 + vit_head_hidden_size * j: vit_hidden_size * 2 + vit_head_hidden_size * ( j + 1), :] = v_part i = i + 3 @@ -86,17 +132,17 @@ def convert_mm_to_hf(_state_dict, _vit_hidden_size, _vit_attention_heads_num): elif 'qkv.bias' in new_key: res = value * 0 i = 0 - for j in range(_vit_attention_heads_num): - q_part = value[i * hiddensize_per_head: (i + 1) * hiddensize_per_head] - res[hiddensize_per_head * j: hiddensize_per_head * (j + 1)] = q_part + for j in range(vit_num_attention_heads): + q_part = value[i * vit_head_hidden_size: (i + 1) * vit_head_hidden_size] + res[vit_head_hidden_size * j: vit_head_hidden_size * (j + 1)] = q_part - k_part = value[(i + 1) * hiddensize_per_head: (i + 2) * hiddensize_per_head] - res[_vit_hidden_size + hiddensize_per_head * j: _vit_hidden_size + hiddensize_per_head * ( + k_part = value[(i + 1) * vit_head_hidden_size: (i + 2) * vit_head_hidden_size] + res[vit_hidden_size + vit_head_hidden_size * j: vit_hidden_size + vit_head_hidden_size * ( j + 1)] = k_part - v_part = value[(i + 2) * hiddensize_per_head: (i + 3) * 
hiddensize_per_head] + v_part = value[(i + 2) * vit_head_hidden_size: (i + 3) * vit_head_hidden_size] res[ - _vit_hidden_size * 2 + hiddensize_per_head * j: _vit_hidden_size * 2 + hiddensize_per_head * ( + vit_hidden_size * 2 + vit_head_hidden_size * j: vit_hidden_size * 2 + vit_head_hidden_size * ( j + 1)] = v_part i = i + 3 @@ -106,55 +152,27 @@ def convert_mm_to_hf(_state_dict, _vit_hidden_size, _vit_attention_heads_num): new_params[new_key] = value else: - if 'self_attention.linear_qkv.weight' in key: - qkv_chunks = torch.chunk(value, 4, dim=0) - # qkv的结构是[896(q)+128(k)+128(v)]*4 - indices = [896, 1024] - indices = [0] + indices + [qkv_chunks[0].size(0)] - q_chunks = [] - k_chunks = [] - v_chunks = [] - for j in range(4): - splits = [qkv_chunks[j][indices[i]:indices[i + 1]] for i in range(len(indices) - 1)] - q_chunks.append(splits[0]) - k_chunks.append(splits[1]) - v_chunks.append(splits[2]) - - attention_q_weight = torch.cat(q_chunks, dim=0) - attention_k_weight = torch.cat(k_chunks, dim=0) - attention_v_weight = torch.cat(v_chunks, dim=0) - - layer = key.split('.')[3] - attention_q = f'model.layers.{layer}.self_attn.q_proj.weight' - attention_k = f'model.layers.{layer}.self_attn.k_proj.weight' - attention_v = f'model.layers.{layer}.self_attn.v_proj.weight' - - new_params[attention_q] = attention_q_weight - new_params[attention_k] = attention_k_weight - new_params[attention_v] = attention_v_weight - - elif 'self_attention.linear_qkv.bias' in key: - qkv_chunks = torch.chunk(value, 4, dim=0) - # qkv的结构是[896(q)+128(k)+128(v)]*4 - indices = [896, 1024] - indices = [0] + indices + [qkv_chunks[0].size(0)] + # self_attention.linear_qkv.weight 和 self_attention.linear_qkv.bias + if 'self_attention.linear_qkv' in key: + qkv_chunks = torch.chunk(value, llm_num_query_groups, dim=0) q_chunks = [] k_chunks = [] v_chunks = [] - for j in range(4): - splits = [qkv_chunks[j][indices[i]:indices[i + 1]] for i in range(len(indices) - 1)] - q_chunks.append(splits[0]) - k_chunks.append(splits[1]) - v_chunks.append(splits[2]) + for chunk in qkv_chunks: + q_chunk, k_chunk, v_chunk = torch.split(chunk, [q_size, k_size, v_size], dim=0) + q_chunks.append(q_chunk) + k_chunks.append(k_chunk) + v_chunks.append(v_chunk) attention_q_weight = torch.cat(q_chunks, dim=0) attention_k_weight = torch.cat(k_chunks, dim=0) attention_v_weight = torch.cat(v_chunks, dim=0) layer = key.split('.')[3] - attention_q = f'model.layers.{layer}.self_attn.q_proj.bias' - attention_k = f'model.layers.{layer}.self_attn.k_proj.bias' - attention_v = f'model.layers.{layer}.self_attn.v_proj.bias' + name = key.split('.')[-1] # weight或bias + attention_q = f'model.layers.{layer}.self_attn.q_proj.{name}' + attention_k = f'model.layers.{layer}.self_attn.k_proj.{name}' + attention_v = f'model.layers.{layer}.self_attn.v_proj.{name}' new_params[attention_q] = attention_q_weight new_params[attention_k] = attention_k_weight @@ -212,7 +230,7 @@ def copy_except_safetensors(src_dir: str, dst_dir: str) -> None: shutil.copy2(src_file, dst_file) -def check_pp_config(_pp_size, _vit_num_layers, _vit_pipeline_num_layers, _llm_num_layers, _llm_pipeline_num_layers): +def check_pp_config(_pp_size: int, _vit_num_layers: int, _vit_pipeline_num_layers: list[int], _llm_num_layers: int, _llm_pipeline_num_layers: list[int]) -> None: if len(_vit_pipeline_num_layers) != _pp_size: raise AssertionError(f'length of vit_pipeline_num_layers must be equal to pp_size, ' f'but got {len(_vit_pipeline_num_layers)} and {_pp_size}.') @@ -227,7 +245,7 @@ def check_pp_config(_pp_size, 
_vit_num_layers, _vit_pipeline_num_layers, _llm_nu f'but got {sum(_llm_pipeline_num_layers)} and {_llm_num_layers}.') -def split_by_index_json(_state_dict, _model_path): +def split_by_index_json(_state_dict: dict, _model_path: str) -> list[dict]: index_json_path = os.path.join(_model_path, 'model.safetensors.index.json') return_dicts = [] with open(index_json_path, 'r', encoding='utf-8') as file: @@ -240,7 +258,7 @@ def split_by_index_json(_state_dict, _model_path): return return_dicts -def save_by_index_json(_state_dicts, _save_dir): +def save_by_index_json(_state_dicts: list[dict], _save_dir: str) -> None: metadata = { 'format': 'pt' } @@ -250,22 +268,21 @@ def save_by_index_json(_state_dicts, _save_dir): if __name__ == "__main__": - mm_save_dir = "save_dir" # 微调后保存的权重目录 - hg_save_dir = "Qwen2-VL-7B-Save" # 希望保存的hf目录 - model_path = "Qwen2-VL-7B-Instruct" # hf原仓目录 - + mm_save_dir = "save_dir" # 微调后保存的权重目录 + hf_save_dir = "Qwen2-VL-7B-Save" # 希望保存的hf目录 + model_path = "ckpt/hf_path/Qwen2-VL-7B-Instruct" # hf原仓目录 + model_size = "7B" # 根据需要转换的模型,指定配置( 2B 7B 72B ) + #model parameters + model_config = MODEL_CONFIG_DICT[model_size] + + #PP parameters: 7B pp_size = 4 - vit_num_layers = 32 vit_pipeline_num_layers = [32, 0, 0, 0] - llm_num_layers = 28 llm_pipeline_num_layers = [1, 6, 11, 10] - vit_hidden_size = 1280 - vit_attention_heads_num = 16 - - check_pp_config(pp_size, vit_num_layers, vit_pipeline_num_layers, llm_num_layers, llm_pipeline_num_layers) + check_pp_config(pp_size, model_config["vit_num_layers"], vit_pipeline_num_layers, model_config["llm_num_layers"], llm_pipeline_num_layers) state_dict = load_from_mm(mm_save_dir, vit_pipeline_num_layers, llm_pipeline_num_layers) - state_dict = convert_mm_to_hf(state_dict, vit_hidden_size, vit_attention_heads_num) + state_dict = convert_mm_to_hf(state_dict, model_config) state_dicts = split_by_index_json(state_dict, model_path) - copy_except_safetensors(model_path, hg_save_dir) - save_by_index_json(state_dicts, hg_save_dir) + copy_except_safetensors(model_path, hf_save_dir) + save_by_index_json(state_dicts, hf_save_dir) diff --git a/examples/qwen2vl/qwen2vl_convert_to_mm_ckpt.py b/examples/qwen2vl/qwen2vl_convert_to_mm_ckpt.py index b85e6c5f..e5f6702e 100644 --- a/examples/qwen2vl/qwen2vl_convert_to_mm_ckpt.py +++ b/examples/qwen2vl/qwen2vl_convert_to_mm_ckpt.py @@ -6,17 +6,53 @@ from copy import deepcopy import torch from safetensors.torch import load_file +MODEL_CONFIG_DICT = { + '2B': { + 'model_size': '2B', + 'vit_hidden_size': 1280, + 'vit_num_attention_heads': 16, + 'vit_num_layers': 32, + 'llm_hidden_size': 1536, + 'llm_num_query_groups': 2, + 'llm_num_attention_heads': 12, + 'llm_num_layers': 28, + }, + '7B': { + 'model_size': '7B', + 'vit_hidden_size': 1280, + 'vit_num_attention_heads': 16, + 'vit_num_layers': 32, + 'llm_hidden_size': 3584, + 'llm_num_query_groups': 4, + 'llm_num_attention_heads': 28, + 'llm_num_layers': 28, + }, + '72B': { + 'model_size': '72B', + 'vit_hidden_size': 1280, + 'vit_num_attention_heads': 16, + 'vit_num_layers': 32, + 'llm_hidden_size': 8192, + 'llm_num_query_groups': 8, + 'llm_num_attention_heads': 64, + 'llm_num_layers': 80, + } +} + def load_from_hf(_load_dir): # Load Huggingface model 。 load_dir = Path(_load_dir) + safetensors_files = list(load_dir.glob("*.safetensors")) + if not safetensors_files: + raise FileNotFoundError(f"No *.safetensors files found in {load_dir}") state_dict = {} - for safe_path in load_dir.glob("*.safetensors"): + for safe_path in safetensors_files: 
state_dict.update(load_file(str(safe_path), device='cpu')) return state_dict -def convert_hg_to_mm(_state_dict, _num_layers, _vit_hidden_size, _vit_attention_heads_num): +def convert_hf_to_mm(_state_dict, _num_layers, _vit_hidden_size, _vit_attention_heads_num, _llm_num_query_groups): hiddensize_per_head = _vit_hidden_size // _vit_attention_heads_num new_params = {} for key, value in _state_dict.items(): @@ -132,11 +168,11 @@ def convert_hg_to_mm(_state_dict, _num_layers, _vit_hidden_size, _vit_attention_ if attention_v in new_params.keys(): attention_v_weight = new_params[attention_v] - q_chunks = torch.chunk(attention_q_weight, 4, dim=0) - k_chunks = torch.chunk(attention_k_weight, 4, dim=0) - v_chunks = torch.chunk(attention_v_weight, 4, dim=0) + q_chunks = torch.chunk(attention_q_weight, _llm_num_query_groups, dim=0) + k_chunks = torch.chunk(attention_k_weight, _llm_num_query_groups, dim=0) + v_chunks = torch.chunk(attention_v_weight, _llm_num_query_groups, dim=0) all_chunks = [] - for j in range(4): + for j in range(_llm_num_query_groups): all_chunks.append(q_chunks[j]) all_chunks.append(k_chunks[j]) all_chunks.append(v_chunks[j]) @@ -168,11 +204,11 @@ def convert_hg_to_mm(_state_dict, _num_layers, _vit_hidden_size, _vit_attention_ else: continue - q_chunks1 = torch.chunk(attention_q_bias, 4, dim=0) - k_chunks1 = torch.chunk(attention_k_bias, 4, dim=0) - v_chunks1 = torch.chunk(attention_v_bias, 4, dim=0) + q_chunks1 = torch.chunk(attention_q_bias, _llm_num_query_groups, dim=0) + k_chunks1 = torch.chunk(attention_k_bias, _llm_num_query_groups, dim=0) + v_chunks1 = torch.chunk(attention_v_bias, _llm_num_query_groups, dim=0) all_chunks1 = [] - for j in range(4): + for j in range(_llm_num_query_groups): all_chunks1.append(q_chunks1[j]) all_chunks1.append(k_chunks1[j]) all_chunks1.append(v_chunks1[j]) @@ -187,6 +223,7 @@ def convert_hg_to_mm(_state_dict, _num_layers, _vit_hidden_size, _vit_attention_ return new_params + def merge_pp_index(pp_size, vit_num_layers, vit_pipeline_num_layers, llm_num_layers, llm_pipeline_num_layers): if len(vit_pipeline_num_layers) != pp_size: raise AssertionError(f'length of vit_pipeline_num_layers must be equal to pp_size, ' @@ -205,10 +242,11 @@ def merge_pp_index(pp_size, vit_num_layers, vit_pipeline_num_layers, llm_num_lay split_method.append((vit_num, llm_num)) return split_method + def split_model_by_pipeline(state_dict, pp_split): if pp_split is None or len(pp_split) <= 1: return [state_dict], {} - + pp_size = len(pp_split) vit_range = [0, 0] llm_range = [pp_size - 1, pp_size - 1] @@ -219,7 +257,7 @@ def split_model_by_pipeline(state_dict, pp_split): llm_range[0] = pp_rank print(f'vit range: {vit_range[0]}~{vit_range[1]}') print(f'llm range: {llm_range[0]}~{llm_range[1]}') - + vit_start_idx = 0 llm_start_idx = 0 return_dicts = [] @@ -267,7 +305,8 @@ def split_model_by_pipeline(state_dict, pp_split): llm_start_idx = llm_end_idx return_dicts.append(new_dict) return return_dicts, copy_dict - + + def save_by_pp(_state_dicts, _save_dir, _lastest_checkpointed_iteration='release', _exists_ok=False): if os.path.exists(_save_dir): if not _exists_ok: @@ -305,24 +344,20 @@ def save_by_pp(_state_dicts, _save_dir, _lastest_checkpointed_iteration='release if __name__ == "__main__": - # hg_ckpt_dir = "Qwen2-VL-7B-Instruct" - # mm_save_dir = 'ckpt/Qwen2-VL-7B-Instruct' - hg_ckpt_dir = 'ckpt/hf_path/Qwen2-VL-7B-Instruct' # huggingface权重目录 - mm_save_dir = 'ckpt/mm_path/Qwen2-VL-7B-Instruct' # 转换后保存目录 - - vit_hidden_size = 1280 - vit_attention_heads_num = 16 + 
hf_ckpt_dir = "ckpt/hf_path/Qwen2-VL-7B-Instruct" # hugging face原始的权重保存路径 + mm_save_dir = 'ckpt/mm_path/Qwen2-VL-7B-Instruct' # 转换后的权重保存路径 + model_size = "7B" # 根据需要转换的模型,指定配置( 2B 7B 72B ) + #model parameters + model_config = MODEL_CONFIG_DICT[model_size] - #for 7B + #PP parameters: 7B pp_size = 4 - vit_num_layers = 32 vit_pipeline_num_layers = [32, 0, 0, 0] - llm_num_layers = 28 llm_pipeline_num_layers = [1, 6, 11, 10] - state_dict = load_from_hf(hg_ckpt_dir) - state_dict = convert_hg_to_mm(state_dict, llm_num_layers, vit_hidden_size, vit_attention_heads_num) - pp_split = merge_pp_index(pp_size, vit_num_layers, vit_pipeline_num_layers, llm_num_layers, llm_pipeline_num_layers) + state_dict = load_from_hf(hf_ckpt_dir) + state_dict = convert_hf_to_mm(state_dict, model_config["llm_num_layers"], model_config["vit_hidden_size"], model_config["vit_num_attention_heads"], model_config["llm_num_query_groups"]) + pp_split = merge_pp_index(pp_size, model_config["vit_num_layers"], vit_pipeline_num_layers, model_config["llm_num_layers"], llm_pipeline_num_layers) state_dicts, remains = split_model_by_pipeline(state_dict, pp_split) if len(remains) > 0: print(remains) diff --git a/inference_vlm.py b/inference_vlm.py index 4024f046..d168bfd1 100644 --- a/inference_vlm.py +++ b/inference_vlm.py @@ -1,113 +1,26 @@ -import os import torch -from time import time, sleep -from concurrent.futures import ThreadPoolExecutor import mindspeed.megatron_adaptor from megatron.training import get_args from mindspeed_mm.tasks.inference.pipeline import vlm_pipeline_dict from mindspeed_mm.configs.config import mm_extra_args_provider -from redis_utils import RedisManager - - -def inference_and_write(batch_requests, pipeline, redis: RedisManager, model_name: str): - # Extract input data - sys_prompts = [req["sys_prompt"] for req in batch_requests] - queries = [req["query"] for req in batch_requests] - main_ids = [req["message_id"] for req in batch_requests] - - full_queries = sys_prompts[0] + queries[0] - - start_time = time() - outputs = pipeline(prompt=full_queries, return_ids=True) - print(f"Inference time: {time() - start_time}") - - if not isinstance(outputs, list): - outputs = [outputs] - print(outputs) - - def write_to_redis(message_id, output): - while True: - try: - redis.write_data_to_result(model_name, message_id, {"output": output}) - break - except Exception as e: - continue - - # Multithreaded writing to Redis - with ThreadPoolExecutor() as executor: - futures = [ - executor.submit(write_to_redis, message_id, output) - for message_id, output in zip(main_ids, outputs) - ] - - # Wait until all futures are completed - for future in futures: - future.result() - def main(): from megatron.training.initialize import initialize_megatron from mindspeed_mm.configs.config import merge_mm_args + # just inference torch.set_grad_enabled(False) initialize_megatron( - extra_args_provider=mm_extra_args_provider, - args_defaults={"tokenizer_type": "GPT2BPETokenizer"}, + extra_args_provider=mm_extra_args_provider, args_defaults={'tokenizer_type': 'GPT2BPETokenizer'} ) args = get_args() merge_mm_args(args) inference_config = args.mm.model - - pipeline = vlm_pipeline_dict[inference_config.pipeline_class](inference_config) - - # Redis - assert os.environ[ - "MODEL_NAME" - ], "Environment variable MODEL_NAME was not set. Please set it manually." - model_name = os.environ["MODEL_NAME"] - assert os.environ[ - "REDIS_URL" - ], "Environment variable REDIS_URL was not set. Please set it manually." 
- redis_url = os.environ["REDIS_URL"] - assert os.environ[ - "REDIS_PORT" - ], "Environment variable REDIS_PORT was not set. Please set it manually." - redis_port = os.environ["REDIS_PORT"] - assert os.environ[ - "REDIS_DB" - ], "Environment variable REDIS_DB was not set. Please set it manually." - redis_db = os.environ["REDIS_DB"] - redis = RedisManager( - host=redis_url, port=redis_port, db=redis_db, model_name=model_name - ) - - batch_size = 1 # Define your batch size - - while True: - try: - batch_requests = redis.fetch_batch_of_requests(batch_size) - except Exception as e: - continue - - if batch_requests: - # Padding to batch size - while len(batch_requests) < batch_size: - batch_requests.append( - { - "query": "", - "sys_prompt": "", - "message_id": "dummy", - } - ) - - inference_and_write(batch_requests, pipeline, redis, model_name) - else: - sleep(0.01) + vlm_pipeline_dict[inference_config.pipeline_class](inference_config)() -if __name__ == "__main__": - with torch.inference_mode(): - main() +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/mindspeed_mm/models/ae/contextparallelcausalvae.py b/mindspeed_mm/models/ae/contextparallelcausalvae.py index f273eaf5..bad49ca0 100644 --- a/mindspeed_mm/models/ae/contextparallelcausalvae.py +++ b/mindspeed_mm/models/ae/contextparallelcausalvae.py @@ -2,6 +2,7 @@ from typing import Tuple import os import torch +import safetensors from torch import nn from einops import rearrange import numpy as np @@ -214,6 +215,8 @@ class ContextParallelCasualVAE(MultiModalModule): return torch.tensor_split(x, split_size, dim=0) def encode(self, x, enable_cp=True): + if self.cp_size <= 1: + enable_cp = False if not enable_cp: return self._encode(x, enable_cp=False) @@ -225,7 +228,7 @@ class ContextParallelCasualVAE(MultiModalModule): data_list = data_list[::self.dp_group_nums] latents = [] for data in data_list: - latents.append(self._encode(data)) + latents.append(self._encode(data, enable_cp=enable_cp)) return latents[get_context_parallel_group_rank() % self.dp_group_nums] elif self.dp_group_nums % self.cp_size == 0 and self.cp_size < self.dp_group_nums: @@ -234,7 +237,7 @@ class ContextParallelCasualVAE(MultiModalModule): data_list = self._bs_split_and_pad(x, self.dp_group_nums // self.cp_size) data = data_list[get_context_parallel_rank() % (self.dp_group_nums // self.cp_size)] - _latent = self._encode(data) + _latent = self._encode(data, enable_cp=enable_cp) if mpu.get_tensor_model_parallel_world_size() > 1: latents_tp = [torch.empty_like(_latent) for _ in range(mpu.get_tensor_model_parallel_world_size())] @@ -254,7 +257,7 @@ class ContextParallelCasualVAE(MultiModalModule): return latents[:bs] elif self.cp_size == self.dp_group_nums: - return self._encode(x) + return self._encode(x, enable_cp=enable_cp) else: raise NotImplementedError(f"Not supported megatron data parallel group nums {self.dp_group_nums} and VAE cp_size {self.cp_size}!") @@ -280,8 +283,10 @@ class ContextParallelCasualVAE(MultiModalModule): return res - def decode(self, z, **kwargs): - if self.cp_size > 0: + def decode(self, z, enable_cp: bool = True, **kwargs): + if self.cp_size <= 1: + enable_cp = False + if self.cp_size > 0 and enable_cp: global_src_rank = get_context_parallel_group_rank() * self.cp_size torch.distributed.broadcast(z, src=global_src_rank, group=get_context_parallel_group()) @@ -291,13 +296,13 @@ class ContextParallelCasualVAE(MultiModalModule): if (z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size or 
z.shape[-3] > self.tile_latent_min_size_t): - dec = self.tiled_decode(z) + dec = self.tiled_decode(z, enable_cp=enable_cp) else: if self.use_quant_layer: z = self.post_quant_conv(z) - dec = self.decoder(z) + dec = self.decoder(z, enable_cp=enable_cp) - if self.cp_size > 0: + if self.cp_size > 0 and enable_cp: dec = _conv_gather(dec, dim=2, kernel_size=1) return dec @@ -349,7 +354,7 @@ class ContextParallelCasualVAE(MultiModalModule): posterior = DiagonalGaussianDistribution(moments) return posterior - def tiled_decode(self, x): + def tiled_decode(self, x, enable_cp=True): t = x.shape[2] t_chunk_idx = [i for i in range(0, t, self.tile_latent_min_size_t - 1)] if len(t_chunk_idx) == 1 and t_chunk_idx[0] == 0: @@ -365,9 +370,9 @@ class ContextParallelCasualVAE(MultiModalModule): for idx, (start, end) in enumerate(t_chunk_start_end): chunk_x = x[:, :, start: end] if idx != 0: - dec = self.tiled_decode2d(chunk_x)[:, :, 1:] + dec = self.tiled_decode2d(chunk_x, enable_cp=enable_cp)[:, :, 1:] else: - dec = self.tiled_decode2d(chunk_x) + dec = self.tiled_decode2d(chunk_x, enable_cp=enable_cp) dec_.append(dec) dec_ = torch.cat(dec_, dim=2) return dec_ @@ -410,7 +415,7 @@ class ContextParallelCasualVAE(MultiModalModule): return moments return posterior - def tiled_decode2d(self, z): + def tiled_decode2d(self, z, enable_cp=True): overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor)) blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor) row_limit = self.tile_sample_min_size - blend_extent @@ -427,7 +432,7 @@ class ContextParallelCasualVAE(MultiModalModule): ] if self.use_quant_layer: tile = self.post_quant_conv(tile) - decoded = self.decoder(tile) + decoded = self.decoder(tile, enable_cp=enable_cp) row.append(decoded) rows.append(row) result_rows = [] @@ -605,16 +610,18 @@ class Encoder(nn.Module): ) def forward(self, x, enable_cp=True): - h = self.conv_in(x, enable_cp=enable_cp) + hs = [self.conv_in(x, enable_cp=enable_cp)] for i_level in range(self.num_resolutions): for i_block in range(self.num_res_blocks): - h = self.down[i_level].block[i_block](h, enable_cp=enable_cp) + h = self.down[i_level].block[i_block](hs[-1], enable_cp=enable_cp) if len(self.down[i_level].attn) > 0: h = self.down[i_level].attn[i_block](h) + hs.append(h) if i_level != self.num_resolutions - 1: - h = self.down[i_level].downsample(h) + hs.append(self.down[i_level].downsample(hs[-1])) + h = hs[-1] h = self.mid.block_1(h, enable_cp=enable_cp) if self.enbale_attn1: h = self.mid.attn_1(h, enable_cp=enable_cp) @@ -753,26 +760,26 @@ class Decoder(nn.Module): block_in, 3, kernel_size=3, padding=conv_padding ) - def forward(self, z, **kwargs): + def forward(self, z, enable_cp=True, **kwargs): zq = z h = self.conv_in(z) - h = self.mid.block_1(h, zq=zq) + h = self.mid.block_1(h, zq=zq, enable_cp=enable_cp) if self.enable_attention: h = self.mid.attn_1(h) - h = self.mid.block_2(h, zq=zq) + h = self.mid.block_2(h, zq=zq, enable_cp=enable_cp) for i_level in reversed(range(self.num_resolutions)): for i_block in range(self.num_res_blocks + 1): - h = self.up[i_level].block[i_block](h, zq=zq) + h = self.up[i_level].block[i_block](h, zq=zq, enable_cp=enable_cp) if len(self.up[i_level].attn) > 0: - h = self.up[i_level].attn[i_block](h, zq=zq) + h = self.up[i_level].attn[i_block](h, zq=zq, enable_cp=enable_cp) if hasattr(self.up[i_level], "upsample"): - h = self.up[i_level].upsample(h) + h = self.up[i_level].upsample(h, enable_cp=enable_cp) if hasattr(self.up[i_level], "time_upsample"): h = 
self.up[i_level].time_upsample(h) - h = self.norm_out(h, zq=zq) + h = self.norm_out(h, zq=zq, enable_cp=enable_cp) if self.enable_nonlinearity: h = self.nonlinearity(h) h = self.conv_out(h) diff --git a/mindspeed_mm/models/common/attention.py b/mindspeed_mm/models/common/attention.py index a0dfebb1..8bb7da0b 100644 --- a/mindspeed_mm/models/common/attention.py +++ b/mindspeed_mm/models/common/attention.py @@ -398,6 +398,10 @@ class SelfAttentionBNSD(nn.Module): if self.qk_ln: self.q_norm = nn.LayerNorm(self.head_dim, eps=1e-6) self.k_norm = nn.LayerNorm(self.head_dim, eps=1e-6) + for param in self.q_norm.parameters(): + setattr(param, "sequence_parallel", True) + for param in self.k_norm.parameters(): + setattr(param, "sequence_parallel", True) key_dim = key_dim if key_dim is not None else query_dim @@ -438,6 +442,7 @@ class SelfAttentionBNSD(nn.Module): frames: Optional[int] = None, height: Optional[int] = None, width: Optional[int] = None, + **kwargs ) -> torch.Tensor: """ Args: @@ -467,8 +472,8 @@ class SelfAttentionBNSD(nn.Module): k = self.k_norm(k) if self.use_rope and self.rope is not None: - q = self.rope(q) - k = self.rope(k) + q = self.rope(q, **kwargs) + k = self.rope(k, **kwargs) out = torch_npu.npu_fusion_attention( q, @@ -529,6 +534,10 @@ class ParallelSelfAttentionSBH(nn.Module): if self.qk_ln: self.q_norm = nn.LayerNorm(self.head_dim, eps=1e-6) self.k_norm = nn.LayerNorm(self.head_dim, eps=1e-6) + for param in self.q_norm.parameters(): + setattr(param, "sequence_parallel", True) + for param in self.k_norm.parameters(): + setattr(param, "sequence_parallel", True) key_dim = key_dim if key_dim is not None else query_dim @@ -577,9 +586,10 @@ class ParallelSelfAttentionSBH(nn.Module): mask: The attention mask to use. **kwargs: Additional keyword arguments to pass along """ - sequence_length, batch_size, _ = query.shape q, k, v = self.proj_qkv(query)[0].chunk(3, dim=2) + sequence_length, batch_size, _ = q.shape + q = q.view(-1, self.num_attention_heads_per_partition, self.head_dim) k = k.view(-1, self.num_attention_heads_per_partition, self.head_dim) v = v.view(-1, self.num_attention_heads_per_partition, self.head_dim) @@ -673,7 +683,8 @@ class ParallelMultiHeadAttentionSBH(nn.Module): mask: Optional[torch.Tensor] = None, frames: Optional[int] = None, height: Optional[int] = None, - width: Optional[int] = None + width: Optional[int] = None, + **kwargs ) -> torch.Tensor: """ Args: diff --git a/mindspeed_mm/models/common/embeddings/patch_embeddings.py b/mindspeed_mm/models/common/embeddings/patch_embeddings.py index 2c734897..c92eeda2 100644 --- a/mindspeed_mm/models/common/embeddings/patch_embeddings.py +++ b/mindspeed_mm/models/common/embeddings/patch_embeddings.py @@ -5,6 +5,9 @@ # LICENSE file in the root directory of this source tree. 
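
(Illustrative sketch, not part of the patch.) The attention changes above tag the q_norm/k_norm parameters with a sequence_parallel attribute; Megatron uses that marker to all-reduce the gradients of such replicated norm weights across tensor-parallel ranks when sequence parallelism is enabled. The flag itself is just a Python attribute, as this standalone fragment shows; the collection step at the end is a simplified stand-in for what the framework does, not its actual API.

# Sketch only: "sequence_parallel" is a plain marker attribute on parameters.
# Megatron-style frameworks gather the parameters carrying it and all-reduce
# their gradients across tensor-parallel ranks, because norm weights are
# replicated while their activations are sequence-sharded.
import torch
from torch import nn

norm = nn.LayerNorm(128, eps=1e-6)
for param in norm.parameters():
    setattr(param, "sequence_parallel", True)

marked = [p for p in norm.parameters() if getattr(p, "sequence_parallel", False)]
print(f"{len(marked)} parameters flagged for sequence-parallel grad all-reduce")
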
+from functools import reduce +from operator import mul + import torch import torch.nn.functional as F from einops import rearrange @@ -14,7 +17,6 @@ from torch import nn from .pos_embeddings import ( get_1d_sincos_pos_embed, get_2d_sincos_pos_embed, - get_3d_sincos_pos_embed, ) @@ -598,7 +600,7 @@ class VideoPatch2D(nn.Module): super().__init__() self.proj = nn.Conv2d(in_channels, hidden_size, kernel_size=patch_size, stride=patch_size, bias=bias) - def forward(self, latent, encoder_outputs): + def forward(self, latent, encoder_outputs, **kwargs): latent = latent.transpose(1, 2) b, t = latent.shape[:2] emb = latent.view(-1, *latent.shape[2:]) @@ -608,4 +610,37 @@ class VideoPatch2D(nn.Module): emb = rearrange(emb, "b t n d -> b (t n) d") emb = emb.contiguous() - return emb, None # (b,n_t+t*n_i,d) \ No newline at end of file + return emb, None # (b,n_t+t*n_i,d) + + +class VideoPatch3D(nn.Module): + """ + 3D Image to Patch Embedding concat witch text embedding + """ + def __init__( + self, + in_channels, + hidden_size, + patch_size, + ): + super().__init__() + self.patch_size = patch_size + self.proj = nn.Linear(in_channels * reduce(mul, patch_size), hidden_size) + + def forward(self, latent, encoder_outputs, **kwargs): + latent = latent.transpose(1, 2) + emb = rearrange(latent, "b t c h w -> b (t h w) c") + emb = rearrange( + emb, + "b (t o h p w q) c -> b (t h w) (c o p q)", + t=kwargs["rope_T"], + h=kwargs["rope_H"], + w=kwargs["rope_W"], + o=self.patch_size[0], + p=self.patch_size[1], + q=self.patch_size[2], + ) + emb = self.proj(emb) + emb = emb.contiguous() + + return emb, None \ No newline at end of file diff --git a/mindspeed_mm/models/common/embeddings/pos_embeddings.py b/mindspeed_mm/models/common/embeddings/pos_embeddings.py index ca8dc26d..420d2e6c 100644 --- a/mindspeed_mm/models/common/embeddings/pos_embeddings.py +++ b/mindspeed_mm/models/common/embeddings/pos_embeddings.py @@ -531,13 +531,10 @@ class Rotary3DPositionEmbedding(nn.Module): freqs_w = repeat(freqs_w, "... n -> ... 
(n r)", r=2) freqs = broad_cat((freqs_t[:, None, None, :], freqs_h[None, :, None, :], freqs_w[None, None, :, :]), dim=-1) - freqs = rearrange(freqs, "t h w d -> (t h w) d") freqs = freqs.contiguous() - freqs_sin = freqs.sin() - freqs_cos = freqs.cos() - self.register_buffer("freqs_sin", freqs_sin) - self.register_buffer("freqs_cos", freqs_cos) + self.freqs_sin = freqs.sin().npu() + self.freqs_cos = freqs.cos().npu() self.text_length = text_length if learnable_pos_embed: @@ -546,11 +543,16 @@ class Rotary3DPositionEmbedding(nn.Module): else: self.pos_embedding = None - def rotary(self, t): + def rotary(self, t, **kwargs): # input shape: bnsd - seq_len = t.shape[2] - freqs_cos = self.freqs_cos[:seq_len].unsqueeze(0).unsqueeze(0) - freqs_sin = self.freqs_sin[:seq_len].unsqueeze(0).unsqueeze(0) + def reshape_freq(freqs): + freqs = freqs[: kwargs["rope_T"], : kwargs["rope_H"], : kwargs["rope_W"]].contiguous() + freqs = rearrange(freqs, "t h w d -> (t h w) d") + freqs = freqs.unsqueeze(0).unsqueeze(0) + return freqs + + freqs_cos = reshape_freq(self.freqs_cos).to(t.dtype) + freqs_sin = reshape_freq(self.freqs_sin).to(t.dtype) return npu_rotary_position_embedding(t, freqs_cos, freqs_sin, mode=1) @@ -560,7 +562,7 @@ class Rotary3DPositionEmbedding(nn.Module): else: return None - def forward(self, x): + def forward(self, x, **kwargs): # input shape: bnsd - x[:, :, self.text_length:] = self.rotary(x[:, :, self.text_length:]) + x[:, :, self.text_length:] = self.rotary(x[:, :, self.text_length:], **kwargs) return x diff --git a/mindspeed_mm/models/common/normalize.py b/mindspeed_mm/models/common/normalize.py index 3590b73a..15356594 100644 --- a/mindspeed_mm/models/common/normalize.py +++ b/mindspeed_mm/models/common/normalize.py @@ -4,6 +4,7 @@ from einops import rearrange from mindspeed_mm.models.common.communications import _conv_split, _conv_gather from mindspeed_mm.models.common.conv import ContextParallelCausalConv3d +from mindspeed_mm.utils.utils import get_context_parallel_rank class LayerNorm(nn.Module): @@ -142,15 +143,28 @@ class SpatialNorm3D(nn.Module): ) def forward(self, f, zq, clear_fake_cp_cache=True, enable_cp=True): - if f.shape[2] > 1 and f.shape[2] % 2 == 1: + if f.shape[2] > 1 and get_context_parallel_rank() == 0 and enable_cp: f_first, f_rest = f[:, :, :1], f[:, :, 1:] f_first_size, f_rest_size = f_first.shape[-3:], f_rest.shape[-3:] zq_first, zq_rest = zq[:, :, :1], zq[:, :, 1:] zq_first = torch.nn.functional.interpolate(zq_first, size=f_first_size, mode="nearest") - zq_rest = torch.nn.functional.interpolate(zq_rest, size=f_rest_size, mode="nearest") + + zq_rest_splits = torch.split(zq_rest, 32, dim=1) + interpolated_splits = [ + torch.nn.functional.interpolate(split, size=f_rest_size, mode="nearest") for split in zq_rest_splits + ] + + zq_rest = torch.cat(interpolated_splits, dim=1) + zq = torch.cat([zq_first, zq_rest], dim=2) else: - zq = torch.nn.functional.interpolate(zq, size=f.shape[-3:], mode="nearest") + f_size = f.shape[-3:] + + zq_splits = torch.split(zq, 32, dim=1) + interpolated_splits = [ + torch.nn.functional.interpolate(split, size=f_size, mode="nearest") for split in zq_splits + ] + zq = torch.cat(interpolated_splits, dim=1) if self.add_conv: zq = self.conv(zq, clear_cache=clear_fake_cp_cache, enable_cp=enable_cp) diff --git a/mindspeed_mm/models/common/updownsample.py b/mindspeed_mm/models/common/updownsample.py index 939839d7..92b9c148 100644 --- a/mindspeed_mm/models/common/updownsample.py +++ b/mindspeed_mm/models/common/updownsample.py @@ -9,6 +9,7 @@ 
import torch.nn.functional as F from mindspeed_mm.utils.utils import cast_tuple, video_to_image from mindspeed_mm.models.common.conv import CausalConv3d, WfCausalConv3d +from mindspeed_mm.utils.utils import get_context_parallel_rank class Upsample(nn.Module): @@ -79,21 +80,29 @@ class DownSample3D(nn.Module): self.conv = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, padding=0) self.compress_time = compress_time - def forward(self, x): + def forward(self, x, enable_cp=True): if self.compress_time and x.shape[2] > 1: h, w = x.shape[-2:] x = rearrange(x, "b c t h w -> (b h w) c t") - if x.shape[-1] % 2 == 1: + if get_context_parallel_rank() == 0 and enable_cp: # split first frame x_first, x_rest = x[..., 0], x[..., 1:] if x_rest.shape[-1] > 0: - x_rest = torch.nn.functional.avg_pool1d(x_rest, kernel_size=2, stride=2) + splits = torch.split(x_rest, 32, dim=1) + interpolated_splits = [ + torch.nn.functional.avg_pool1d(split, kernel_size=2, stride=2) for split in splits + ] + x_rest = torch.cat(interpolated_splits, dim=1) x = torch.cat([x_first[..., None], x_rest], dim=-1) x = rearrange(x, "(b h w) c t -> b c t h w", h=h, w=w) else: - x = torch.nn.functional.avg_pool1d(x, kernel_size=2, stride=2) + splits = torch.split(x, 32, dim=1) + interpolated_splits = [ + torch.nn.functional.avg_pool1d(split, kernel_size=2, stride=2) for split in splits + ] + x = torch.cat(interpolated_splits, dim=1) x = rearrange(x, "(b h w) c t -> b c t h w", h=h, w=w) if self.with_conv: @@ -119,24 +128,39 @@ class Upsample3D(nn.Module): self.conv = torch.nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1) self.compress_time = compress_time - def forward(self, x): - if self.compress_time: - if x.shape[2] > 1: + def forward(self, x, enable_cp=True): + if self.compress_time and x.shape[2] > 1: + if get_context_parallel_rank() == 0 and enable_cp: # split first frame x_first, x_rest = x[:, :, 0], x[:, :, 1:] x_first = torch.nn.functional.interpolate(x_first, scale_factor=2.0, mode="nearest") x_rest = torch.nn.functional.interpolate(x_rest, scale_factor=2.0, mode="nearest") + + splits = torch.split(x_rest, 32, dim=1) + interpolated_splits = [ + torch.nn.functional.interpolate(split, scale_factor=2.0, mode="nearest") for split in splits + ] + x_rest = torch.cat(interpolated_splits, dim=1) + x = torch.cat([x_first[:, :, None, :, :], x_rest], dim=2) else: - x = x.squeeze(2) - x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest") - x = x[:, :, None, :, :] + splits = torch.split(x, 32, dim=1) + interpolated_splits = [ + torch.nn.functional.interpolate(split, scale_factor=2.0, mode="nearest") for split in splits + ] + x = torch.cat(interpolated_splits, dim=1) else: # only interpolate 2D t = x.shape[2] x = rearrange(x, "b c t h w -> (b t) c h w") - x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest") + + splits = torch.split(x, 32, dim=1) + interpolated_splits = [ + torch.nn.functional.interpolate(split, scale_factor=2.0, mode="nearest") for split in splits + ] + x = torch.cat(interpolated_splits, dim=1) + x = rearrange(x, "(b t) c h w -> b c t h w", t=t) if self.with_conv: diff --git a/mindspeed_mm/models/diffusion/cogvideo_diffusion.py b/mindspeed_mm/models/diffusion/cogvideo_diffusion.py index 553a211f..4086bb71 100644 --- a/mindspeed_mm/models/diffusion/cogvideo_diffusion.py +++ b/mindspeed_mm/models/diffusion/cogvideo_diffusion.py @@ -30,25 +30,30 @@ def append_zero(x): class DiscreteSampling: - def __init__(self, discretization_config, num_idx, 
do_append_zero=False, flip=True, uniform_sampling=False): + def __init__(self, discretization_config, num_idx, do_append_zero=False, flip=True, uniform_sampling=False, + group_num=None): self.num_idx = num_idx self.sigmas = ZeroSNRDDPMDiscretization(**discretization_config)(num_idx, do_append_zero=do_append_zero, flip=flip) world_size = mpu.get_data_parallel_world_size() self.uniform_sampling = uniform_sampling - if self.uniform_sampling: - i = 1 - while True: - if world_size % i != 0 or num_idx % (world_size // i) != 0: - i += 1 - else: - self.group_num = world_size // i - break + if group_num: + self.group_num = group_num + else: + if self.uniform_sampling: + i = 1 + while True: + if world_size % i != 0 or num_idx % (world_size // i) != 0: + i += 1 + else: + self.group_num = world_size // i + break + if self.uniform_sampling: if self.group_num <= 0: - raise ValueError + raise ValueError("group_num should not be less than or equal to 0") if world_size % self.group_num != 0: - raise ValueError + raise ValueError("The remainder of world_size to group_num should be equal to 0") self.group_width = world_size // self.group_num # the number of rank in one group self.sigma_interval = self.num_idx // self.group_num @@ -91,6 +96,8 @@ def make_beta_schedule( ): if schedule == "linear": betas = torch.linspace(linear_start**0.5, linear_end**0.5, n_timestep, dtype=torch.float64) ** 2 + else: + raise NotImplementedError("Only support linear schedule") return betas.numpy() @@ -301,17 +308,21 @@ class CogVideoDiffusion(nn.Module): self.latents = latents self.x_start = noised_input - return noised_input * self.c_in, self.c_noise, idx + kwargs["model_kwargs"]["c_out"] = self.c_out + kwargs["model_kwargs"]["noised_start"] = self.x_start * self.c_skip + kwargs["model_kwargs"]["alphas_cumprod"] = self.alphas_cumprod_sqrt + + return noised_input * self.c_in, noise, idx - def training_losses(self, model_output, **kwargs): - model_output = model_output * self.c_out + self.x_start * self.c_skip + def training_losses(self, model_output, x_start, **kwargs): + model_output = model_output * kwargs['c_out'] + kwargs["noised_start"] - w = append_dims(1 / (1 - self.alphas_cumprod_sqrt ** 2), self.x_start.ndim) # v-pred + w = append_dims(1 / (1 - kwargs['alphas_cumprod'] ** 2), x_start.ndim) # v-pred if self.min_snr_value is not None: w = min(w, self.min_snr_value) - return self.get_loss(model_output, self.latents, w) + return self.get_loss(model_output, x_start, w) def get_loss(self, model_output, target, w): model_output = model_output.transpose(1, 2) diff --git a/mindspeed_mm/models/diffusion/diffusers_scheduler.py b/mindspeed_mm/models/diffusion/diffusers_scheduler.py index a3ab7916..8b372235 100644 --- a/mindspeed_mm/models/diffusion/diffusers_scheduler.py +++ b/mindspeed_mm/models/diffusion/diffusers_scheduler.py @@ -245,6 +245,9 @@ class DiffusersScheduler: with torch.no_grad(): noise_pred = model(timestep=current_timestep, **model_kwargs) + + if isinstance(noise_pred, tuple) or isinstance(noise_pred, list): + noise_pred = noise_pred[0] # perform guidance if use_dynamic_cfg: diff --git a/mindspeed_mm/models/predictor/dits/sat_dit.py b/mindspeed_mm/models/predictor/dits/sat_dit.py index 5d7f382e..1da75f58 100644 --- a/mindspeed_mm/models/predictor/dits/sat_dit.py +++ b/mindspeed_mm/models/predictor/dits/sat_dit.py @@ -1,21 +1,22 @@ -from curses import KEY_A1 +from functools import reduce +from operator import mul from typing import Optional, Tuple, Dict +from contextlib import nullcontext -from einops import 
rearrange, repeat import torch -from torch import nn -import torch.nn.functional as F from diffusers.models.embeddings import SinusoidalPositionalEmbedding +from einops import rearrange from megatron.core import mpu, tensor_parallel from megatron.training import get_args from megatron.training.arguments import core_transformer_config_from_args +from torch import nn from mindspeed_mm.models.common.ffn import FeedForward as TensorParallelFeedForward from mindspeed_mm.models.common.communications import split_forward_gather_backward, gather_forward_split_backward from mindspeed_mm.models.common.embeddings.pos_embeddings import Rotary3DPositionEmbedding from mindspeed_mm.models.common.embeddings.time_embeddings import TimeStepEmbedding from mindspeed_mm.models.common.module import MultiModalModule -from mindspeed_mm.models.common.embeddings.patch_embeddings import VideoPatchEmbed2D, VideoPatch2D +from mindspeed_mm.models.common.embeddings.patch_embeddings import VideoPatchEmbed2D, VideoPatch2D, VideoPatch3D from mindspeed_mm.models.common.attention import SelfAttentionBNSD, ParallelSelfAttentionSBH @@ -61,6 +62,7 @@ class SatDiT(MultiModalModule): attention_bias: bool = False, input_size: Tuple[int] = None, patch_size: Tuple[int] = None, + patch_type: str = "2D", activation_fn: str = "geglu", norm_type: str = "layer_norm", num_embeds_ada_norm: Optional[int] = None, @@ -70,14 +72,20 @@ class SatDiT(MultiModalModule): use_rope: bool = False, interpolation_scale: Tuple[float] = None, elementwise_affine: bool = True, - text_length=None, - text_hidden_size=None, - time_embed_dim=None, - concat_text_embed=None, - learnable_pos_embed=False, + text_length: int = None, + text_hidden_size: int = None, + time_embed_dim: int = None, + concat_text_embed: bool = None, + learnable_pos_embed: bool = False, + pre_process: bool = True, + post_process: bool = True, + global_layer_idx: Optional[Tuple] = None, **kwargs ): super().__init__(config=None) + self.pre_process = pre_process + self.post_process = post_process + self.input_size = input_size # Validate inputs and init args. if patch_size is not None: if norm_type not in ["ada_norm", "ada_norm_zero", "ada_norm_single", "qk_ln"]: @@ -88,28 +96,25 @@ class SatDiT(MultiModalModule): raise ValueError( f"When using a `patch_size` and this `norm_type` ({norm_type}), `num_embeds_ada_norm` cannot be None." 
) + self.patch_size = patch_size + self.patch_type = patch_type self.patch_size_t, self.patch_size_h, self.patch_size_w = patch_size self.norm_type = norm_type self.in_channels = in_channels self.out_channels = out_channels self.num_layers = num_layers self.concat_text_embed = concat_text_embed - t, h, w = input_size - seq_len = text_length + t // self.patch_size_t * h // self.patch_size_h * w // self.patch_size_w - seq_begin = (seq_len // mpu.get_context_parallel_world_size()) * mpu.get_context_parallel_rank() - seq_end = (seq_len // mpu.get_context_parallel_world_size()) * (mpu.get_context_parallel_rank() + 1) - if seq_end < text_length: - self.text_length = seq_len // mpu.get_context_parallel_world_size() - elif seq_begin > text_length: - self.text_length = 0 - else: - self.text_length = text_length - seq_begin + args = get_args() + self.sequence_parallel = args.sequence_parallel + self.text_length = self._get_text_length(input_size, text_length) + self.ori_text_length = text_length + self.seq_len = text_length + reduce(mul, input_size) // reduce(mul, patch_size) self.text_hidden_size = text_hidden_size self.elementwise_affine = elementwise_affine inner_dim = num_heads * head_dim + self.inner_dim = inner_dim self.time_embed_dim = time_embed_dim if time_embed_dim is not None else inner_dim - args = get_args() self.recompute_granularity = args.recompute_granularity self.distribute_saved_activations = args.distribute_saved_activations self.recompute_method = args.recompute_method @@ -125,9 +130,22 @@ class SatDiT(MultiModalModule): self.enable_sequence_parallelism = False # Initialize blocks + + if self.pre_process: # Init PatchEmbed - self.time_embed = TimeStepEmbedding(inner_dim, self.time_embed_dim) - self.patch_embed = VideoPatch2D(in_channels, inner_dim, self.patch_size_h) + self.time_embed = TimeStepEmbedding(inner_dim, self.time_embed_dim) + if self.patch_type == "3D": + self.patch_embed = VideoPatch3D(in_channels, inner_dim, self.patch_size) + else: + self.patch_embed = VideoPatch2D(in_channels, inner_dim, self.patch_size_h) + + # Init Projection + self.caption_projection = None + if text_hidden_size is not None: + self.caption_projection = nn.Linear(self.text_hidden_size, inner_dim) + + self.global_layer_idx = global_layer_idx if global_layer_idx is not None else tuple(range(num_layers)) + self.pos_embed = Rotary3DPositionEmbedding( hidden_size_head=head_dim, text_length=text_length, @@ -158,22 +176,52 @@ class SatDiT(MultiModalModule): enable_sequence_parallelism=self.enable_sequence_parallelism, time_embed_dim=self.time_embed_dim, text_length=self.text_length, + patch_size=self.patch_size ) for i in range(num_layers) ] ) - # Init Norm - self.norm_final = nn.LayerNorm(inner_dim, elementwise_affine=elementwise_affine, eps=1e-5) - self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(self.time_embed_dim, 2 * inner_dim, bias=True)) - self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=elementwise_affine, eps=1e-6) - self.proj_out = nn.Linear(inner_dim, - self.patch_size_t * self.patch_size_h * self.patch_size_w * self.out_channels) - # Init Projection - self.caption_projection = None - if text_hidden_size is not None: - self.caption_projection = nn.Linear(self.text_hidden_size, inner_dim) + + if self.post_process: + # Init Norm + self.norm_final = nn.LayerNorm(inner_dim, elementwise_affine=elementwise_affine, eps=1e-5) + config = core_transformer_config_from_args(args) + config.sequence_parallel = False + self.adaLN_modulation = nn.Sequential( + nn.SiLU(), + 
tensor_parallel.ColumnParallelLinear( + self.time_embed_dim, + 2 * inner_dim, + config=config, + init_method=config.init_method, + gather_output=False + ) + ) + self.norm_out = nn.LayerNorm(inner_dim, elementwise_affine=elementwise_affine, eps=1e-6) + self.proj_out = nn.Linear(inner_dim, reduce(mul, self.patch_size) * self.out_channels) + + for param in self.norm_final.parameters(): + setattr(param, "sequence_parallel", self.sequence_parallel) + for param in self.norm_out.parameters(): + setattr(param, "sequence_parallel", self.sequence_parallel) + print(self) + def _get_text_length(self, input_size, text_length): + t, h, w = input_size + cp = mpu.get_context_parallel_world_size() + tp_sp = mpu.get_tensor_model_parallel_world_size() if self.sequence_parallel else 1 + tp_sp_rank = mpu.get_tensor_model_parallel_rank() if self.sequence_parallel else 0 + seq_len = text_length + t * h * w // self.patch_size_t // self.patch_size_h // self.patch_size_w + seq_begin = (seq_len // cp // tp_sp) * (mpu.get_context_parallel_rank() * tp_sp + tp_sp_rank) + seq_end = (seq_len // cp // tp_sp) + seq_begin + if seq_end < text_length: + return seq_len // cp // tp_sp + elif seq_begin > text_length: + return 0 + else: + return text_length - seq_begin + def forward( self, latents: torch.Tensor, @@ -197,51 +245,53 @@ class SatDiT(MultiModalModule): class_labels: Used to indicate class labels conditioning. use_image_num: The number of images use for trainning. """ - b, _, t, _, _ = latents.shape + b = latents.shape[0] + t, h, w = self.input_size[0], self.input_size[1], self.input_size[2] frames = t - use_image_num vid_mask, img_mask = None, None prompt_vid_mask, prompt_img_mask = None, None + # RNG context + if self.sequence_parallel: + rng_context = tensor_parallel.get_cuda_rng_tracker().fork() + else: + rng_context = nullcontext() + # 1. 
Input frames = ((frames - 1) // self.patch_size_t + 1) if frames % 2 == 1 else frames // self.patch_size_t # patchfy - height, width = latents.shape[-2] // self.patch_size_h, latents.shape[-1] // self.patch_size_w - - if "masked_video" in kwargs.keys() and kwargs["masked_video"] is not None: - latents = torch.cat([latents, kwargs["masked_video"]], dim=1) - - added_cond_kwargs = {"resolution": None, "aspect_ratio": None} - latents_vid, latents_img, prompt_vid, prompt_img, timestep_vid, timestep_img, \ - embedded_timestep_vid, embedded_timestep_img = self._operate_on_patched_inputs( - latents, prompt, timestep, added_cond_kwargs, b, frames, use_image_num - ) - if self.concat_text_embed: - latents_vid = torch.cat((prompt_vid, latents_vid), dim=1) - - if self.enable_sequence_parallelism and latents_vid is not None and prompt_vid is not None: - latents_vid = rearrange(latents_vid, 'b s h -> s b h', b=b).contiguous() - latents_vid = split_forward_gather_backward(latents_vid, mpu.get_context_parallel_group(), dim=0, - grad_scale='down') + height, width = h // self.patch_size_h, w // self.patch_size_w + + if self.pre_process: + if "masked_video" in kwargs.keys() and kwargs["masked_video"] is not None: + latents = torch.cat([latents, kwargs["masked_video"]], dim=1) + + added_cond_kwargs = {"resolution": None, "aspect_ratio": None} + latents_vid, latents_img, prompt_vid, prompt_img, timestep_vid, timestep_img, \ + embedded_timestep_vid, embedded_timestep_img = self._operate_on_patched_inputs( + latents, prompt, timestep, frames) + if self.concat_text_embed: + latents_vid = torch.cat((prompt_vid, latents_vid), dim=1) + + if self.enable_sequence_parallelism or self.sequence_parallel: + latents_vid = latents_vid.transpose(0, 1).contiguous() + if self.enable_sequence_parallelism: + latents_vid = split_forward_gather_backward(latents_vid, mpu.get_context_parallel_group(), dim=0, + grad_scale='down') + if self.sequence_parallel: + latents_vid = tensor_parallel.scatter_to_sequence_parallel_region(latents_vid) + else: + latents_vid = latents + prompt_vid = prompt + timestep_vid = timestep frames = torch.tensor(frames) height = torch.tensor(height) width = torch.tensor(width) - if self.recompute_granularity == "full": - if latents_vid is not None: - latents_vid = self._checkpointed_forward( - latents_vid, - video_mask=vid_mask, - prompt=prompt_vid, - prompt_mask=prompt_vid_mask, - timestep=timestep_vid, - class_labels=class_labels, - frames=frames, - height=height, - width=width - ) - else: - for block in self.videodit_blocks: + + with rng_context: + if self.recompute_granularity == "full": if latents_vid is not None: - latents_vid = block( + latents_vid = self._checkpointed_forward( latents_vid, video_mask=vid_mask, prompt=prompt_vid, @@ -250,34 +300,50 @@ class SatDiT(MultiModalModule): class_labels=class_labels, frames=frames, height=height, - width=width + width=width, + rope_T=t, + rope_H=h, + rope_W=w ) + else: + for block in self.videodit_blocks: + if latents_vid is not None: + latents_vid = block( + latents_vid, + video_mask=vid_mask, + prompt=prompt_vid, + prompt_mask=prompt_vid_mask, + timestep=timestep_vid, + class_labels=class_labels, + frames=frames, + height=height, + width=width, + rope_T=torch.tensor(t / self.patch_size[0], dtype=torch.int), + rope_H=torch.tensor(h / self.patch_size[1], dtype=torch.int), + rope_W=torch.tensor(w / self.patch_size[2], dtype=torch.int) + ) - if self.enable_sequence_parallelism and latents_vid is not None: - latents_vid = rearrange(latents_vid, 's b h -> b s h', 
b=b).contiguous() - latents_vid = gather_forward_split_backward(latents_vid, mpu.get_context_parallel_group(), dim=1, - grad_scale='up') # 3. Output - output_vid, output_img = None, None - if latents_vid is not None: - output_vid = self._get_output_for_patched_inputs( - latents=latents_vid, - timestep=timestep_vid, - class_labels=class_labels, - embedded_timestep=embedded_timestep_vid, - num_frames=frames, - height=height, - width=width, - ) # [b, c, t, h, w] - - if output_vid is not None and output_img is not None: - output = torch.cat([output_vid, output_img], dim=2) - elif output_vid is not None: - output = output_vid - elif output_img is not None: - output = output_img - return output + if self.post_process: + output_vid, output_img = None, None + if latents_vid is not None: + output_vid = self._get_output_for_patched_inputs( + latents=latents_vid, + timestep=timestep_vid, + height=height, + width=width, + ) # [b, c, t, h, w] + + if output_vid is not None and output_img is not None: + output = torch.cat([output_vid, output_img], dim=2) + elif output_vid is not None: + output = output_vid + elif output_img is not None: + output = output_img + return output, prompt_vid, timestep_vid + else: + return latents_vid, prompt_vid, timestep_vid def _get_block(self, layer_number): return self.videodit_blocks[layer_number] @@ -292,7 +358,9 @@ class SatDiT(MultiModalModule): class_labels, frames, height, - width): + width, + **kwargs + ): """Forward method with activation checkpointing.""" def custom(start, end): @@ -321,7 +389,10 @@ class SatDiT(MultiModalModule): class_labels, frames, height, - width + width, + torch.tensor(kwargs["rope_T"] / self.patch_size[0], dtype=torch.int), + torch.tensor(kwargs["rope_H"] / self.patch_size[1], dtype=torch.int), + torch.tensor(kwargs["rope_W"] / self.patch_size[2], dtype=torch.int) ) layer_num += self.recompute_num_layers elif self.recompute_method == "block": @@ -338,7 +409,10 @@ class SatDiT(MultiModalModule): class_labels, frames, height, - width + width, + torch.tensor(kwargs["rope_T"] / self.patch_size[0], dtype=torch.int), + torch.tensor(kwargs["rope_H"] / self.patch_size[1], dtype=torch.int), + torch.tensor(kwargs["rope_W"] / self.patch_size[2], dtype=torch.int) ) else: block = self._get_block(layer_num) @@ -351,7 +425,10 @@ class SatDiT(MultiModalModule): class_labels=class_labels, frames=frames, height=height, - width=width + width=width, + rope_T=kwargs["rope_T"], + rope_H=kwargs["rope_H"], + rope_W=kwargs["rope_W"] ) else: raise ValueError("Invalid activation recompute method.") @@ -367,17 +444,23 @@ class SatDiT(MultiModalModule): buffers = tuple(self.buffers()) return buffers[0].dtype - def _operate_on_patched_inputs(self, latents, prompt, timestep, added_cond_kwargs, batch_size, frames, - use_image_num): + def _operate_on_patched_inputs(self, latents, prompt, timestep, frames): + b, _, t, h, w = latents.shape if self.pos_embed is not None: - latents_vid, latents_img = self.patch_embed(latents.to(self.dtype), prompt) + latents_vid, latents_img = self.patch_embed(latents.to(self.dtype), prompt, + rope_T=t // self.patch_size[0], + rope_H=h // self.patch_size[1], + rope_W=w // self.patch_size[2]) _, seq_len, _ = latents_vid.shape pos_emb = self.pos_embed.position_embedding_forward(latents.to(self.dtype), seq_length=seq_len - self.text_length) if pos_emb is not None: latents_vid = latents_vid + pos_emb else: - latents_vid, latents_img = self.patch_embed(latents.to(self.dtype), frames) + latents_vid, latents_img = 
self.patch_embed(latents.to(self.dtype), frames, + rope_T=t // self.patch_size[0], + rope_H=h // self.patch_size[1], + rope_W=w // self.patch_size[2]) timestep_vid, timestep_img = None, None embedded_timestep_vid, embedded_timestep_img = None, None prompt_vid, prompt_img = None, None @@ -395,12 +478,29 @@ class SatDiT(MultiModalModule): return latents_vid, latents_img, prompt_vid, prompt_img, timestep_vid, timestep_img, embedded_timestep_vid, embedded_timestep_img - def _get_output_for_patched_inputs(self, latents, timestep, class_labels, embedded_timestep, num_frames, - height=None, width=None): + def _get_output_for_patched_inputs(self, latents, timestep, height=None, width=None): x = self.norm_final(latents) + _scale_shift_table = self.adaLN_modulation(timestep)[0] + if self.sequence_parallel: + _scale_shift_table = tensor_parallel.mappings.all_gather_last_dim_from_tensor_parallel_region( + _scale_shift_table + ) + else: + _scale_shift_table = tensor_parallel.mappings.gather_from_tensor_model_parallel_region( + _scale_shift_table + ) + if self.sequence_parallel or self.enable_sequence_parallelism: + shift, scale = _scale_shift_table.unsqueeze(0).chunk(2, dim=2) + else: + shift, scale = _scale_shift_table.unsqueeze(1).chunk(2, dim=2) + x = self.norm_out(x) * (1 + scale) + shift + if self.sequence_parallel: + x = tensor_parallel.gather_from_sequence_parallel_region(x, tensor_parallel_output_grad=False) + if self.sequence_parallel or self.enable_sequence_parallelism: + x = x.transpose(0, 1).contiguous() + if self.enable_sequence_parallelism: + x = gather_forward_split_backward(x, mpu.get_context_parallel_group(), dim=1, grad_scale="up") x = x[:, self.pos_embed.text_length:, :] - shift, scale = self.adaLN_modulation(timestep).chunk(2, dim=1) - x = self.norm_out(x) * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1) x = self.proj_out(x) latents = x @@ -411,6 +511,63 @@ class SatDiT(MultiModalModule): c=self.out_channels).transpose(1, 2) return output + def initialize_pipeline_tensor_shapes(self): + args = get_args() + micro_batch_size = args.mm.data.dataloader_param.batch_size + dtype = args.params_dtype + latent_size = (self.out_channels, *self.input_size) + seq_len = self.seq_len + + if self.enable_sequence_parallelism or self.sequence_parallel: + prev_output_shape = (seq_len // mpu.get_context_parallel_world_size(), micro_batch_size, self.inner_dim) # SBH + else: + prev_output_shape = (micro_batch_size, seq_len, self.inner_dim) # BSH + + pipeline_tensor_shapes = [ + {'shape': prev_output_shape, 'dtype': dtype}, # prev_stage_output + {'shape': (micro_batch_size, *latent_size), 'dtype': dtype}, # latents + {'shape': (micro_batch_size, self.ori_text_length, self.inner_dim), 'dtype': dtype}, # prompt + {'shape': (micro_batch_size, self.time_embed_dim), 'dtype': dtype}, # embedded_timestep + {'shape': (micro_batch_size, *latent_size), 'dtype': torch.float32}, # video_diffusion: self.noised_start + {'shape': (micro_batch_size, 1, 1, 1, 1), 'dtype': torch.float32}, # video_diffusion: self.c_out + {'shape': (micro_batch_size), 'dtype': torch.float32}, # video_diffusion: self.alpha_cumprod + ] + return pipeline_tensor_shapes + + def pipeline_set_prev_stage_tensor(self, input_tensor_list, extra_kwargs=None): + """ + Process tensor from prev_pipeline_stage, and adjust to predictor input and training loss input. 
+ Input: + input_tensor_list: + model_output, latents, predictor_prompt, predictor_timesteps, + extra_kwargs (extra parameter for video_diffusion): + extra_kwargs["noised_start"], extra_kwargs["c_out"], extra_kwargs["alphas_cumprod"] + Return: + predictor_input_list (input for self.predictor in SoraModel.forward): + predictor_input_latent, predictor_timesteps, predictor_prompt, predictor_video_mask, prompt_prompt_mask + training_loss_input_list (input for self.compute_loss in SoraModel.forward): + latents, noised_latents, timesteps, noise, video_mask + """ + predictor_input_latent, latents, predictor_prompt, predictor_timesteps, \ + extra_kwargs["noised_start"], extra_kwargs["c_out"], extra_kwargs["alphas_cumprod"] = input_tensor_list + predictor_video_mask, predictor_prompt_mask = None, None + predictor_input_list = [predictor_input_latent, predictor_timesteps, predictor_prompt, predictor_video_mask, predictor_prompt_mask] + training_loss_input_list = [latents, None, predictor_timesteps, None, predictor_video_mask] + + return predictor_input_list, training_loss_input_list + + def pipeline_set_next_stage_tensor(self, input_list, output_list, extra_kwargs=None): + """ + Process predictor output tensors from curr pipeline stage, and adjust to next pipeline stage + Input: + input_list: [latents, noised_latents, timesteps, noise, video_mask] + output_list: [predictor_output, predictor_prompt, predictor_timesteps] + Return: + predictor_output, latents, predictor_prompt, predictor_timesteps, extra_kwargs["noised_start"], extra_kwargs["c_out"], extra_kwargs["alphas_cumprod"] + """ + return [output_list[0], input_list[0], output_list[1], output_list[2], \ + extra_kwargs["noised_start"], extra_kwargs["c_out"], extra_kwargs["alphas_cumprod"]] + class VideoDiTBlock(nn.Module): """ @@ -462,8 +619,12 @@ class VideoDiTBlock(nn.Module): time_embed_dim=None, text_length=None, pos_embed=None, + patch_size=None ): super().__init__() + self.patch_size = patch_size + args = get_args() + self.sequence_parallel = args.sequence_parallel self.time_embed_dim = time_embed_dim if time_embed_dim is not None else dim self.cross_attention_dim = cross_attention_dim if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None: @@ -487,7 +648,7 @@ class VideoDiTBlock(nn.Module): self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps) self.enable_sequence_parallelism = enable_sequence_parallelism - if self.enable_sequence_parallelism: + if self.enable_sequence_parallelism or self.sequence_parallel: attention = ParallelSelfAttentionSBH else: attention = SelfAttentionBNSD @@ -520,8 +681,8 @@ class VideoDiTBlock(nn.Module): ) # 3. Scale-shift. 
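
(Illustrative sketch, not part of the patch.) The scale-shift block that follows, like the output head earlier in this file, applies adaLN-style modulation: a SiLU + linear over the timestep embedding yields shift/scale (and gate) vectors that modulate the normalized hidden states as x * (1 + scale) + shift. In the patch the linear is a tensor-parallel ColumnParallelLinear with gather_output=False, so its sharded output is gathered across ranks before chunking; the single-device sketch below uses a plain nn.Linear instead.

# Sketch only: adaLN scale-shift modulation on one device. nn.Linear stands in
# for tensor_parallel.ColumnParallelLinear; in the patch the sharded output is
# gathered across TP ranks before the chunk() below.
import torch
from torch import nn

dim, time_embed_dim, batch, seq = 64, 128, 2, 16
scale_shift_table = nn.Sequential(nn.SiLU(), nn.Linear(time_embed_dim, 2 * dim))
norm_out = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)

timestep_emb = torch.randn(batch, time_embed_dim)
x = torch.randn(batch, seq, dim)

shift, scale = scale_shift_table(timestep_emb).unsqueeze(1).chunk(2, dim=2)  # (b, 1, dim) each
x = norm_out(x) * (1 + scale) + shift    # modulate the normalized hidden states
print(x.shape)                           # torch.Size([2, 16, 64])
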
- args = get_args() config = core_transformer_config_from_args(args) + config.sequence_parallel = False self.scale_shift_table = nn.Sequential( nn.SiLU(), tensor_parallel.ColumnParallelLinear( @@ -529,9 +690,13 @@ class VideoDiTBlock(nn.Module): 12 * dim, config=config, init_method=config.init_method, - gather_output=True + gather_output=False ) ) + for param in self.norm1.parameters(): + setattr(param, "sequence_parallel", self.sequence_parallel) + for param in self.norm2.parameters(): + setattr(param, "sequence_parallel", self.sequence_parallel) # let chunk size default to None self._chunk_size = None @@ -548,13 +713,25 @@ class VideoDiTBlock(nn.Module): frames: torch.int64 = None, height: torch.int64 = None, width: torch.int64 = None, + rope_T: torch.int64 = None, + rope_H: torch.int64 = None, + rope_W: torch.int64 = None, added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, ) -> torch.FloatTensor: # 1. Self-Attention frames = frames.item() height = height.item() width = width.item() - if self.enable_sequence_parallelism: + if self.enable_sequence_parallelism or self.sequence_parallel: + _scale_shift_table = self.scale_shift_table(timestep)[0] + if self.sequence_parallel: + _scale_shift_table = tensor_parallel.mappings.all_gather_last_dim_from_tensor_parallel_region( + _scale_shift_table + ) + else: + _scale_shift_table = tensor_parallel.mappings.gather_from_tensor_model_parallel_region( + _scale_shift_table + ) ( shift_msa, scale_msa, @@ -568,10 +745,14 @@ class VideoDiTBlock(nn.Module): text_shift_mlp, text_scale_mlp, text_gate_mlp, - ) = self.scale_shift_table(timestep)[0].unsqueeze(0).chunk(12, dim=2) + ) = _scale_shift_table.unsqueeze(0).chunk(12, dim=2) latents_text = latents[:self.text_length] latents_vid = latents[self.text_length:] else: + _scale_shift_table = self.scale_shift_table(timestep)[0] + _scale_shift_table = tensor_parallel.mappings.gather_from_tensor_model_parallel_region( + _scale_shift_table + ) ( shift_msa, scale_msa, @@ -585,20 +766,20 @@ class VideoDiTBlock(nn.Module): text_shift_mlp, text_scale_mlp, text_gate_mlp, - ) = self.scale_shift_table(timestep)[0].unsqueeze(1).chunk(12, dim=2) + ) = _scale_shift_table.unsqueeze(1).chunk(12, dim=2) latents_text = latents[:, :self.text_length] latents_vid = latents[:, self.text_length:] latents_vid = self.norm1(latents_vid) latents_text = self.norm1(latents_text) latents_vid = latents_vid * (1 + scale_msa) + shift_msa latents_text = latents_text * (1 + text_scale_msa) + text_shift_msa - if self.enable_sequence_parallelism: + if self.enable_sequence_parallelism or self.sequence_parallel: norm_latents = torch.cat((latents_text, latents_vid), dim=0) # (s_t + t * h/2 * w/2, b, n * d) else: norm_latents = torch.cat((latents_text, latents_vid), dim=1) # (b, s_t + t * h/2 * w/2, n * d) if self.pos_embed is not None and self.positional_embeddings is not None: - norm_latents = self.pos_embed(norm_latents) + norm_latents = self.pos_embed(norm_latents, rope_T=rope_T, rope_H=rope_H, rope_W=rope_W) attn_output = self.self_atten( query=norm_latents, @@ -607,8 +788,11 @@ class VideoDiTBlock(nn.Module): frames=frames, height=height, width=width, + rope_T=rope_T, + rope_H=rope_H, + rope_W=rope_W ) - if self.enable_sequence_parallelism: + if self.enable_sequence_parallelism or self.sequence_parallel: attn_vid_output = gate_msa * attn_output[self.text_length:] attn_text_output = text_gate_msa * attn_output[:self.text_length] attn_output = torch.cat((attn_text_output, attn_vid_output), dim=0) @@ -620,7 +804,7 @@ class 
VideoDiTBlock(nn.Module): latents = attn_output + latents # 2. Feed-forward - if self.enable_sequence_parallelism: + if self.enable_sequence_parallelism or self.sequence_parallel: latents_text = latents[:self.text_length] latents_vid = latents[self.text_length:] latents_text = self.norm2(latents_text) @@ -639,7 +823,7 @@ class VideoDiTBlock(nn.Module): ff_output = self.ff(norm_latents) - if self.enable_sequence_parallelism: + if self.enable_sequence_parallelism or self.sequence_parallel: ff_vid_output = gate_mlp * ff_output[self.text_length:] ff_text_output = text_gate_mlp * ff_output[:self.text_length] ff_output = torch.cat((ff_text_output, ff_vid_output), dim=0) diff --git a/mindspeed_mm/models/predictor/predict_model.py b/mindspeed_mm/models/predictor/predict_model.py index 83fc7724..f263b62e 100644 --- a/mindspeed_mm/models/predictor/predict_model.py +++ b/mindspeed_mm/models/predictor/predict_model.py @@ -1,5 +1,6 @@ from torch import nn from megatron.training.utils import print_rank_0 +from megatron.core import mpu from mindspeed_mm.models.common.checkpoint import load_checkpoint from .dits import VideoDiT, Latte, STDiT, STDiT3, VideoDitSparse, SatDiT, VideoDitSparseI2V, PTDiT @@ -28,6 +29,7 @@ class PredictModel(nn.Module): def __init__(self, config): super().__init__() model_cls = PREDICTOR_MODEL_MAPPINGS[config.model_id] + config = self._build_predictor_layers_config(config) self.predictor = model_cls(**config.to_dict()) if config.from_pretrained is not None: load_checkpoint(self.predictor, config.from_pretrained) @@ -35,3 +37,29 @@ class PredictModel(nn.Module): def get_model(self): return self.predictor + + def _build_predictor_layers_config(self, config): + if mpu.get_pipeline_model_parallel_world_size() <= 1: + return config + + pp_rank = mpu.get_pipeline_model_parallel_rank() + + if not hasattr(config, "pipeline_num_layers"): + raise ValueError(f"The `pipeline_num_layers` must be specified in the config for pipeline parallel") + if sum(config.pipeline_num_layers) != config.num_layers: + raise ValueError(f"The sum of `pipeline_num_layers` must be equal to the `num_layers`") + + local_num_layers = config.pipeline_num_layers[pp_rank] + if local_num_layers <= 0: + raise ValueError(f"for pp_rank {pp_rank}, the predictor layer is {local_num_layers}, " + f"which is invalid. 
") + + pipeline_start_idx = sum(config.pipeline_num_layers[:pp_rank]) + pipeline_end_idx = sum(config.pipeline_num_layers[:pp_rank + 1]) + + config.num_layers = local_num_layers + config.pre_process = mpu.is_pipeline_first_stage() + config.post_process = mpu.is_pipeline_last_stage() + config.global_layer_idx = tuple(range(pipeline_start_idx, pipeline_end_idx)) + + return config \ No newline at end of file diff --git a/mindspeed_mm/models/qwen2vl_model.py b/mindspeed_mm/models/qwen2vl_model.py index 0828165f..28d9a2fc 100644 --- a/mindspeed_mm/models/qwen2vl_model.py +++ b/mindspeed_mm/models/qwen2vl_model.py @@ -271,17 +271,17 @@ class Qwen2VLModel(MultiModalModule): def forward( - self, - input_ids: torch.Tensor, - pixel_values: Optional[torch.Tensor] = None, - image_grid_thw: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, - labels: Optional[torch.Tensor] = None, - inference_params: Optional[InferenceParams] = None, - decoder_input: Optional[torch.FloatTensor] = None, - position_ids: Optional[torch.LongTensor] = None, - packed_seq_params: Optional[PackedSeqParams] = None, - extra_block_kwargs: Optional[dict] = None, + self, + input_ids: torch.Tensor, + pixel_values: Optional[torch.Tensor] = None, + image_grid_thw: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + inference_params: Optional[InferenceParams] = None, + decoder_input: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + packed_seq_params: Optional[PackedSeqParams] = None, + extra_block_kwargs: Optional[dict] = None, ) -> Union[Dict[str, torch.Tensor], torch.Tensor]: vit_embeds = None diff --git a/mindspeed_mm/models/sora_model.py b/mindspeed_mm/models/sora_model.py index 86105c94..5482ae7a 100644 --- a/mindspeed_mm/models/sora_model.py +++ b/mindspeed_mm/models/sora_model.py @@ -52,14 +52,13 @@ class SoRAModel(nn.Module): self.config = core_transformer_config_from_args(get_args()) self.task = config.task if hasattr(config, "task") else "t2v" - self.pp_size = mpu.get_pipeline_model_parallel_world_size() if mpu.get_virtual_pipeline_model_parallel_world_size() is not None: raise NotImplementedError("Not support virtual_pipeline_model_parallel now. ") else: self.pp_rank = mpu.get_pipeline_model_parallel_rank() - self.pre_process = True - self.post_process = True + self.pre_process = mpu.is_pipeline_first_stage() + self.post_process = mpu.is_pipeline_last_stage() self.input_tensor = None # to avoid grad all-reduce and reduce-scatter in megatron, since SoRAModel has no embedding layer. self.share_embeddings_and_output_weights = False @@ -77,30 +76,7 @@ class SoRAModel(nn.Module): self.text_encoder.requires_grad_(False) self.diffusion = DiffusionModel(config.diffusion).get_model() - self.predictor = self._build_predictor_layers(config.predictor) - - def _build_predictor_layers(self, config): - self.predictor_cls = config.model_id - if self.pp_size <= 1: - return PredictModel(config).get_model() - - local_num_layers = config.pipeline_num_layers[self.pp_rank] - if local_num_layers <= 0: - raise ValueError(f"for pp_rank {self.pp_rank}, the predictor layer is {local_num_layers}, " - f"which is invalid. 
") - - pipeline_start_idx = sum(config.pipeline_num_layers[:self.pp_rank]) - pipeline_end_idx = sum(config.pipeline_num_layers[:self.pp_rank + 1]) - self.pre_process = pipeline_start_idx == 0 - self.post_process = pipeline_end_idx == config.num_layers - - config.num_layers = local_num_layers - config.pre_process, config.post_process = self.pre_process, self.post_process - config.global_layer_idx = tuple(range(pipeline_start_idx, pipeline_end_idx)) - if len(config.global_layer_idx) != local_num_layers: - raise ValueError("The number of global_layer_idx is not equal to local_num_layers") - - return PredictModel(config=config).get_model() + self.predictor = PredictModel(config.predictor).get_model() def set_input_tensor(self, input_tensor): self.input_tensor = input_tensor @@ -160,25 +136,24 @@ class SoRAModel(nn.Module): ) if self.post_process: - timesteps = timesteps.to(torch.int64) loss = self.compute_loss( output if isinstance(output, torch.Tensor) else output[0], latents, noised_latents, timesteps, noise, - video_mask + video_mask, + **kwargs ) return [loss] - timesteps = timesteps.to(torch.bfloat16) return self.predictor.pipeline_set_next_stage_tensor( input_list=[latents, noised_latents, timesteps, noise, video_mask], output_list=output, extra_kwargs=kwargs) def compute_loss( - self, model_output, latents, noised_latents, timesteps, noise, video_mask + self, model_output, latents, noised_latents, timesteps, noise, video_mask, **kwargs ): """compute diffusion loss""" loss_dict = self.diffusion.training_losses( @@ -188,6 +163,7 @@ class SoRAModel(nn.Module): noise=noise, t=timesteps, mask=video_mask, + **kwargs ) return loss_dict diff --git a/mindspeed_mm/tasks/evaluation/eval_datasets/__init__.py b/mindspeed_mm/tasks/evaluation/eval_datasets/__init__.py index d4c67aa9..39bb2ccc 100644 --- a/mindspeed_mm/tasks/evaluation/eval_datasets/__init__.py +++ b/mindspeed_mm/tasks/evaluation/eval_datasets/__init__.py @@ -1,6 +1,6 @@ -from .datasets_base import BaseEvalDataset from .datasets_mmmu import MMMUEvalDataset from .datasets_vqa import VQAEvalDataset +from .datasets_ai2d import AI2DEvalDataset -eval_dataset_dict = {"mmmu_dev_val": MMMUEvalDataset, "ai2d_test": BaseEvalDataset, "chartqa_test": VQAEvalDataset, +eval_dataset_dict = {"mmmu_dev_val": MMMUEvalDataset, "ai2d_test": AI2DEvalDataset, "chartqa_test": VQAEvalDataset, "docvqa_val": VQAEvalDataset} diff --git a/mindspeed_mm/tasks/evaluation/eval_prompt/__init__.py b/mindspeed_mm/tasks/evaluation/eval_prompt/__init__.py index db378600..c625c440 100644 --- a/mindspeed_mm/tasks/evaluation/eval_prompt/__init__.py +++ b/mindspeed_mm/tasks/evaluation/eval_prompt/__init__.py @@ -1,4 +1,3 @@ -from .build_prompt_base import BasePromptTemplate from .build_prompt_llava import LlavaPromptTemplate from .build_prompt_internvl import InternvlPromptTemplate from .build_prompt_qwen2vl import Qwen2vlPromptTemplate diff --git a/mindspeed_mm/tasks/evaluation/eval_prompt/build_prompt_base.py b/mindspeed_mm/tasks/evaluation/eval_prompt/build_prompt_base.py index 1eaff983..6d02b344 100644 --- a/mindspeed_mm/tasks/evaluation/eval_prompt/build_prompt_base.py +++ b/mindspeed_mm/tasks/evaluation/eval_prompt/build_prompt_base.py @@ -1,3 +1,4 @@ +from typing import Callable import string import torch @@ -12,6 +13,9 @@ class BasePromptTemplate: def __init__(self): self.device = torch.cuda.current_device() + def build_prompt(self, line, dump_image: Callable, dataset_name=None): + raise NotImplementedError('you must implement build_prompt') + @staticmethod def 
check_content_type(message): """Check the content type of the input. Four types are allowed: str, dict, ListOfString, ListOfDict. diff --git a/mindspeed_mm/tasks/evaluation/eval_prompt/build_prompt_internvl.py b/mindspeed_mm/tasks/evaluation/eval_prompt/build_prompt_internvl.py index bd8179a6..b759553c 100644 --- a/mindspeed_mm/tasks/evaluation/eval_prompt/build_prompt_internvl.py +++ b/mindspeed_mm/tasks/evaluation/eval_prompt/build_prompt_internvl.py @@ -3,9 +3,6 @@ from typing import Callable from mindspeed_mm.tasks.evaluation.eval_prompt.build_prompt_base import BasePromptTemplate from mindspeed_mm.tasks.evaluation.eval_datasets.datasets_base import datasets_type -IMAGENET_MEAN = (0.485, 0.456, 0.406) -IMAGENET_STD = (0.229, 0.224, 0.225) - class InternvlPromptTemplate(BasePromptTemplate): diff --git a/mindspeed_mm/tasks/inference/pipeline/cogvideox_pipeline.py b/mindspeed_mm/tasks/inference/pipeline/cogvideox_pipeline.py index 7af286b7..67a2d785 100644 --- a/mindspeed_mm/tasks/inference/pipeline/cogvideox_pipeline.py +++ b/mindspeed_mm/tasks/inference/pipeline/cogvideox_pipeline.py @@ -48,7 +48,6 @@ class CogVideoXPipeline(MMPipeline, InputsCheckMixin, MMEncoderMixin): self.num_frames, self.height, self.width = config.get("input_size", [49, 480, 720]) self.generator = torch.Generator().manual_seed(config.get("seed", 42)) self.num_videos_per_prompt = 1 - self.max_sequence_length = 226 self.guidance_scale = config.get("guidance_scale", 6.0) self.scheduler.use_dynamic_cfg = config.get("use_dynamic_cfg", True) @@ -105,10 +104,7 @@ class CogVideoXPipeline(MMPipeline, InputsCheckMixin, MMEncoderMixin): negative_prompt_embeds: Optional[torch.FloatTensor] = None, **kwargs ): - if self.num_frames > 49: - raise ValueError( - "The number of frames must be less than 49 for now due to static positional embeddings. This will be updated in the future to remove this limitation." - ) + self.max_sequence_length = kwargs.pop("max_sequence_length", 226) height = self.height or self.predict_model.config.sample_size * self.vae_scale_factor_spatial width = self.width or self.predict_model.config.sample_size * self.vae_scale_factor_spatial diff --git a/mindspeed_mm/tools/README.md b/mindspeed_mm/tools/README.md index 2a7db53b..9115335e 100644 --- a/mindspeed_mm/tools/README.md +++ b/mindspeed_mm/tools/README.md @@ -55,6 +55,7 @@ prof.stop() --start_step # 设置启动采集的步数 --end_step # 设置结束采集的步数 --data_simplification # 采集时是否采用简化数据 + --aic_metrics_type # 采集模式,目前支持PipeUtilization和ArithmeticUtilization两种,默认采用PipeUtilization ``` 3. 运行模型并采集profiling文件 diff --git a/mindspeed_mm/tools/profiler.py b/mindspeed_mm/tools/profiler.py index 0f24b593..e69c0507 100644 --- a/mindspeed_mm/tools/profiler.py +++ b/mindspeed_mm/tools/profiler.py @@ -1,4 +1,5 @@ -# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# coding=utf-8 +# Copyright (c) 2024, HUAWEI CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -64,7 +65,9 @@ class Profiler: self.sp_data_simplification = config.static_param.data_simplification self.dp_config_path = config.dynamic_param.config_path - + + self.aic_metrics_type = config.static_param.aic_metrics_type + if self.profile_type == "static": if self.sp_level == 'level0': profiler_level = torch_npu.profiler.ProfilerLevel.Level0 @@ -75,9 +78,14 @@ class Profiler: else: raise ValueError(f"profiler_level only supports level0," f" 1, and 2, but gets {self.sp_level}") - + if self.aic_metrics_type == 'PipeUtilization': + aic_metrics_type = torch_npu.profiler.AiCMetrics.PipeUtilization + elif self.aic_metrics_type == 'ArithmeticUtilization': + aic_metrics_type = torch_npu.profiler.AiCMetrics.ArithmeticUtilization + else: + raise ValueError(f"aic_metrics_type only supports PipeUtilization and ArithmeticUtilization") experimental_config = torch_npu.profiler._ExperimentalConfig( - aic_metrics=torch_npu.profiler.AiCMetrics.ArithmeticUtilization, + aic_metrics=aic_metrics_type, profiler_level=profiler_level, data_simplification=self.sp_data_simplification, ) diff --git a/mindspeed_mm/tools/tools.json b/mindspeed_mm/tools/tools.json index caf4bd02..0858b403 100644 --- a/mindspeed_mm/tools/tools.json +++ b/mindspeed_mm/tools/tools.json @@ -12,7 +12,8 @@ "save_path": "./npu_profiling", "start_step": 10, "end_step": 11, - "data_simplification": false + "data_simplification": false, + "aic_metrics_type": "PipeUtilization" }, "dynamic_param": { "config_path": "path to dynamic config folder" diff --git a/redis_utils.py b/redis_utils.py deleted file mode 100644 index 3d6eb621..00000000 --- a/redis_utils.py +++ /dev/null @@ -1,86 +0,0 @@ -import redis -import json -import time - -from concurrent.futures import ThreadPoolExecutor - - -class RedisManager: - def __init__(self, host, port, db, model_name, ping_interval=10): - self.conn_pool = redis.ConnectionPool(host=host, port=int(port), db=db) - self.last_ping_time = time.time() - self.model_name = model_name - self.lock_prefix = 'lock:' - self.ping_interval = ping_interval - - def _ping(self): - current_time = time.time() - if current_time - self.last_ping_time > self.ping_interval: - redis_conn = redis.Redis(connection_pool=self.conn_pool) - redis_conn.ping() - redis_conn.close() - self.last_ping_time = current_time - - def write_data_to_result(self, model_name, message_id, data): - while True: - try: - self._ping() - break - except Exception as _: - continue - r = redis.Redis(connection_pool=self.conn_pool) - if message_id != "dummy": - r.hset(model_name + "_result", message_id, json.dumps(data)) - r.close() - - def fetch_single_request_without_queue(self): - r = redis.Redis(connection_pool=self.conn_pool) - - request_data = None - - # Directly access the fixed hash - message_ids = r.hkeys(self.model_name) - for message_id in message_ids: - lock_key = self.lock_prefix + message_id.decode("utf-8") - # Try to acquire a lock for the field - if r.setnx(lock_key, 1): # If successfully set, we have acquired the lock - r.expire(lock_key, 10) # Set an expiration time for the lock to avoid deadlocks - try: - result = r.hget(self.model_name, message_id) - if result: - request_data = json.loads(result.decode("utf-8")) - request_data["main_key"] = self.model_name - request_data["message_id"] = message_id.decode("utf-8") - # Delete the field after retrieval - r.hdel(self.model_name, message_id) - finally: - # Release the lock - r.delete(lock_key) - break - - r.close() - - return request_data - - def fetch_batch_of_requests(self, batch_size=4): - while 
True: - try: - self._ping() - break - except Exception as e: - print("Fail to connect Redis, retry") - continue - - batch = [] - - with ThreadPoolExecutor(max_workers=batch_size) as executor: - futures = [ - executor.submit(self.fetch_single_request_without_queue) for _ in range(batch_size) - ] - - for future in futures: - result = future.result() - if result: - batch.append(result) - - return batch diff --git a/tests/st/shell_scripts/finetune_internvl2_8B.sh b/tests/st/shell_scripts/finetune_internvl2_8B.sh index 8328cb77..567c957d 100644 --- a/tests/st/shell_scripts/finetune_internvl2_8B.sh +++ b/tests/st/shell_scripts/finetune_internvl2_8B.sh @@ -6,15 +6,14 @@ export COMBINED_ENABLE=1 export CPU_AFFINITY_CONF=1 export HCCL_CONNECT_TIMEOUT=1200 export CUDA_DEVICE_MAX_CONNECTIONS=1 -export HOST_CACHE_CAPACITY=20 export ACLNN_CACHE_LIMIT=100000 -GPUS_PER_NODE=8 +NPUS_PER_NODE=8 MASTER_ADDR=localhost MASTER_PORT=6000 NNODES=1 NODE_RANK=0 -WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) MBS=1 GRAD_ACC_STEP=64 @@ -38,7 +37,7 @@ MM_ARGS=" " DISTRIBUTED_ARGS=" - --nproc_per_node $GPUS_PER_NODE \ + --nproc_per_node $NPUS_PER_NODE \ --nnodes $NNODES \ --node_rank $NODE_RANK \ --master_addr $MASTER_ADDR \ diff --git a/tests/st/shell_scripts/finetune_qwen2vl_7B.sh b/tests/st/shell_scripts/finetune_qwen2vl_7B.sh index 56f58202..a8fe877b 100644 --- a/tests/st/shell_scripts/finetune_qwen2vl_7B.sh +++ b/tests/st/shell_scripts/finetune_qwen2vl_7B.sh @@ -8,7 +8,6 @@ export CPU_AFFINITY_CONF=1 export HCCL_CONNECT_TIMEOUT=1200 export NPU_ASD_ENABLE=0 export ASCEND_LAUNCH_BLOCKING=0 -export HOST_CACHE_CAPACITY=20 export ACLNN_CACHE_LIMIT=100000 NPUS_PER_NODE=8 diff --git a/tests/st/shell_scripts/inference_qwen2vl_7b_pp1.sh b/tests/st/shell_scripts/inference_qwen2vl_7b_pp1.sh index db0193e6..990a1e9c 100644 --- a/tests/st/shell_scripts/inference_qwen2vl_7b_pp1.sh +++ b/tests/st/shell_scripts/inference_qwen2vl_7b_pp1.sh @@ -1,8 +1,6 @@ #!/bin/bash source /usr/local/Ascend/ascend-toolkit/set_env.sh -# 通过此配置选择使用的NPU卡 -# export ASCEND_RT_VISIBLE_DEVICES=1 export CUDA_DEVICE_MAX_CONNECTIONS=1 export ASCEND_SLOG_PRINT_TO_STDOUT=0 export ASCEND_GLOBAL_LOG_LEVEL=3 @@ -12,7 +10,6 @@ export CPU_AFFINITY_CONF=1 export HCCL_CONNECT_TIMEOUT=1200 export NPU_ASD_ENABLE=0 export ASCEND_LAUNCH_BLOCKING=0 -export HOST_CACHE_CAPACITY=20 export ACLNN_CACHE_LIMIT=100000 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True diff --git a/tests/st/shell_scripts/inference_qwen2vl_7b_pp4.sh b/tests/st/shell_scripts/inference_qwen2vl_7b_pp4.sh index 023bbf5b..a7bf929e 100644 --- a/tests/st/shell_scripts/inference_qwen2vl_7b_pp4.sh +++ b/tests/st/shell_scripts/inference_qwen2vl_7b_pp4.sh @@ -1,8 +1,6 @@ #!/bin/bash source /usr/local/Ascend/ascend-toolkit/set_env.sh -# 通过此配置选择使用的NPU卡 -# export ASCEND_RT_VISIBLE_DEVICES=1 export CUDA_DEVICE_MAX_CONNECTIONS=1 export ASCEND_SLOG_PRINT_TO_STDOUT=0 export ASCEND_GLOBAL_LOG_LEVEL=3 @@ -12,7 +10,6 @@ export CPU_AFFINITY_CONF=1 export HCCL_CONNECT_TIMEOUT=1200 export NPU_ASD_ENABLE=0 export ASCEND_LAUNCH_BLOCKING=0 -export HOST_CACHE_CAPACITY=20 export ACLNN_CACHE_LIMIT=100000 export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True diff --git a/tests/st/shell_scripts/pretrain_llava1_5.sh b/tests/st/shell_scripts/pretrain_llava1_5.sh index 82b900ec..7a50120d 100644 --- a/tests/st/shell_scripts/pretrain_llava1_5.sh +++ b/tests/st/shell_scripts/pretrain_llava1_5.sh @@ -7,12 +7,12 @@ export COMBINED_ENABLE=1 export CPU_AFFINITY_CONF=1 export HCCL_CONNECT_TIMEOUT=1200 
-GPUS_PER_NODE=4 +NPUS_PER_NODE=4 MASTER_ADDR=localhost MASTER_PORT=6000 NNODES=1 NODE_RANK=0 -WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES)) +WORLD_SIZE=$(($NPUS_PER_NODE*$NNODES)) TP=1 PP=1 @@ -27,7 +27,7 @@ MM_MODEL="$BASEPATH/tests/st/run_configs/pretrain_llava1_5/model.json" MM_TOOL="$BASEPATH/mindspeed_mm/tools/tools.json" DISTRIBUTED_ARGS=" - --nproc_per_node $GPUS_PER_NODE \ + --nproc_per_node $NPUS_PER_NODE \ --nnodes $NNODES \ --node_rank $NODE_RANK \ --master_addr $MASTER_ADDR \ diff --git a/tests/ut/models/common/embeddings/test_cogvideox_pos_emb.py b/tests/ut/models/common/embeddings/test_cogvideox_pos_emb.py new file mode 100644 index 00000000..060a4cf7 --- /dev/null +++ b/tests/ut/models/common/embeddings/test_cogvideox_pos_emb.py @@ -0,0 +1,10 @@ +import mindspeed.megatron_adaptor + +from mindspeed_mm.models.common.embeddings.pos_embeddings import Rotary3DPositionEmbedding +from tests.ut.utils import judge_expression + + +class TestCogVideoXRope: + def test_init_rope_1_0_t2v(self): + rope = Rotary3DPositionEmbedding(30, 45, 1, 3072, 64, 226, learnable_pos_embed=False) + judge_expression(isinstance(rope, Rotary3DPositionEmbedding)) diff --git a/tests/ut/models/diffusion/test_cogvideo_diffusion.py b/tests/ut/models/diffusion/test_cogvideo_diffusion.py new file mode 100644 index 00000000..2fb7d3d0 --- /dev/null +++ b/tests/ut/models/diffusion/test_cogvideo_diffusion.py @@ -0,0 +1,121 @@ +import torch +import pytest +import numpy as np + +from mindspeed_mm.models.diffusion.cogvideo_diffusion import append_dims, default, append_zero, \ + generate_roughly_equally_spaced_steps, EpsWeighting, make_beta_schedule +from tests.ut.utils import judge_expression + + +class TestCogvideoDiffusion: + + def test_append_dims_no_append_needed(self): + """Test when no dimensions need to be appended.""" + x = torch.tensor([1, 2, 3]) + result = append_dims(x, 1) + judge_expression(result.shape == (3,)) + judge_expression(torch.equal(result, x)) + + def test_append_dims_append_one_dim(self): + """Test appending one dimension.""" + x = torch.tensor([1, 2, 3]) + result = append_dims(x, 2) + expected_shape = (3, 1) + judge_expression(result.shape == expected_shape) + + def test_append_dims_append_multiple_dims(self): + """Test appending multiple dimensions.""" + x = torch.tensor([1, 2, 3]) + target_dims = 5 + result = append_dims(x, target_dims) + expected_shape = (3,) + (1,) * (target_dims - x.ndim) + judge_expression(result.shape == expected_shape) + + def test_append_dims_already_higher_dims(self): + """Test when the input already has more dimensions than target_dims.""" + x = torch.rand(2, 3, 4) + with pytest.raises(ValueError): + append_dims(x, 2) + + def test_append_dims_target_dims_equal_input_dims(self): + """Test when target_dims is equal to the number of dimensions in the input.""" + x = torch.rand(2, 3, 4) + result = append_dims(x, 3) + judge_expression(result.shape == (2, 3, 4)) + judge_expression(torch.equal(result, x)) + + def test_default_val_not_none(self): + """Test when val is not None.""" + result = default(5, lambda: 10) + judge_expression(result == 5) + + def test_default_val_none_d_is_function(self): + """Test when val is None and d is a function.""" + def func(): + return "default_value" + + result = default(None, func) + judge_expression(result == "default_value") + + def test_default_val_none_d_is_none(self): + """Test when both val and d are None.""" + result = default(None, None) + judge_expression(result is None) + + def test_append_zero_regular_tensor(self): + """Test appending 
zero to a regular tensor.""" + x = torch.tensor([1, 2, 3]) + result = append_zero(x) + expected = torch.tensor([1, 2, 3, 0]) + judge_expression(torch.equal(result, expected)) + + def test_append_zero_multidimensional_tensor(self): + """Test appending zero to a multidimensional tensor.""" + x = torch.tensor([[1, 2], [3, 4]]) + with pytest.raises(RuntimeError): + append_zero(x) + + def test_append_zero_empty_tensor(self): + """Test appending zero to an empty tensor.""" + x = torch.tensor([]) + result = append_zero(x) + expected = torch.tensor([0]) + judge_expression(torch.equal(result, expected)) + + def test_generate_roughly_equally_spaced_steps_case(self): + """Test with a normal case.""" + num_substeps = 5 + max_step = 10 + result = generate_roughly_equally_spaced_steps(num_substeps, max_step) + expected = np.array([1, 3, 5, 7, 9]) + judge_expression(np.array_equal(result, expected)) + + def test_eps_weighting_positive_input(self): + """Test with a positive input.""" + weighting = EpsWeighting() + sigma = 2.0 + result = weighting(sigma) + expected = sigma ** -2.0 + judge_expression(np.isclose(result, expected)) + + def test_make_beta_schedule_linear(self): + """Test with linear schedule.""" + n_timestep = 10 + betas = make_beta_schedule("linear", n_timestep) + expected = np.linspace(1e-4 ** 0.5, 2e-2 ** 0.5, n_timestep) ** 2 + judge_expression(np.allclose(betas, expected)) + + def test_make_beta_schedule_custom_params(self): + """Test with custom linear_start and linear_end.""" + n_timestep = 10 + linear_start = 1e-3 + linear_end = 5e-2 + betas = make_beta_schedule("linear", n_timestep, linear_start, linear_end) + expected = np.linspace(linear_start ** 0.5, linear_end ** 0.5, n_timestep) ** 2 + judge_expression(np.allclose(betas, expected)) + + def test_make_beta_schedule_no_linear(self): + """Test with no linear schedule.""" + n_timestep = 10 + with pytest.raises(NotImplementedError): + make_beta_schedule("cosine", n_timestep) -- Gitee From 51d432641fa9f8db5efe12bc55f1a37f075cb6e3 Mon Sep 17 00:00:00 2001 From: Luo Yiyang Date: Mon, 30 Dec 2024 17:21:53 +0800 Subject: [PATCH 4/4] fix conflict --- .../i2v_1.0/model_cogvideox_i2v.json | 128 ++++++++++++++++++ .../t2v_1.0/model_cogvideox_t2v.json | 1 - .../models/ae/contextparallelcausalvae.py | 2 +- 3 files changed, 129 insertions(+), 2 deletions(-) diff --git a/examples/cogvideox/i2v_1.0/model_cogvideox_i2v.json b/examples/cogvideox/i2v_1.0/model_cogvideox_i2v.json index e69de29b..56aa80ad 100644 --- a/examples/cogvideox/i2v_1.0/model_cogvideox_i2v.json +++ b/examples/cogvideox/i2v_1.0/model_cogvideox_i2v.json @@ -0,0 +1,128 @@ +{ + "frames": 25, + "resolution": [480, 720], + "allow_tf32": true, + "allow_internal_format":false, + "load_video_features": false, + "load_text_features": false, + "task": "i2v", + "predictor": { + "model_id": "satdit", + "from_pretrained": null, + "dtype": "bf16", + "num_layers": 42, + "num_heads": 48, + "head_dim": 64, + "in_channels": 32, + "out_channels": 16, + "dropout": 0.0, + "cross_attention_dim": null, + "attention_bias": true, + "input_size": [7, 60, 90], + "patch_size": [1, 2, 2], + "activation_fn": "gelu-approximate", + "num_embeds_ada_norm": 1000, + "norm_type": "qk_ln", + "norm_elementwise_affine": true, + "norm_eps": 1e-5, + "caption_channels": null, + "time_embed_dim": 512, + "text_length": 226, + "text_hidden_size": 4096, + "concat_text_embed": true, + "interpolation_scale": [1.0, 1.0, 1.0], + "learnable_pos_embed": true, + "use_rope": true + }, + "diffusion": { + "model_id": 
"cogvideo_diffusion", + "sigma_sampler_config": { + "uniform_sampling": true, + "num_idx": 1000, + "discretization_config":{ + "shift_scale": 1.0 + } + }, + "denoiser_config": { + "num_idx": 1000, + "quantize_c_noise": false, + "discretization_config":{ + "shift_scale": 1.0 + } + } + }, + "text_encoder": { + "model_id": "T5", + "hub_backend": "hf", + "from_pretrained": "5b-cogvideo", + "dtype": "bf16", + "load_in_8bit": false, + "low_cpu_mem_usage": true, + "ucg_rate": 0.1 + }, + "ae": { + "model_id": "contextparallelcasualvae", + "from_pretrained": "3d-vae.pt", + "cp_size": 1, + "dtype": "bf16", + "z_channels": 16, + "conv_padding": 0, + "num_res_blocks": 3, + "hidden_size_mult": [1,2,2,4], + "encoder_attention": "", + "encoder_nonlinearity": "swish", + "encoder_conv_in": "ContextParallelCausalConv3d", + "encoder_conv_out": "ContextParallelCausalConv3d", + "encoder_mid_resnet": "ContextParallelResnetBlock3D", + "encoder_resnet_blocks": [ + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D" + ], + "encoder_spatial_downsample": [ + "DownSample3D", + "DownSample3D", + "DownSample3D", + "" + ], + "encoder_temporal_downsample": [ + "", + "", + "", + "" + ], + "decoder_attention": "", + "decoder_nonlinearity": "swish", + "decoder_conv_in": "ContextParallelCausalConv3d", + "decoder_conv_out": "ContextParallelCausalConv3d", + "decoder_mid_resnet": "ContextParallelResnetBlock3D", + "decoder_resnet_blocks": [ + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D", + "ContextParallelResnetBlock3D" + ], + "decoder_spatial_upsample": [ + "", + "Upsample3D", + "Upsample3D", + "Upsample3D" + ], + "decoder_temporal_upsample": [ + "", + "", + "", + "" + ], + "encoder_gather_norm": true, + "decoder_gather_norm": true, + "use_quant_layer": false, + "i2v_processor": { + "processor_id": "cogvideox_i2v_processor", + "noised_image_all_concat": false, + "noised_image_dropout": 0.05, + "noised_image_input": true + } + } +} \ No newline at end of file diff --git a/examples/cogvideox/t2v_1.0/model_cogvideox_t2v.json b/examples/cogvideox/t2v_1.0/model_cogvideox_t2v.json index cfe12471..367f3f06 100644 --- a/examples/cogvideox/t2v_1.0/model_cogvideox_t2v.json +++ b/examples/cogvideox/t2v_1.0/model_cogvideox_t2v.json @@ -37,7 +37,6 @@ "model_id": "cogvideo_diffusion", "sigma_sampler_config": { "uniform_sampling": true, - "group_num": 8, "num_idx": 1000, "discretization_config":{ "shift_scale": 1.0 diff --git a/mindspeed_mm/models/ae/contextparallelcausalvae.py b/mindspeed_mm/models/ae/contextparallelcausalvae.py index bad49ca0..7716f483 100644 --- a/mindspeed_mm/models/ae/contextparallelcausalvae.py +++ b/mindspeed_mm/models/ae/contextparallelcausalvae.py @@ -773,7 +773,7 @@ class Decoder(nn.Module): for i_block in range(self.num_res_blocks + 1): h = self.up[i_level].block[i_block](h, zq=zq, enable_cp=enable_cp) if len(self.up[i_level].attn) > 0: - h = self.up[i_level].attn[i_block](h, zq=zq, enable_cp=enable_cp) + h = self.up[i_level].attn[i_block](h, zq=zq) if hasattr(self.up[i_level], "upsample"): h = self.up[i_level].upsample(h, enable_cp=enable_cp) if hasattr(self.up[i_level], "time_upsample"): -- Gitee
MindSpeed-MM model list (multimodal generation):

| Model | Task | Cluster | Precision | Throughput (SPS) | Status |
| --- | --- | --- | --- | --- | --- |
| OpenSora 1.0 5.5B | Pre-training | / | / | / | Pass |
| CogVideoX-T2V 5B | Pre-training | 1x8 | BF16 | 0.37 / 0.46 | Pass |
| CogVideoX-I2V 5B | Pre-training | 1x8 | BF16 | 0.37 / 0.46 | Pass |
| CogVideoX-I2V 5B | Pre-training (affinity scenario) | 1x8 | BF16 | 0.92 / 0.96 | Pass |
| 76B | Full-parameter fine-tuning | 8x16 | BF16 | / | / |
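The VideoDiTBlock changes earlier in the series project the timestep embedding through scale_shift_table (SiLU plus a column-parallel linear producing 12 * dim features), gather it when tensor or sequence parallelism is enabled, and then chunk it into shift/scale/gate triples that modulate the text and video tokens separately. The toy example below reproduces that adaLN-style modulation on a single device with a plain torch.nn.Linear; the shapes, the chunk names and the batch-first layout are assumptions for illustration, not the exact tensors used in the model.

```python
# Toy single-device version of the scale/shift/gate modulation in VideoDiTBlock:
# project the timestep embedding to 12 * dim, split it into 12 modulation tensors,
# and apply x * (1 + scale) + shift to the video and text token streams separately.
import torch
from torch import nn

dim, time_embed_dim, batch = 64, 128, 2
seq_text, seq_vid = 8, 16

scale_shift_table = nn.Sequential(nn.SiLU(), nn.Linear(time_embed_dim, 12 * dim))

timestep = torch.randn(batch, 1, time_embed_dim)        # (b, 1, t_dim)
latents = torch.randn(batch, seq_text + seq_vid, dim)   # (b, s_text + s_vid, d)

(shift_msa, scale_msa, gate_msa,
 shift_mlp, scale_mlp, gate_mlp,
 text_shift_msa, text_scale_msa, text_gate_msa,
 text_shift_mlp, text_scale_mlp, text_gate_mlp) = scale_shift_table(timestep).chunk(12, dim=2)

latents_text, latents_vid = latents[:, :seq_text], latents[:, seq_text:]
latents_vid = latents_vid * (1 + scale_msa) + shift_msa              # broadcast over sequence dim
latents_text = latents_text * (1 + text_scale_msa) + text_shift_msa
print(latents_vid.shape, latents_text.shape)  # (2, 16, 64) and (2, 8, 64)
```

The gate tensors play the same role after attention and the feed-forward: the block output is multiplied by gate_msa or gate_mlp before being added back to the residual stream, which is why the patch splits the gathered table into twelve pieces (text and video each get their own shift, scale, and gate for both sublayers) rather than six.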