diff --git a/README-ja-JP.md b/README-ja-JP.md index a9327ecc..009f1d27 100644 --- a/README-ja-JP.md +++ b/README-ja-JP.md @@ -2,7 +2,7 @@
- + [![Documentation Status](https://readthedocs.org/projects/internevo/badge/?version=latest)](https://internevo.readthedocs.io/zh_CN/latest/?badge=latest) [![license](./doc/imgs/license.svg)](./LICENSE) @@ -143,6 +143,10 @@ $ torchrun --nnodes=1 --nproc_per_node=8 train.py --config ./configs/7B_sft.py - diff --git a/README-zh-Hans.md b/README-zh-Hans.md index ea38b04e..652cd8db 100644 --- a/README-zh-Hans.md +++ b/README-zh-Hans.md @@ -2,7 +2,7 @@
- + [![使用文档](https://readthedocs.org/projects/internevo/badge/?version=latest)](https://internevo.readthedocs.io/zh_CN/latest/?badge=latest) [![license](./doc/imgs/license.svg)](./LICENSE) @@ -143,6 +143,10 @@ $ torchrun --nnodes=1 --nproc_per_node=8 train.py --config ./configs/7B_sft.py - diff --git a/README.md b/README.md index 9a61d573..f04f6f14 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@
- + [![Documentation Status](https://readthedocs.org/projects/internevo/badge/?version=latest)](https://internevo.readthedocs.io/zh_CN/latest/?badge=latest) [![license](./doc/imgs/license.svg)](./LICENSE) @@ -143,6 +143,10 @@ Please refer to the [System Architecture document](./doc/en/structure.md) for ar diff --git a/configs/7B_baichuan2.py b/configs/7B_baichuan2.py new file mode 100644 index 00000000..fdc1b0ab --- /dev/null +++ b/configs/7B_baichuan2.py @@ -0,0 +1,225 @@ +JOB_NAME = "7b_baichuan2_train" +model_type = "BAICHUAN2" +DO_ALERT = False + +VOCAB_SIZE = 125696 +SEQ_LEN = 2048 +HIDDEN_SIZE = 4096 +NUM_ATTENTION_HEAD = 32 +MLP_RATIO = 8 / 3 +NUM_LAYER = 32 + + +MODEL_ONLY_FOLDER = "local:llm_ckpts_baichuan2/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts_baichuan2" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + enable_internevo2hf_ckpt=False, # enable ckpt save for huggingface format. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, support: "internevo", "hf", or other custom-defined + # load function such as "llama" + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="hf"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=False, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = None +VALID_FOLDER = None # "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=0, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + # rampup_batch_size (str): A string with three space-separated integers representing the + # starting batch size, the increment, and the number of steps between + # each increment. For example, "192 24 8" means that the batch size (micro_num) + # starts at 192 and increases by 24 every 8 steps. Defaults to None. + # (IMPORTANT): The interval step size is 'micro_bsz'. 
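+    # Added note (illustrative, not from the original config): following the format above, a setting
+    # such as rampup_batch_size="4 4 50" would start micro_num at 4 and raise it by 4 every 50 steps;
+    # the empty string below, like the None default, is expected to leave ramp-up disabled.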
+ rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + train_folder=TRAIN_FOLDER, + valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=200, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +use_fp32_norm = False +model = dict( + checkpoint=False, + num_chunks=1, + num_attention_heads=NUM_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + no_bias=True, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", + norm_type="rmsnorm", + layer_norm_epsilon=1e-6, + use_flash_attn=True, + # Whether the odd and even columns of the query and key in the model are normally interleaved. + # If it's True, the model's odd and even columns are normally ordered; if it's False, + # it means that the model has prematurely concatenated all odd columns and even columns in front + # and back, in order to improve the RoPE's computational efficiency. + # Example: + # qk_interleaved = True: q[-1] = [q1,q2,q3,q4,q5,q6,...], k[-1] = [k1,k2,k3,k4,k5,k6,...] + # qk_interleaved = False: q[-1] = [q1,q3,q5,...,q2,q4,q6,...], k[-1] = [k1,k3,k5,...,k2,k4,k6,...] + qk_interleaved=False, +) + +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'], + defaults to 'mtp', means the pure megatron tensor parallel without sequence parallel. + msp: megatron tensor parallel with sequence parallel, sequence parallel size = tensor parallel size. 
+ fsp: tensor parallel by flash-attn with sequence parallel, sequence parallel size = tensor parallel size. + isp: customed intern sequence parallel without tensor parallel, can be used with weight parallel. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +weight parallel (dict): + 1. size: int, the size of weight parallel. + 2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False. + 3. memory_pool: bool, enable/disable memory pool, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1), + tensor=dict(size=1, mode="mtp"), + pipeline=dict(size=1, interleaved_overlap=True), + weight=dict(size=1, overlap=True, memory_pool=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + alert_file_path=f"llm_alter/{JOB_NAME}_alert.log", + ), + tensorboard=dict( + queue_max_length=10, + ), +) + +# metric_dtype can be "fp32" or other string +# only when set to "fp32" will use fp32 to calc in metrics +# metric_dtype = "fp32" + +generation = dict( + ckpt_folder="/path/to/saved/ckpt", + output_folder="/path/to/save/generation", + batch_size=1, + eos_id=[2, 0], + bos_id=1, + max_length=100, + do_sample=True, + temperature=1.0, + top_k=50, + top_p=1.0, + repetition_penalty=1, + length_penalty=1.0, +) diff --git a/configs/7B_gemma.py b/configs/7B_gemma.py new file mode 100644 index 00000000..6ef8c99b --- /dev/null +++ b/configs/7B_gemma.py @@ -0,0 +1,232 @@ +JOB_NAME = "7b_gemma_train" +model_type = "GEMMA" +DO_ALERT = False + +VOCAB_SIZE = 256000 +SEQ_LEN = 2048 +HIDDEN_SIZE = 3072 +NUM_ATTENTION_HEAD = 16 +NUM_KV_ATTENTION_HEAD = 16 +HEAD_DIM = 256 +MLP_RATIO = 8 +NUM_LAYER = 28 + + +MODEL_ONLY_FOLDER = "local:llm_ckpts_gemma/xxxx" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts_gemma" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + enable_internevo2hf_ckpt=False, # enable ckpt save for huggingface format. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, support: "internevo", "hf", or other custom-defined + # load function such as "llama" + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="hf"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. 
+ # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=False, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. +) + +TRAIN_FOLDER = None +VALID_FOLDER = None # "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=0, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + # rampup_batch_size (str): A string with three space-separated integers representing the + # starting batch size, the increment, and the number of steps between + # each increment. For example, "192 24 8" means that the batch size (micro_num) + # starts at 192 and increases by 24 every 8 steps. Defaults to None. + # (IMPORTANT): The interval step size is 'micro_bsz'. + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + train_folder=TRAIN_FOLDER, + valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=200, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +use_fp32_norm = False +model = dict( + checkpoint=False, + num_chunks=1, + num_attention_heads=NUM_ATTENTION_HEAD, + num_kv_attention_heads=NUM_KV_ATTENTION_HEAD, + max_position_embeddings=8192, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + no_bias=True, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", + add_unit_offset=True, + norm_type="rmsnorm", + layer_norm_epsilon=1e-6, + head_dim=HEAD_DIM, + use_flash_attn=True, + # Whether the odd and even columns of the query and key in the model are normally interleaved. 
+ # If it's True, the model's odd and even columns are normally ordered; if it's False, + # it means that the model has prematurely concatenated all odd columns and even columns in front + # and back, in order to improve the RoPE's computational efficiency. + # Example: + # qk_interleaved = True: q[-1] = [q1,q2,q3,q4,q5,q6,...], k[-1] = [k1,k2,k3,k4,k5,k6,...] + # qk_interleaved = False: q[-1] = [q1,q3,q5,...,q2,q4,q6,...], k[-1] = [k1,k3,k5,...,k2,k4,k6,...] + qk_interleaved=False, + use_swiglu=False, +) + +""" +zero1 parallel (dict): + 1. size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'], + defaults to 'mtp', means the pure megatron tensor parallel without sequence parallel. + msp: megatron tensor parallel with sequence parallel, sequence parallel size = tensor parallel size. + fsp: tensor parallel by flash-attn with sequence parallel, sequence parallel size = tensor parallel size. + isp: customed intern sequence parallel without tensor parallel, can be used with weight parallel. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +weight parallel (dict): + 1. size: int, the size of weight parallel. + 2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False. + 3. memory_pool: bool, enable/disable memory pool, defaults to False. 
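+
+Worked example (added for illustration, not part of the original file): assuming the usual relation
+data parallel size = world size / (tensor size * pipeline size), running this config on 32 GPUs with
+tensor=dict(size=4, mode="msp") and pipeline=dict(size=2) leaves a data parallel size of
+32 / (4 * 2) = 4; with zero1=dict(size=-1) the optimizer states are then sharded across those 4 ranks.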
+""" +parallel = dict( + zero1=dict(size=-1), + tensor=dict(size=1, mode="mtp"), + pipeline=dict(size=1, interleaved_overlap=True), + weight=dict(size=1, overlap=True, memory_pool=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + alert_file_path=f"llm_alter/{JOB_NAME}_alert.log", + ), + tensorboard=dict( + queue_max_length=10, + ), +) + +# metric_dtype can be "fp32" or other string +# only when set to "fp32" will use fp32 to calc in metrics +# metric_dtype = "fp32" + +generation = dict( + ckpt_folder="/path/to/saved/ckpt", + output_folder="/path/to/save/generation", + batch_size=1, + eos_id=[2, 0], + bos_id=1, + max_length=100, + do_sample=True, + temperature=1.0, + top_k=50, + top_p=1.0, + repetition_penalty=1, + length_penalty=1.0, +) diff --git a/configs/7B_qwen2.py b/configs/7B_qwen2.py new file mode 100644 index 00000000..07e572d2 --- /dev/null +++ b/configs/7B_qwen2.py @@ -0,0 +1,232 @@ +JOB_NAME = "7b_qwen2_train" +model_type = "QWEN2" +DO_ALERT = False + +VOCAB_SIZE = 152064 +SEQ_LEN = 2048 +HIDDEN_SIZE = 3584 +NUM_ATTENTION_HEAD = 28 +NUM_KV_ATTENTION_HEAD = 4 +MLP_RATIO = 5.25 +NUM_LAYER = 28 + + +MODEL_ONLY_FOLDER = "local:llm_ckpts_qwen2/xxxx/" +# Ckpt folder format: +# fs: 'local:/mnt/nfs/XXX' +SAVE_CKPT_FOLDER = "local:llm_ckpts_qwen2" + +# boto3 Ckpt folder format: +# import os +# BOTO3_IP = os.environ["BOTO3_IP"] # boto3 bucket endpoint +# SAVE_CKPT_FOLDER = f"boto3:s3://model_weights.{BOTO3_IP}/internlm" +CHECKPOINT_EVERY = 50 +ckpt = dict( + enable_save_ckpt=False, # enable ckpt save. + enable_internevo2hf_ckpt=False, # enable ckpt save for huggingface format. + save_ckpt_folder=SAVE_CKPT_FOLDER, # Path to save training ckpt. + # 'load_ckpt_info' setting guide: + # 1. the 'path' indicate ckpt path, + # 2. the 'content‘ means what states will be loaded, support: "model", "sampler", "optimizer", "scheduler", "all" + # 3. the ’ckpt_type‘ means the type of checkpoint to be loaded, support: "internevo", "hf", or other custom-defined + # load function such as "llama" + load_ckpt_info=dict(path=MODEL_ONLY_FOLDER, content=("model",), ckpt_type="hf"), + # 'auto_resume' is designed to automatically load the latest checkpoint from 'save_ckpt_folder' when encountering + # training interruptions/hangs caused by hardware failures, using a scheduling system (such as k8s/slurm) + # with an automatic restart mechanism upon training reboot. + # Please be aware that if `auto_resume` is not set (its default value is True), it will not load the checkpoint + # path specified in `load_ckpt_info` by default. + # If you want to initialize your model weights from another model, you must set `auto_resume` to False. + # If you want to train from scratch, please set `auto_resume` to False and 'load_ckpt_info' to None. + auto_resume=False, + checkpoint_every=CHECKPOINT_EVERY, + async_upload=True, # async ckpt upload. (only work for boto3 ckpt) + async_upload_tmp_folder="/dev/shm/internlm_tmp_ckpt/", # path for temporarily files during asynchronous upload. + oss_snapshot_freq=int(CHECKPOINT_EVERY / 2), # snapshot ckpt save frequency. 
+) + +TRAIN_FOLDER = None +VALID_FOLDER = None # "/path/to/dataset" +data = dict( + seq_len=SEQ_LEN, + # micro_num means the number of micro_batch contained in one gradient update + micro_num=4, + # packed_length = micro_bsz * SEQ_LEN + micro_bsz=1, + # defaults to the value of micro_num + valid_micro_num=4, + # defaults to 0, means disable evaluate + valid_every=0, + pack_sample_into_one=False, + total_steps=20, + skip_batches="", + # rampup_batch_size (str): A string with three space-separated integers representing the + # starting batch size, the increment, and the number of steps between + # each increment. For example, "192 24 8" means that the batch size (micro_num) + # starts at 192 and increases by 24 every 8 steps. Defaults to None. + # (IMPORTANT): The interval step size is 'micro_bsz'. + rampup_batch_size="", + # Datasets with less than 50 rows will be discarded + min_length=50, + train_folder=TRAIN_FOLDER, + valid_folder=VALID_FOLDER, + empty_cache_and_diag_interval=200, + diag_outlier_ratio=1.1, +) + +grad_scaler = dict( + fp16=dict( + # the initial loss scale, defaults to 2**16 + initial_scale=2**16, + # the minimum loss scale, defaults to None + min_scale=1, + # the number of steps to increase loss scale when no overflow occurs + growth_interval=1000, + ), + # the multiplication factor for increasing loss scale, defaults to 2 + growth_factor=2, + # the multiplication factor for decreasing loss scale, defaults to 0.5 + backoff_factor=0.5, + # the maximum loss scale, defaults to None + max_scale=2**24, + # the number of overflows before decreasing loss scale, defaults to 2 + hysteresis=2, +) + +hybrid_zero_optimizer = dict( + # Enable low_level_optimzer overlap_communication + overlap_sync_grad=True, + overlap_sync_param=False, + # bucket size for nccl communication params + reduce_bucket_size=512 * 1024 * 1024, + # grad clipping + clip_grad_norm=1.0, +) + +loss = dict( + label_smoothing=0, +) + +adam = dict( + lr=1e-4, + adam_beta1=0.9, + adam_beta2=0.95, + adam_beta2_c=0, + adam_eps=1e-8, + weight_decay=0.01, +) + +lr_scheduler = dict( + total_steps=data["total_steps"], + init_steps=0, # optimizer_warmup_step + warmup_ratio=0.01, + eta_min=1e-5, + last_epoch=-1, +) + +beta2_scheduler = dict( + init_beta2=adam["adam_beta2"], + c=adam["adam_beta2_c"], + cur_iter=-1, +) + +use_fp32_norm = False +model = dict( + checkpoint=False, + num_chunks=1, + num_attention_heads=NUM_ATTENTION_HEAD, + num_kv_attention_heads=NUM_KV_ATTENTION_HEAD, + embed_split_hidden=True, + vocab_size=VOCAB_SIZE, + embed_grad_scale=1, + parallel_output=True, + hidden_size=HIDDEN_SIZE, + num_layers=NUM_LAYER, + qkv_bias=True, + o_bias=False, + mlp_ratio=MLP_RATIO, + apply_post_layer_norm=False, + dtype="torch.bfloat16", + norm_type="rmsnorm", + layer_norm_epsilon=1e-6, + use_flash_attn=True, + # Whether the odd and even columns of the query and key in the model are normally interleaved. + # If it's True, the model's odd and even columns are normally ordered; if it's False, + # it means that the model has prematurely concatenated all odd columns and even columns in front + # and back, in order to improve the RoPE's computational efficiency. + # Example: + # qk_interleaved = True: q[-1] = [q1,q2,q3,q4,q5,q6,...], k[-1] = [k1,k2,k3,k4,k5,k6,...] + # qk_interleaved = False: q[-1] = [q1,q3,q5,...,q2,q4,q6,...], k[-1] = [k1,k3,k5,...,k2,k4,k6,...] + qk_interleaved=False, + rope_base=1000000, + use_sliding_window=False, + sliding_window=32768, + max_window_layers=28, +) + +""" +zero1 parallel (dict): + 1. 
size: int + * if size <= 0, the size of the zero process group is equal to the size of the dp process group, + so parameters will be divided within the range of dp. + * if size == 1, zero is not used, and all dp groups retain the full amount of model parameters. + * if size > 1 and size <= dp world size, the world size of zero is a subset of dp world size. + For smaller models, it is usually a better choice to split the parameters within nodes with a setting <= 8. + 2. fsdp: bool, enable/disable torch's fully sharded data parallel, defaults to False. +tensor parallel (dict): + 1. size: int, the size of tensor parallel. + 2. mode: str, the tensor parallel mode, should be in ['mtp', 'msp', 'fsp', 'isp'], + defaults to 'mtp', means the pure megatron tensor parallel without sequence parallel. + msp: megatron tensor parallel with sequence parallel, sequence parallel size = tensor parallel size. + fsp: tensor parallel by flash-attn with sequence parallel, sequence parallel size = tensor parallel size. + isp: customed intern sequence parallel without tensor parallel, can be used with weight parallel. +pipeline parallel (dict): + 1. size: int, the size of pipeline parallel. + 2. interleaved_overlap: bool, enable/disable communication overlap when using interleaved pipeline scheduler, + defaults to False. +weight parallel (dict): + 1. size: int, the size of weight parallel. + 2. overlap: bool, enable/disable all_gather/reduce_scatter communication overlap, defaults to False. + 3. memory_pool: bool, enable/disable memory pool, defaults to False. +""" +parallel = dict( + zero1=dict(size=-1), + tensor=dict(size=1, mode="mtp"), + pipeline=dict(size=1, interleaved_overlap=True), + weight=dict(size=1, overlap=True, memory_pool=True), +) + +cudnn_deterministic = False +cudnn_benchmark = False + +monitor = dict( + # feishu alert configs + alert=dict( + enable_feishu_alert=DO_ALERT, + feishu_alert_address=None, # feishu webhook to send alert message + light_monitor_address=None, # light_monitor address to send heartbeat + alert_file_path=f"llm_alter/{JOB_NAME}_alert.log", + ), + tensorboard=dict( + queue_max_length=10, + ), +) + +# metric_dtype can be "fp32" or other string +# only when set to "fp32" will use fp32 to calc in metrics +# metric_dtype = "fp32" + +generation = dict( + ckpt_folder="/path/to/saved/ckpt", + output_folder="/path/to/save/generation", + batch_size=1, + eos_id=[2, 0], + bos_id=1, + max_length=100, + do_sample=True, + temperature=1.0, + top_k=50, + top_p=1.0, + repetition_penalty=1, + length_penalty=1.0, +) diff --git a/internlm/data/build_dataloader.py b/internlm/data/build_dataloader.py index e7f581dc..6937b8e4 100644 --- a/internlm/data/build_dataloader.py +++ b/internlm/data/build_dataloader.py @@ -44,7 +44,7 @@ def get_tokenized_train_loader_items(data_cfg): if data_cfg.get("is_multimodal", False): image_token_size = int(data_cfg.image_size // data_cfg.patch_size) ** 2 train_ds = RandomDatasetMultimodal( - num_samples=100000, + num_samples=gpc.get_world_size(ParallelMode.DATA) * 500, max_len=data_cfg.seq_len, image_size=data_cfg.image_size, image_token_size=image_token_size, @@ -54,7 +54,9 @@ def get_tokenized_train_loader_items(data_cfg): ) else: train_ds = RandomDataset( - num_samples=1000000, max_len=data_cfg.seq_len, fixed_seqlen=data_cfg.fixed_random_dataset_seqlen + num_samples=gpc.get_world_size(ParallelMode.DATA) * 500, + max_len=data_cfg.seq_len, + fixed_seqlen=data_cfg.fixed_random_dataset_seqlen, ) if data_cfg.pack_sample_into_one: diff --git 
a/internlm/model/modeling_baichuan2.py b/internlm/model/modeling_baichuan2.py new file mode 100644 index 00000000..8674b811 --- /dev/null +++ b/internlm/model/modeling_baichuan2.py @@ -0,0 +1,639 @@ +# Copyright (c) InternLM. All rights reserved. +import math +import os +from typing import Optional + +import torch +from einops import rearrange +from torch import nn +from tqdm import tqdm + +from internlm.accelerator import get_accelerator +from internlm.core.context import ParallelMode +from internlm.core.context.parallel_context import global_context as gpc +from internlm.initialize.initialize_tensor import ( + normal_, + scaled_init_method_normal, + scaled_init_method_uniform, + uniform_, +) +from internlm.model.base_model import BaseModel +from internlm.model.modules.embedding import Embedding1D +from internlm.model.modules.linear import new_linear +from internlm.model.modules.mha import MHA +from internlm.model.modules.mlp import new_feed_forward +from internlm.model.modules.norm import new_layer_norm +from internlm.model.utils import ( + convert_attn_args_to_kwargs, + convert_attn_kwargs_to_args, +) +from internlm.solver.activation_checkpoint import activation_checkpoint +from internlm.utils.logger import get_logger +from internlm.utils.storage_manager import get_fns, llm_load, llm_save +from transformers.modeling_utils import ( + SAFE_WEIGHTS_INDEX_NAME, + SAFE_WEIGHTS_NAME, + shard_checkpoint, +) + +internlm_accelerator = get_accelerator() +logger = get_logger(__file__) + + +class Baichuan2Decoder(nn.Module): + """ + 1D Packed Flash Llama Layer. + + Args: + hidden_size (int): The hidden size of model. 768 by default. + num_attention_heads (int): The number of attention heads. 12 by default. + mlp_ratio (int): The ratio of MLP layers. 4 by default. + attn_drop_rate (float): The dropout rate of attention module. 0 by default. + drop_rate (float): The dropout rate of the input hidden state. 0.0 by default. + dtype (torch.dtype): Type of data. torch.float by default. + layer_norm_epsilon (float): A value added to the denominator for numerical stability. 1e-5 by default. + checkpoint (bool): Whether to use checkpointing to save VRAM. True by default. + layer_idx (int): The index of current layer. 0 by default. + residual_in_fp32 (bool): Whether to use residual in fp32. False by default. + device (Optional[Union[str, torch.device]]): The device will be used. + norm_type (str): Use RMS norm or layernorm."rmsnorm" by default. + attn_wqkv_init_std (float): std used to init attn_wqkv weight. 0.006 by default, + attn_other_init_std (float): std used to init attn_other weight. 0.0015 by default, + ffn_uplayer_init_std (float): std used to init w1, w2 weight in ffn when using glu + otherwise init fc1 weight in ffn. 0.006 by default, + ffn_other_init_std (float): std used to init ffn_other weight. 0.0015 by default, + init_type (str): Initialization type. Use uniform or normal. "normal" by default, + rope_base (int): The value of `base` for rotary position embeddings. 10000 by default. + multiple_of (int): The value to make SwiGLU hidden layer size multiple of large power of 2. 
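+        Note (added for clarity, not in the original docstring): Baichuan2 checkpoints store a fused
+        query/key/value projection named `W_pack`; it maps onto the fused `wqkv` weight of the `MHA`
+        module created here with enable_qkv_fusion=True (see `Baichuan2.load_hf_weights` in this file).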
+ """ + + def __init__( + self, + hidden_size: int = 768, + num_attention_heads: int = 12, + mlp_ratio: int = 4, + attn_drop_rate: float = 0, + drop_rate: float = 0.0, + dtype: torch.dtype = torch.float, + layer_norm_epsilon: float = 1e-6, + checkpoint: bool = False, + layer_idx: int = 0, + use_dynamic_ntk_rope: bool = False, + residual_in_fp32: bool = False, + device: Optional[torch.device] = None, + apply_post_layer_norm: bool = False, + fused_dropout_add_ln: bool = True, + no_bias: bool = False, + norm_type: str = "rmsnorm", + qk_interleaved: bool = False, + dropout_selective_checkpoint: bool = True, + use_scaled_init: bool = True, + use_swiglu: bool = True, + attn_wqkv_init_std: float = 0.006, + attn_other_init_std: float = 0.0015, + ffn_uplayer_init_std: float = 0.006, + ffn_other_init_std: float = 0.0015, + init_type: str = "normal", + rope_base: int = 10000, + mlp_layer_fusion: bool = False, + multiple_of: int = 256, + max_position_embeddings: int = 2048, + ): + super().__init__() + self.checkpoint = checkpoint + # dropout selective checkpoint can only be enabled when checkpoint is disabled. + self.dropout_selective_checkpoint = dropout_selective_checkpoint is True and checkpoint is False + self.layer_idx = layer_idx + self.prenorm = not apply_post_layer_norm + assert not fused_dropout_add_ln, "dropout_add_layer_norm can not be used here" + self.fused_dropout_add_ln = fused_dropout_add_ln + self.attn_wqkv_init_std = attn_wqkv_init_std + self.attn_other_init_std = attn_other_init_std + self.ffn_uplayer_init_std = ffn_uplayer_init_std + self.ffn_other_init_std = ffn_other_init_std + + head_dim = hidden_size // num_attention_heads + + self.attention = MHA( + embed_dim=hidden_size, + num_heads=num_attention_heads, + max_position_embeddings=max_position_embeddings, + bias=not no_bias, + dropout=attn_drop_rate, + softmax_scale=1 / math.sqrt(head_dim), + causal=True, + layer_idx=layer_idx, + use_dynamic_ntk_rope=use_dynamic_ntk_rope, + rope_base=rope_base, + rotary_emb_dim=head_dim, + rotary_emb_scale_base=0, + device=device, + dtype=dtype, + qk_interleaved=qk_interleaved, + enable_qkv_fusion=True, + out_bias=False, + ) + + self.dropout1 = nn.Dropout(drop_rate) + self.dropout2 = nn.Dropout(drop_rate) + self.attention_norm = new_layer_norm(norm_type, hidden_size, eps=layer_norm_epsilon) + self.ffn_norm = new_layer_norm(norm_type, hidden_size, eps=layer_norm_epsilon) + + self.feed_forward = new_feed_forward( + hidden_size, + int(hidden_size * mlp_ratio), + out_features=hidden_size, + bias=False, + device=device, + dtype=dtype, + mlp_layer_fusion=mlp_layer_fusion, + multiple_of=multiple_of, + # TODO: to support more activation functions + activation_type="swiglu" if use_swiglu else "gelu", + ) + + self.use_swiglu = use_swiglu + self.use_scaled_init = use_scaled_init + self.residual_in_fp32 = residual_in_fp32 # only make sense when using prenorm + self.return_residual = False + + if init_type == "normal": + self.init_func = normal_ + self.scaled_init_func = scaled_init_method_normal + else: + self.init_func = uniform_ + self.scaled_init_func = scaled_init_method_uniform + + self.reset_parameters() + + def reset_parameters(self): + with torch.no_grad(): + for name, param in self.attention.named_parameters(): + if param.ndim == 1: + param.data.zero_() + elif "wq" in name or "wk" in name or "wv" in name: + self.init_func(std=self.attn_wqkv_init_std)(param.data) + elif self.use_scaled_init: # wo + self.scaled_init_func(sigma=self.attn_other_init_std, num_layers=self.layer_idx + 1)(param.data) + 
else: + self.init_func(std=self.attn_other_init_std)(param.data) + + for name, param in self.feed_forward.named_parameters(): + if self.use_swiglu: + if self.use_scaled_init and "w2" in name: + self.scaled_init_func(sigma=self.ffn_other_init_std, num_layers=self.layer_idx + 1)(param.data) + else: + # candidate: w1, w3, fused_w1_w3 + self.init_func( + std=self.ffn_uplayer_init_std if "w1" in name or "w3" in name else self.ffn_other_init_std + )(param.data) + else: + if self.use_scaled_init and "fc1" not in name: + self.scaled_init_func(sigma=self.ffn_other_init_std, num_layers=self.layer_idx + 1)(param.data) + else: + self.init_func(std=self.ffn_uplayer_init_std if "fc1" in name else self.ffn_other_init_std)( + param.data + ) + + def forward(self, hidden_states, residual=None, **kwargs): + if self.checkpoint and self.training: + args = convert_attn_kwargs_to_args(kwargs) + return activation_checkpoint(self._forward, False, hidden_states, residual, *args) + else: + return self._forward(hidden_states, residual, **kwargs) + + def _forward(self, hidden_states, residual, *args, **kwargs): + r"""Pass the input through the encoder layer. + + Args: + hidden_states: the sequence to the encoder layer (required). + residual: hidden_states = Attn/MLP(LN(residual)) + cu_seqlens: 1d LongTensor, len(cu_seqlens) = hidden_states + 1 + indexes: the length of index is same as hidden states, which stand for the current position + """ + if self.prenorm: + + def _dropout_and_norm_attn(_residual, _hidden_states): + _dropped = self.dropout1(_hidden_states) + _residual = (_dropped + _residual) if _residual is not None else _dropped + _hidden_states = self.attention_norm(_residual.to(dtype=self.attention_norm.weight.dtype)) + + return _residual, _hidden_states + + if self.dropout_selective_checkpoint: + residual, hidden_states = activation_checkpoint(_dropout_and_norm_attn, False, residual, hidden_states) + else: + residual, hidden_states = _dropout_and_norm_attn(residual, hidden_states) + + if self.residual_in_fp32: + residual = residual.to(torch.float32) + mixer_kwargs = convert_attn_args_to_kwargs(args, kwargs) + hidden_states = self.attention(hidden_states, **mixer_kwargs) + + if not isinstance(self.feed_forward, nn.Identity): + if not self.fused_dropout_add_ln: + + def _dropout_and_norm_ffn(_residual, _hidden_states): + _dropped = self.dropout2(_hidden_states) + _residual = (_dropped + _residual) if _residual is not None else _dropped + _hidden_states = self.ffn_norm(_residual.to(self.ffn_norm.weight.dtype)) + + return _residual, _hidden_states + + if self.dropout_selective_checkpoint: + residual, hidden_states = activation_checkpoint( + _dropout_and_norm_ffn, False, residual, hidden_states + ) + else: + residual, hidden_states = _dropout_and_norm_ffn(residual, hidden_states) + + if self.residual_in_fp32: + residual = residual.to(torch.float32) + hidden_states = self.feed_forward(hidden_states) + + return hidden_states + residual + else: + assert residual is None + + mixer_out = self.attention(hidden_states, **kwargs) + if self.return_residual: # mixer out is actually a pair here + mixer_out, hidden_states = mixer_out + hidden_states = self.attention_norm(self.dropout1(mixer_out) + hidden_states).to( + dtype=self.attention_norm.weight.dtype + ) + if not isinstance(self.feed_forward, nn.Identity): + mlp_out = self.feed_forward(hidden_states) + if self.return_residual: # mlp out is actually a pair here + mlp_out, hidden_states = mlp_out + hidden_states = self.ffn_norm((self.dropout2(mlp_out)) + hidden_states).to( 
+ dtype=self.ffn_norm.weight.dtype + ) + return hidden_states + + +class Baichuan2(BaseModel): + """ + 1D Packed Flash Llama. + + Args: + num_layers (int): The number of layer. 12 by default. + hidden_size (int): The size of hidden state. 768 by default. + num_attention_heads (int): The number of attention head. 12 by default. + vocab_size (int): The size of vocabulary. 50304 by default. + mlp_ratio (int): The ratio of MLP layers. 4 by default. + attn_drop_rate (float): The dropout rate of attention module. 0.0 by default. + drop_rate (float): The dropout rate of input hidden state. 0.0 by default. + dtype (torch.dtype): The type of data. torch.float by default. + checkpoint (bool): Whether to use checkpointing to save VRAM. True by default. + checkpoint_fraction (float): The proportion of layers that need to be checkpointed compared to the total number + of layers. 1.0 by default. + layer_norm_epsilon (float): A value added to the denominator for numerical stability. 1e-6 by default. + first (bool): Whether input embedding layer or not. False by default. + last (bool): Whether output embedding layer or not. False by default. + embed_grad_scale (float): Refer to GLM-130B, for training stability. 0.1 by default. + parallel_output (bool): If it is necessary to collect the output of parallel computing. True by default. + start_layer_idx (int): The index of start layer in the pipeline. 0 by default. + device (Optional[Union[str, torch.device]]): The device will be used. None by default. + residual_in_fp32 (bool): Whether to use residual in fp32. False by default. + norm_type (str): Normalization type. Use RMSNorm or LayerNorm. "rmsnorm" by default. + qk_interleaved (bool): Whether the odd and even columns of the wq and wk are normally interleaved. + embedding_init_std (float): std used to init embedding weight. 0.0052 by default, + attn_wqkv_init_std (float): std used to init attn_wqkv weight. 0.006 by default, + attn_other_init_std (float): std used to init attn_other weight. 0.0015 by default, + ffn_uplayer_init_std (float): std used to init w1, w2 weight in ffn when using glu + otherwise init fc1 weight in ffn. 0.006 by default, + ffn_other_init_std (float): std used to init ffn_other weight. 0.0015 by default, + out_head_init_std (float): std used to init output lmhead weight. 0.0052 by default, + init_type (str): Initialization type. Use uniform or normal. "normal" by default, + rope_base (int): The value of `base` for rotary position embeddings. 10000 by default. + multiple_of (int): The value to make SwiGLU hidden layer size multiple of large power of 2. 
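+        norm_head (bool): Passed through to the output `new_linear` head; presumably enables the
+            normalized lm head ("NormHead") used by Baichuan2. False by default.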
+ """ + + def __init__( + self, + num_layers: int = 12, + hidden_size: int = 768, + num_attention_heads: int = 12, + vocab_size: int = 50304, + mlp_ratio: int = 4, + attn_drop_rate: float = 0.0, + drop_rate: float = 0.0, + max_position_embeddings: int = 2048, + dtype: torch.dtype = torch.float, + checkpoint: float = 1.0, + layer_norm_epsilon: float = 1e-5, + first: bool = False, + last: bool = False, + embed_grad_scale: float = 0.1, + parallel_output: bool = True, + start_layer_idx: int = 0, + use_dynamic_ntk_rope: bool = False, + device: Optional[torch.device] = None, + apply_post_layer_norm=False, + no_bias=False, + residual_in_fp32: bool = False, + norm_type: str = "rmsnorm", + qk_interleaved: bool = False, + is_reward: bool = False, + dropout_selective_checkpoint: bool = True, + use_scaled_init: bool = True, + use_swiglu: bool = True, + embedding_init_std: float = 0.0052, + attn_wqkv_init_std: float = 0.006, + attn_other_init_std: float = 0.0015, + ffn_uplayer_init_std: float = 0.006, + ffn_other_init_std: float = 0.0015, + out_head_init_std: float = 0.0052, + init_type: str = "normal", + norm_head: bool = False, + rope_base: int = 10000, + mlp_layer_fusion: bool = False, + multiple_of: int = 256, + ): + super().__init__() + + checkpoint_layer_num = int(num_layers * checkpoint) + self.embed_grad_scale = embed_grad_scale + self.parallel_output = parallel_output + + if first: + self.tok_embeddings = Embedding1D(num_embeddings=vocab_size, embedding_dim=hidden_size) + + for _, param in self.tok_embeddings.named_parameters(): + if init_type == "normal": + normal_(std=embedding_init_std)(param) + else: + uniform_(std=embedding_init_std)(param) + + self.layers = nn.ModuleList( + [ + Baichuan2Decoder( + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + mlp_ratio=mlp_ratio, + attn_drop_rate=attn_drop_rate, + drop_rate=drop_rate, + max_position_embeddings=max_position_embeddings, + dtype=dtype, + layer_norm_epsilon=layer_norm_epsilon, + checkpoint=lid < checkpoint_layer_num, + layer_idx=lid + start_layer_idx, # This parameter is used for caching during generation + use_dynamic_ntk_rope=use_dynamic_ntk_rope, + residual_in_fp32=residual_in_fp32, + device=device, + apply_post_layer_norm=apply_post_layer_norm, + fused_dropout_add_ln=False, + no_bias=no_bias, + norm_type=norm_type, + dropout_selective_checkpoint=dropout_selective_checkpoint, + use_scaled_init=use_scaled_init, + use_swiglu=use_swiglu, + qk_interleaved=qk_interleaved, + attn_wqkv_init_std=attn_wqkv_init_std, + attn_other_init_std=attn_other_init_std, + ffn_uplayer_init_std=ffn_uplayer_init_std, + ffn_other_init_std=ffn_other_init_std, + init_type=init_type, + rope_base=rope_base, + mlp_layer_fusion=mlp_layer_fusion, + multiple_of=multiple_of, + ) + for lid in range(num_layers) + ] + ) + + if last: + if not apply_post_layer_norm: + self.norm = new_layer_norm(norm_type, hidden_size, eps=layer_norm_epsilon) + + self.output = new_linear( + name="output", + in_features=hidden_size, + out_features=gpc.get_world_size(ParallelMode.TENSOR) if is_reward else vocab_size, + bias=False, + device=device, + dtype=dtype, + is_reward=is_reward, + weight_scale=embed_grad_scale, + norm_head=norm_head, + ) + + for _, param in self.output.named_parameters(): + if init_type == "normal": + normal_(std=out_head_init_std)(param) + else: + uniform_(std=out_head_init_std)(param) + + def forward(self, hidden_states=None, input_ids=None, **kwargs): + # attention_mask: compute attention on the places where the value is 1 + if hasattr(self, 
"tok_embeddings") and input_ids is not None: + hidden_states = self.tok_embeddings(input_ids) + if self.embed_grad_scale != 1: + hidden_states = ( + self.embed_grad_scale * hidden_states + (1 - self.embed_grad_scale) * hidden_states.detach() + ) + + for _, block in enumerate(self.layers): + hidden_states = block(hidden_states, residual=None, **kwargs) + + if hasattr(self, "norm"): + hidden_states = self.norm(hidden_states.to(self.norm.weight.dtype)) + if hasattr(self, "output"): + hidden_states = self.output(hidden_states) + + return hidden_states + + @staticmethod + def load_hf_weights(folder: str, model: nn.Module) -> None: + assert folder is not None, "Please specify the folder of the pretrained model" + if gpc.is_rank_for_log(): + logger.info(f"Loading pretrained model from {folder}") + + fns = get_fns(folder) + model_fns = [ + os.path.join(folder, fn) + for fn in fns + if (fn.endswith(".bin") and fn.startswith("pytorch_model")) + or (fn.endswith(".safetensors") and fn.startswith("model")) + ] + model_fns.sort() + + state_dict = {} + for model_fn in model_fns: + state_dict.update(llm_load(model_fn, map_location="cpu")) + + tp_size = gpc.get_world_size(ParallelMode.TENSOR) + tp_rank = gpc.get_local_rank(ParallelMode.TENSOR) + wp_size = gpc.get_world_size(ParallelMode.WEIGHT) + wp_rank = gpc.get_local_rank(ParallelMode.WEIGHT) + tp_mode = gpc.config.parallel.tensor["mode"] + split_size = wp_size if tp_mode == "isp" else tp_size + local_rank = wp_rank if tp_mode == "isp" else tp_rank + row_dim = 0 if tp_mode == "isp" else 1 + if gpc.config.model.get("embed_split_hidden", True): + embed_concat_dim = 1 + else: + embed_concat_dim = 0 + + new_state_dict = {} + + # embedding + if (gpc.get_local_rank(ParallelMode.PIPELINE) == 0) or (not gpc.is_using_parallel_mode(ParallelMode.PIPELINE)): + new_state_dict["tok_embeddings.weight"] = torch.chunk( + state_dict.pop("model.embed_tokens.weight"), + split_size, + dim=embed_concat_dim, + )[local_rank] + + for idx, i in enumerate(range(model.first_layer, model.last_layer)): + layer_ids = i + + # attn + state_dict[f"layers.{i}.attention.wqkv.weight"] = torch.chunk( + state_dict.pop(f"model.layers.{layer_ids}.self_attn.W_pack.weight"), + split_size, + dim=0, + )[local_rank] + state_dict[f"layers.{i}.attention.out_proj.weight"] = torch.chunk( + state_dict.pop(f"model.layers.{layer_ids}.self_attn.o_proj.weight"), + split_size, + dim=row_dim, + )[local_rank] + + # ffn + state_dict[f"layers.{i}.feed_forward.w1.weight"] = torch.chunk( + state_dict.pop(f"model.layers.{layer_ids}.mlp.gate_proj.weight"), + split_size, + dim=0, + )[local_rank] + state_dict[f"layers.{i}.feed_forward.w3.weight"] = torch.chunk( + state_dict.pop(f"model.layers.{layer_ids}.mlp.up_proj.weight"), + split_size, + dim=0, + )[local_rank] + state_dict[f"layers.{i}.feed_forward.w2.weight"] = torch.chunk( + state_dict.pop(f"model.layers.{layer_ids}.mlp.down_proj.weight"), + split_size, + dim=row_dim, + )[local_rank] + + # attn norm + state_dict[f"layers.{i}.attention_norm.weight"] = state_dict.pop( + f"model.layers.{layer_ids}.input_layernorm.weight" + ) + # ffn norm + state_dict[f"layers.{i}.ffn_norm.weight"] = state_dict.pop( + f"model.layers.{layer_ids}.post_attention_layernorm.weight" + ) + + # replace value within decoder layer + for name in list(state_dict.keys()): + if name.startswith(f"layers.{i}"): + new_state_dict[name.replace(f".{i}.", f".{idx}.")] = state_dict.pop(name) + + # output + if gpc.is_last_rank(ParallelMode.PIPELINE): + new_state_dict["output.weight"] = torch.chunk( + 
state_dict.pop("lm_head.weight"), + split_size, + dim=0, + )[local_rank] + new_state_dict["norm.weight"] = state_dict.pop("model.norm.weight") + + missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False) + if len(state_dict) > 0: + logger.warning(f"Be cautious, checkpoint state_dict keys={state_dict.keys()} have not beed loaded.") + + if gpc.get_local_rank(ParallelMode.DATA) == 0: + pp_rank = 0 if not gpc.is_initialized(ParallelMode.PIPELINE) else gpc.get_local_rank(ParallelMode.PIPELINE) + logger.info( + f"Missing keys:{missing_keys}, unexpected keys:{unexpected_keys} in " + f"tp:{gpc.get_local_rank(ParallelMode.TENSOR)}, pp:{pp_rank}" + ) + + internlm_accelerator.empty_cache() + + @staticmethod + def convert_internevo2hf_weights(src: str, tgt: str) -> None: + def permute(qkv, num_heads, num_kv_heads, head_dim, qk_interleaved=False): + if not qk_interleaved: + return qkv + q_per_kv = num_heads // num_kv_heads + qkv = rearrange(qkv.T, "o (g n i) -> o g n i", n=q_per_kv + 2, i=head_dim) + q, k, v = qkv[..., :q_per_kv, :], qkv[..., -2:-1, :], qkv[..., -1:, :] + q = torch.cat([q[..., ::2], q[..., 1::2]], dim=-1) + k = torch.cat([k[..., ::2], k[..., 1::2]], dim=-1) + qkv = torch.cat((q, k, v), dim=2) + qkv = rearrange(qkv, "o g n i -> o (g n i)").T + return qkv + + model_config = gpc.config.model + tp_mode = gpc.config.parallel.tensor["mode"] + row_dim = 0 if tp_mode == "isp" else 1 + if model_config["embed_split_hidden"]: + embed_concat_dim = 1 + else: + embed_concat_dim = 0 + + # load states + states, num_shards = Baichuan2.load_sharded_states(src) + + # convert state_dict + state_dict = {} + embedding_key_list = ["tok_embeddings.weight", "embed_tokens.weight", None] + for layer_i in tqdm(range(model_config["num_layers"])): + # attn norm, ffn norm + state_dict.update( + { + f"model.layers.{layer_i}.input_layernorm.weight": states[0][ + f"layers.{layer_i}.attention_norm.weight" + ].clone(), + f"model.layers.{layer_i}.post_attention_layernorm.weight": states[0][ + f"layers.{layer_i}.ffn_norm.weight" + ].clone(), + } + ) + # attn + state_dict[f"model.layers.{layer_i}.self_attn.W_pack.weight"] = permute( + torch.cat([states[i][f"layers.{layer_i}.attention.wqkv.weight"] for i in range(num_shards)], dim=0), + num_heads=model_config["num_attention_heads"], + # num_kv_attention_heads equals to num_attention_heads in MHA + num_kv_heads=model_config["num_attention_heads"], + head_dim=model_config["hidden_size"] // model_config["num_attention_heads"], + qk_interleaved=model_config.get("qk_interleaved", False), + ) + state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat( + [states[i][f"layers.{layer_i}.attention.out_proj.weight"] for i in range(num_shards)], dim=row_dim + ) + # ffn + state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat( + [states[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(num_shards)], dim=0 + ) + state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat( + [states[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(num_shards)], dim=row_dim + ) + state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat( + [states[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(num_shards)], dim=0 + ) + # embedding, output + for embedding_key in embedding_key_list: + if embedding_key in states[0]: + break + if embedding_key is None: + raise KeyError("Cannot find embedding key!") + state_dict.update( + { + "model.norm.weight": states[0]["norm.weight"], + 
"model.embed_tokens.weight": torch.cat( + [states[i][embedding_key] for i in range(num_shards)], dim=embed_concat_dim + ), + "lm_head.weight": torch.cat([states[i]["output.weight"] for i in range(num_shards)], dim=0), + }, + ) + + # save state_dict to hf format + shards, index = shard_checkpoint(state_dict, weights_name=SAFE_WEIGHTS_NAME) + for shard_file, shard in shards.items(): + llm_save(save_path=os.path.join(tgt, shard_file), saved_obj=shard, metadata={"format": "pt"}) + if index is not None: + llm_save(save_path=os.path.join(tgt, SAFE_WEIGHTS_INDEX_NAME), saved_obj=index) diff --git a/internlm/model/modeling_gemma.py b/internlm/model/modeling_gemma.py new file mode 100644 index 00000000..a43843a8 --- /dev/null +++ b/internlm/model/modeling_gemma.py @@ -0,0 +1,752 @@ +# Copyright (c) InternLM. All rights reserved. +import math +import os +from typing import Optional + +import torch +from torch import nn +from tqdm import tqdm + +from internlm.accelerator import get_accelerator +from internlm.core.context import ParallelMode +from internlm.core.context.parallel_context import global_context as gpc +from internlm.initialize.initialize_tensor import ( + normal_, + scaled_init_method_normal, + scaled_init_method_uniform, + uniform_, +) +from internlm.model.base_model import BaseModel +from internlm.model.modules.embedding import Embedding1D +from internlm.model.modules.linear import new_linear +from internlm.model.modules.mha import GQA +from internlm.model.modules.mlp import new_feed_forward +from internlm.model.modules.norm import new_layer_norm +from internlm.model.utils import ( + convert_attn_args_to_kwargs, + convert_attn_kwargs_to_args, +) +from internlm.solver.activation_checkpoint import activation_checkpoint +from internlm.utils.logger import get_logger +from internlm.utils.storage_manager import get_fns, llm_load, llm_save +from transformers.modeling_utils import ( + SAFE_WEIGHTS_INDEX_NAME, + SAFE_WEIGHTS_NAME, + shard_checkpoint, +) + +try: + from flash_attn.modules.mlp import ParallelFusedMLP +except ImportError: + pass + +internlm_accelerator = get_accelerator() +logger = get_logger(__file__) + + +class GemmaDecoder(nn.Module): + """ + 1D Packed Flash Llama Layer. + + Args: + hidden_size (int): The hidden size of model. 768 by default. + num_attention_heads (int): The number of attention heads. 12 by default. + head_dim (int): The dimention of attention head dimention. hidden_size divided by num_heads by default. + mlp_ratio (int): The ratio of MLP layers. 4 by default. + attn_drop_rate (float): The dropout rate of attention module. 0 by default. + drop_rate (float): The dropout rate of the input hidden state. 0.0 by default. + dtype (torch.dtype): Type of data. torch.float by default. + layer_norm_epsilon (float): A value added to the denominator for numerical stability. 1e-5 by default. + checkpoint (bool): Whether to use checkpointing to save VRAM. True by default. + layer_idx (int): The index of current layer. 0 by default. + residual_in_fp32 (bool): Whether to use residual in fp32. False by default. + device (Optional[Union[str, torch.device]]): The device will be used. + add_unit_offset(bool): Add one to RMSNorm weight multiply by normed input. False by default. + use_glu (bool): Whether to use glu. True by default. + use_swiglu (bool): Whether to use swiglu. True by default. + attn_wqkv_init_std (float): std used to init attn_wqkv weight. 0.02 by default, + attn_other_init_std (float): std used to init attn_other weight. 
0.02 by default, + ffn_uplayer_init_std (float): std used to init w1, w2 weight in ffn when using glu + otherwise init fc1 weight in ffn. 0.02 by default, + ffn_other_init_std (float): std used to init ffn_other weight. 0.02 by default, + init_type (str): Initialization type. Use uniform or normal. "normal" by default, + rope_base (int): The value of `base` for rotary position embeddings. 10000 by default. + multiple_of (int): The value to make SwiGLU hidden layer size multiple of large power of 2. + tp_mode (str): The string value of tensor parallel mode, should be in ["mtp", "msp", "fsp", "isp"], + "mtp" by default. + """ + + def __init__( + self, + hidden_size: int = 768, + num_attention_heads: int = 12, + num_kv_attention_heads: int = 12, + head_dim: int = None, + mlp_ratio: int = 4, + attn_drop_rate: float = 0, + drop_rate: float = 0.0, + max_position_embeddings: int = 2048, + dtype: torch.dtype = torch.float, + layer_norm_epsilon: float = 1e-6, + checkpoint: bool = False, + layer_idx: int = 0, + use_dynamic_ntk_rope: bool = False, + residual_in_fp32: bool = False, + device: Optional[torch.device] = None, + apply_post_layer_norm: bool = False, + fused_dropout_add_ln: bool = True, + no_bias: bool = False, + norm_type: str = "rmsnorm", + qk_interleaved: bool = False, + add_unit_offset: bool = False, + dropout_selective_checkpoint: bool = True, + use_scaled_init: bool = True, + use_glu: bool = True, + use_swiglu: bool = True, + attn_wqkv_init_std: float = 0.02, + attn_other_init_std: float = 0.02, + ffn_uplayer_init_std: float = 0.02, + ffn_other_init_std: float = 0.02, + init_type: str = "normal", + rope_base: int = 10000, + mlp_layer_fusion: bool = False, + multiple_of: int = 256, + tp_mode: str = "mtp", + ): + super().__init__() + self.checkpoint = checkpoint + # dropout selective checkpoint can only be enabled when checkpoint is disabled. 
+ self.dropout_selective_checkpoint = dropout_selective_checkpoint is True and checkpoint is False + self.layer_idx = layer_idx + self.prenorm = not apply_post_layer_norm + assert not fused_dropout_add_ln, "dropout_add_layer_norm can not be used here" + self.fused_dropout_add_ln = fused_dropout_add_ln + self.attn_wqkv_init_std = attn_wqkv_init_std + self.attn_other_init_std = attn_other_init_std + self.ffn_uplayer_init_std = ffn_uplayer_init_std + self.ffn_other_init_std = ffn_other_init_std + + if not head_dim: + head_dim = hidden_size // num_attention_heads + + self.attention = GQA( + embed_dim=hidden_size, + num_heads=num_attention_heads, + num_kv_heads=num_kv_attention_heads, + head_dim=head_dim, + dropout=attn_drop_rate, + max_position_embeddings=max_position_embeddings, + softmax_scale=1 / math.sqrt(head_dim), + causal=True, + layer_idx=layer_idx, + use_dynamic_ntk_rope=use_dynamic_ntk_rope, + rotary_emb_dim=head_dim, + rotary_emb_scale_base=0, + device=device, + dtype=dtype, + qk_interleaved=qk_interleaved, + bias=not no_bias, + rope_base=rope_base, + enable_qkv_fusion=False, + ) + + self.dropout1 = nn.Dropout(drop_rate) + self.dropout2 = nn.Dropout(drop_rate) + self.attention_norm = new_layer_norm( + norm_type, hidden_size, eps=layer_norm_epsilon, add_unit_offset=add_unit_offset + ) + self.ffn_norm = new_layer_norm(norm_type, hidden_size, eps=layer_norm_epsilon, add_unit_offset=add_unit_offset) + + sequence_parallel = gpc.config.parallel.get("sequence_parallel", False) + parallel_mode = ParallelMode.WEIGHT if tp_mode == "isp" else ParallelMode.TENSOR + + if use_glu: + self.feed_forward = new_feed_forward( + hidden_size, + int(hidden_size * mlp_ratio), + out_features=hidden_size, + bias=False, + device=device, + dtype=dtype, + mlp_layer_fusion=mlp_layer_fusion, + multiple_of=multiple_of, + activation_type="swiglu" if use_swiglu else "gelu", + ) + else: + self.feed_forward = ParallelFusedMLP( + hidden_size, + int(hidden_size * mlp_ratio), + out_features=hidden_size, + activation="gelu_approx", + process_group=gpc.get_group(parallel_mode), + bias1=False, + bias2=False, + sequence_parallel=sequence_parallel, + checkpoint_lvl=0, + heuristic="auto", + device=device, + dtype=dtype, + ) + + self.use_glu = use_glu + self.use_swiglu = use_swiglu + self.use_scaled_init = use_scaled_init + self.residual_in_fp32 = residual_in_fp32 # only make sense when using prenorm + self.return_residual = False + + if init_type == "normal": + self.init_func = normal_ + self.scaled_init_func = scaled_init_method_normal + else: + self.init_func = uniform_ + self.scaled_init_func = scaled_init_method_uniform + + self.reset_parameters() + + def reset_parameters(self): + with torch.no_grad(): + for name, param in self.attention.named_parameters(): + if param.ndim == 1: + param.data.zero_() + elif "wq" in name or "wk" in name or "wv" in name: + self.init_func(std=self.attn_wqkv_init_std)(param.data) + elif self.use_scaled_init: # wo + self.scaled_init_func(sigma=self.attn_other_init_std, num_layers=self.layer_idx + 1)(param.data) + else: + self.init_func(std=self.attn_other_init_std)(param.data) + + for name, param in self.feed_forward.named_parameters(): + if self.use_glu: + if self.use_scaled_init and "w2" in name: + self.scaled_init_func(sigma=self.ffn_other_init_std, num_layers=self.layer_idx + 1)(param.data) + else: + self.init_func( + std=self.ffn_uplayer_init_std if "w1" in name or "w3" in name else self.ffn_other_init_std + )(param.data) + else: + if self.use_scaled_init and "fc1" not in name: + 
self.scaled_init_func(sigma=self.ffn_other_init_std, num_layers=self.layer_idx + 1)(param.data) + else: + self.init_func(std=self.ffn_uplayer_init_std if "fc1" in name else self.ffn_other_init_std)( + param.data + ) + + def forward(self, hidden_states, residual=None, **kwargs): + if self.checkpoint and self.training: + args = convert_attn_kwargs_to_args(kwargs) + return activation_checkpoint(self._forward, False, hidden_states, residual, *args) + else: + return self._forward(hidden_states, residual, **kwargs) + + def _forward(self, hidden_states, residual, *args, **kwargs): + r"""Pass the input through the encoder layer. + + Args: + hidden_states: the sequence to the encoder layer (required). + residual: hidden_states = Attn/MLP(LN(residual)) + cu_seqlens: 1d LongTensor, len(cu_seqlens) = hidden_states + 1 + indexes: the length of index is same as hidden states, which stand for the current position + """ + if self.prenorm: + + def _dropout_and_norm_attn(_residual, _hidden_states): + _dropped = self.dropout1(_hidden_states) + _residual = (_dropped + _residual) if _residual is not None else _dropped + _hidden_states = self.attention_norm(_residual.to(dtype=self.attention_norm.weight.dtype)) + + return _residual, _hidden_states + + if self.dropout_selective_checkpoint: + residual, hidden_states = activation_checkpoint(_dropout_and_norm_attn, False, residual, hidden_states) + else: + residual, hidden_states = _dropout_and_norm_attn(residual, hidden_states) + + if self.residual_in_fp32: + residual = residual.to(torch.float32) + + mixer_kwargs = convert_attn_args_to_kwargs(args, kwargs) + hidden_states = self.attention(hidden_states, **mixer_kwargs) + + if not isinstance(self.feed_forward, nn.Identity): + if not self.fused_dropout_add_ln: + + def _dropout_and_norm_ffn(_residual, _hidden_states): + _dropped = self.dropout2(_hidden_states) + _residual = (_dropped + _residual) if _residual is not None else _dropped + _hidden_states = self.ffn_norm(_residual.to(self.ffn_norm.weight.dtype)) + + return _residual, _hidden_states + + if self.dropout_selective_checkpoint: + residual, hidden_states = activation_checkpoint( + _dropout_and_norm_ffn, False, residual, hidden_states + ) + else: + residual, hidden_states = _dropout_and_norm_ffn(residual, hidden_states) + + if self.residual_in_fp32: + residual = residual.to(torch.float32) + hidden_states = self.feed_forward(hidden_states) + + return hidden_states + residual + else: + assert residual is None + + mixer_out = self.attention(hidden_states, **kwargs) + if self.return_residual: # mixer out is actually a pair here + mixer_out, hidden_states = mixer_out + hidden_states = self.attention_norm(self.dropout1(mixer_out) + hidden_states).to( + dtype=self.attention_norm.weight.dtype + ) + if not isinstance(self.feed_forward, nn.Identity): + mlp_out = self.feed_forward(hidden_states) + if self.return_residual: # mlp out is actually a pair here + mlp_out, hidden_states = mlp_out + hidden_states = self.ffn_norm((self.dropout2(mlp_out)) + hidden_states).to( + dtype=self.ffn_norm.weight.dtype + ) + return hidden_states + + +class Gemma(BaseModel): + """ + 1D Packed Flash Llama. + + Args: + num_layers (int): The number of layer. 12 by default. + hidden_size (int): The size of hidden state. 768 by default. + num_attention_heads (int): The number of attention head. 12 by default. + head_dim (int): The dimention of attention head dimention. hidden_size divided by num_heads by default. + vocab_size (int): The size of vocabulary. 50304 by default. 
+ mlp_ratio (int): The ratio of MLP layers. 4 by default. + attn_drop_rate (float): The dropout rate of attention module. 0.0 by default. + drop_rate (float): The dropout rate of input hidden state. 0.0 by default. + dtype (torch.dtype): The type of data. torch.float by default. + checkpoint (bool): Whether to use checkpointing to save VRAM. True by default. + checkpoint_fraction (float): The proportion of layers that need to be checkpointed compared to the total number + of layers. 1.0 by default. + layer_norm_epsilon (float): A value added to the denominator for numerical stability. 1e-6 by default. + first (bool): Whether input embedding layer or not. False by default. + last (bool): Whether output embedding layer or not. False by default. + embed_grad_scale (float): Refer to GLM-130B, for training stability. 0.1 by default. + parallel_output (bool): If it is necessary to collect the output of parallel computing. True by default. + start_layer_idx (int): The index of start layer in the pipeline. 0 by default. + device (Optional[Union[str, torch.device]]): The device will be used. None by default. + residual_in_fp32 (bool): Whether to use residual in fp32. False by default. + add_unit_offset(bool): Add one to RMSNorm weight multiply by normed input. False by default. + use_glu (bool): Whether to use glu. True by default. + use_swiglu (bool): Whether to use swiglu. True by default. + embedding_init_std (float): std used to init embedding weight. 0.02 by default, + attn_wqkv_init_std (float): std used to init attn_wqkv weight. 0.02 by default, + attn_other_init_std (float): std used to init attn_other weight. 0.02 by default, + ffn_uplayer_init_std (float): std used to init w1, w2 weight in ffn when using glu + otherwise init fc1 weight in ffn. 0.02 by default, + ffn_other_init_std (float): std used to init ffn_other weight. 0.02 by default, + out_head_init_std (float): std used to init output lmhead weight. 0.02 by default, + init_type (str): Initialization type. Use uniform or normal. "normal" by default, + extra_pred_tokens (int): The number of extra output head for multi-token-prediction. 0 by default. + rope_base (int): The value of `base` for rotary position embeddings. 10000 by default. + multiple_of (int): The value to make SwiGLU hidden layer size multiple of large power of 2. 
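+        num_kv_attention_heads (int): The number of key/value attention heads. 12 by default.
+        max_position_embeddings (int): The maximum position embeddings used by rotary embedding.
+            2048 by default.
+        norm_type (str): Use RMSNorm or LayerNorm. "rmsnorm" by default.
+        no_bias (bool): Whether to disable bias in the attention linears. False by default.
+        norm_head (bool): Whether to use a normalized output head (passed through to `new_linear`).
+            False by default.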
+ """ + + def __init__( + self, + num_layers: int = 12, + hidden_size: int = 768, + num_attention_heads: int = 12, + num_kv_attention_heads: int = 12, + head_dim: int = None, + vocab_size: int = 50304, + mlp_ratio: int = 4, + attn_drop_rate: float = 0.0, + drop_rate: float = 0.0, + max_position_embeddings: int = 2048, + dtype: torch.dtype = torch.float, + checkpoint: float = 1.0, + layer_norm_epsilon: float = 1e-5, + first: bool = False, + last: bool = False, + embed_grad_scale: float = 0.1, + parallel_output: bool = True, + start_layer_idx: int = 0, + use_dynamic_ntk_rope: bool = False, + device: Optional[torch.device] = None, + apply_post_layer_norm=False, + no_bias=False, + residual_in_fp32: bool = False, + norm_type: str = "rmsnorm", + qk_interleaved: bool = False, + add_unit_offset: bool = False, + is_reward: bool = False, + dropout_selective_checkpoint: bool = True, + use_scaled_init: bool = True, + use_glu: bool = True, + use_swiglu: bool = False, + embedding_init_std: float = 0.02, + attn_wqkv_init_std: float = 0.02, + attn_other_init_std: float = 0.02, + ffn_uplayer_init_std: float = 0.02, + ffn_other_init_std: float = 0.02, + out_head_init_std: float = 0.02, + init_type: str = "normal", + extra_pred_tokens: int = 0, + rope_base: int = 10000, + norm_head: bool = False, + mlp_layer_fusion: bool = False, + multiple_of: int = 256, + ): + super().__init__() + + checkpoint_layer_num = int(num_layers * checkpoint) + self.hidden_size = hidden_size + self.embed_grad_scale = embed_grad_scale + self.parallel_output = parallel_output + self.tp_mode = "mtp" + if isinstance(gpc.config.parallel["tensor"], dict): + self.tp_mode = gpc.config.parallel["tensor"].get("mode", "mtp") + + if first: + self.embed_tokens = Embedding1D(num_embeddings=vocab_size, embedding_dim=hidden_size) + for _, param in self.embed_tokens.named_parameters(): + if init_type == "normal": + normal_(std=embedding_init_std)(param) + else: + uniform_(std=embedding_init_std)(param) + + self.layers = nn.ModuleList( + [ + GemmaDecoder( + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + num_kv_attention_heads=num_kv_attention_heads, + head_dim=head_dim, + mlp_ratio=mlp_ratio, + attn_drop_rate=attn_drop_rate, + drop_rate=drop_rate, + max_position_embeddings=max_position_embeddings, + dtype=dtype, + layer_norm_epsilon=layer_norm_epsilon, + checkpoint=lid < checkpoint_layer_num, + layer_idx=lid + start_layer_idx, # This parameter is used for caching during generation + use_dynamic_ntk_rope=use_dynamic_ntk_rope, + residual_in_fp32=residual_in_fp32, + device=device, + apply_post_layer_norm=apply_post_layer_norm, + fused_dropout_add_ln=False, + no_bias=no_bias, + norm_type=norm_type, + add_unit_offset=add_unit_offset, + dropout_selective_checkpoint=dropout_selective_checkpoint, + use_scaled_init=use_scaled_init, + use_glu=use_glu, + use_swiglu=use_swiglu, + qk_interleaved=qk_interleaved, + attn_wqkv_init_std=attn_wqkv_init_std, + attn_other_init_std=attn_other_init_std, + ffn_uplayer_init_std=ffn_uplayer_init_std, + ffn_other_init_std=ffn_other_init_std, + init_type=init_type, + rope_base=rope_base, + mlp_layer_fusion=mlp_layer_fusion, + multiple_of=multiple_of, + tp_mode=self.tp_mode, + ) + for lid in range(num_layers) + ] + ) + + if last: + if not apply_post_layer_norm: + self.norm = new_layer_norm( + norm_type, hidden_size, eps=layer_norm_epsilon, add_unit_offset=add_unit_offset + ) + + self.output = new_linear( + name="output", + in_features=hidden_size, + out_features=gpc.get_world_size(ParallelMode.TENSOR) if 
is_reward else vocab_size, + bias=False, + device=device, + is_reward=is_reward, + dtype=dtype, + weight_scale=embed_grad_scale, + norm_head=norm_head, + ) + for _, param in self.output.named_parameters(): + if init_type == "normal": + normal_(std=out_head_init_std)(param) + else: + uniform_(std=out_head_init_std)(param) + + if extra_pred_tokens > 0: + self.extra_pred_tokens = extra_pred_tokens + assert not is_reward, "extra_pred_tokens > 0 means using multi token prediction, not implement for RLHF" + self.extra_outputs = nn.ModuleList( + [ + new_linear( + name="output", + in_features=hidden_size, + out_features=vocab_size, + bias=False, + device=device, + is_reward=is_reward, + dtype=dtype, + weight_scale=embed_grad_scale, + norm_head=norm_head, + ) + for _ in range(self.extra_pred_tokens) + ] + ) + for _, param in self.extra_outputs.named_parameters(): + if init_type == "normal": + normal_(std=out_head_init_std)(param) + else: + uniform_(std=out_head_init_std)(param) + + def forward(self, hidden_states=None, input_ids=None, **kwargs): + # attention_mask: compute attention on the places where the value is 1 + if hasattr(self, "embed_tokens"): + hidden_states = self.embed_tokens(input_ids) + if self.embed_grad_scale != 1: + hidden_states = ( + self.embed_grad_scale * hidden_states + (1 - self.embed_grad_scale) * hidden_states.detach() + ) + hidden_states = hidden_states * (self.hidden_size**0.5) + + for _, block in enumerate(self.layers): + hidden_states = block(hidden_states, residual=None, **kwargs) + + if hasattr(self, "norm"): + hidden_states = self.norm(hidden_states.to(self.norm.weight.dtype)) + if hasattr(self, "extra_pred_tokens") and self.extra_pred_tokens > 0: + extra_hidden_states_list = [self.extra_outputs[i](hidden_states) for i in range(self.extra_pred_tokens)] + else: + extra_hidden_states_list = None + if hasattr(self, "output"): + hidden_states = self.output(hidden_states) + + if extra_hidden_states_list is not None: + return (hidden_states, extra_hidden_states_list) + + return hidden_states + + @staticmethod + def load_hf_weights(folder: str, model: nn.Module) -> None: + assert folder is not None, "Please specify the folder of the pretrained model" + if gpc.is_rank_for_log(): + logger.info(f"Loading pretrained model from {folder}") + + fns = get_fns(folder) + model_fns = [ + os.path.join(folder, fn) + for fn in fns + if (fn.endswith(".bin") and fn.startswith("pytorch_model")) + or (fn.endswith(".safetensors") and fn.startswith("model")) + ] + model_fns.sort() + + state_dict = {} + for model_fn in model_fns: + state_dict.update(llm_load(model_fn, map_location="cpu")) + + tp_size = gpc.get_world_size(ParallelMode.TENSOR) + tp_rank = gpc.get_local_rank(ParallelMode.TENSOR) + wp_size = gpc.get_world_size(ParallelMode.WEIGHT) + wp_rank = gpc.get_local_rank(ParallelMode.WEIGHT) + tp_mode = gpc.config.parallel.tensor["mode"] + split_size = wp_size if tp_mode == "isp" else tp_size + local_rank = wp_rank if tp_mode == "isp" else tp_rank + row_dim = 0 if tp_mode == "isp" else 1 + if gpc.config.model.get("embed_split_hidden", True): + embed_concat_dim = 1 + else: + embed_concat_dim = 0 + + new_state_dict = {} + + # embedding + if (gpc.get_local_rank(ParallelMode.PIPELINE) == 0) or (not gpc.is_using_parallel_mode(ParallelMode.PIPELINE)): + new_state_dict["embed_tokens.weight"] = torch.chunk( + state_dict.get("model.embed_tokens.weight"), + split_size, + dim=embed_concat_dim, + )[local_rank] + + for idx, i in enumerate(range(model.first_layer, model.last_layer)): + layer_ids = i + + # 
attn + state_dict[f"layers.{i}.attention.wq.weight"] = torch.chunk( + state_dict.pop(f"model.layers.{layer_ids}.self_attn.q_proj.weight"), + split_size, + dim=0, + )[local_rank] + state_dict[f"layers.{i}.attention.wk.weight"] = torch.chunk( + state_dict.pop(f"model.layers.{layer_ids}.self_attn.k_proj.weight"), + split_size, + dim=0, + )[local_rank] + state_dict[f"layers.{i}.attention.wv.weight"] = torch.chunk( + state_dict.pop(f"model.layers.{layer_ids}.self_attn.v_proj.weight"), + split_size, + dim=0, + )[local_rank] + state_dict[f"layers.{i}.attention.wo.weight"] = torch.chunk( + state_dict.pop(f"model.layers.{layer_ids}.self_attn.o_proj.weight"), + split_size, + dim=row_dim, + )[local_rank] + + # ffn + state_dict[f"layers.{i}.feed_forward.w1.weight"] = torch.chunk( + state_dict.pop(f"model.layers.{layer_ids}.mlp.gate_proj.weight"), + split_size, + dim=0, + )[local_rank] + state_dict[f"layers.{i}.feed_forward.w3.weight"] = torch.chunk( + state_dict.pop(f"model.layers.{layer_ids}.mlp.up_proj.weight"), + split_size, + dim=0, + )[local_rank] + state_dict[f"layers.{i}.feed_forward.w2.weight"] = torch.chunk( + state_dict.pop(f"model.layers.{layer_ids}.mlp.down_proj.weight"), + split_size, + dim=row_dim, + )[local_rank] + + # attn norm + state_dict[f"layers.{i}.attention_norm.weight"] = state_dict.pop( + f"model.layers.{layer_ids}.input_layernorm.weight" + ) + # ffn norm + state_dict[f"layers.{i}.ffn_norm.weight"] = state_dict.pop( + f"model.layers.{layer_ids}.post_attention_layernorm.weight" + ) + + # replace value within decoder layer + for name in list(state_dict.keys()): + if name.startswith(f"layers.{i}"): + new_state_dict[name.replace(f".{i}.", f".{idx}.")] = state_dict.pop(name) + + # output + if gpc.is_last_rank(ParallelMode.PIPELINE): + if "lm_head.weight" in state_dict: + new_state_dict["output.weight"] = torch.chunk( + state_dict.pop("lm_head.weight"), # we do not tie lm head with embedding + split_size, + dim=0, + )[local_rank] + state_dict.pop("model.embed_tokens.weight") + else: + new_state_dict["output.weight"] = torch.chunk( + # gemma model ties lm head with embedding in transformers implementation + state_dict.pop("model.embed_tokens.weight"), + split_size, + dim=0, + )[local_rank] + new_state_dict["norm.weight"] = state_dict.pop("model.norm.weight") + + missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False) + if len(state_dict) > 0: + logger.warning(f"Be cautious, checkpoint state_dict keys={state_dict.keys()} have not beed loaded.") + + if gpc.get_local_rank(ParallelMode.DATA) == 0: + pp_rank = 0 if not gpc.is_initialized(ParallelMode.PIPELINE) else gpc.get_local_rank(ParallelMode.PIPELINE) + logger.info( + f"Missing keys:{missing_keys}, unexpected keys:{unexpected_keys} in " + f"tp:{gpc.get_local_rank(ParallelMode.TENSOR)}, pp:{pp_rank}" + ) + + internlm_accelerator.empty_cache() + + @staticmethod + def convert_internevo2hf_weights(src: str, tgt: str) -> None: + model_config = gpc.config.model + tp_mode = gpc.config.parallel.tensor["mode"] + row_dim = 0 if tp_mode == "isp" else 1 + + # load states + states, num_shards = Gemma.load_sharded_states(src) + + # convert state_dict + state_dict = {} + embedding_key_list = ["tok_embeddings.weight", "embed_tokens.weight", None] + for layer_i in tqdm(range(model_config["num_layers"])): + # attn norm, mlp norm + state_dict.update( + { + f"model.layers.{layer_i}.input_layernorm.weight": states[0][ + f"layers.{layer_i}.attention_norm.weight" + ].clone(), + 
f"model.layers.{layer_i}.post_attention_layernorm.weight": states[0][ + f"layers.{layer_i}.ffn_norm.weight" + ].clone(), + } + ) + # attn wqkv weight and bias + state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = torch.cat( + [states[i][f"layers.{layer_i}.attention.wq.weight"] for i in range(num_shards)], + dim=0, + ) + state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = torch.cat( + [states[i][f"layers.{layer_i}.attention.wk.weight"] for i in range(num_shards)], + dim=0, + ) + state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat( + [states[i][f"layers.{layer_i}.attention.wv.weight"] for i in range(num_shards)], + dim=0, + ) + # attn wo weight + state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat( + [states[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(num_shards)], dim=row_dim + ) + + # mlp + state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat( + [states[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(num_shards)], dim=0 + ) + state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat( + [states[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(num_shards)], dim=row_dim + ) + state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat( + [states[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(num_shards)], dim=0 + ) + + # embedding, head + for embedding_key in embedding_key_list: + if embedding_key in states[0]: + break + if embedding_key is None: + raise KeyError("Cannot find embedding key!") + if model_config["embed_split_hidden"]: + embed_concat_dim = 1 + tok_emb_list = [states[i][embedding_key] for i in range(num_shards)] + else: + embed_concat_dim = 0 + _, size_1 = states[0][embedding_key].shape + embdim_pertp = size_1 // num_shards + tok_emb_list = [ + torch.concat( + [ + states[tp][embedding_key][:, embdim_pertp * local_rank : embdim_pertp * (local_rank + 1)] + for tp in range(num_shards) + ], + dim=0, + ) + for local_rank in range(num_shards) + ] + state_dict.update( + { + "model.norm.weight": states[0]["norm.weight"], + "model.embed_tokens.weight": torch.cat(tok_emb_list, dim=embed_concat_dim), + "lm_head.weight": torch.cat([states[i]["output.weight"] for i in range(num_shards)], dim=0), + }, + ) + + # save state_dict to hf format + shards, index = shard_checkpoint(state_dict, weights_name=SAFE_WEIGHTS_NAME) + for shard_file, shard in shards.items(): + llm_save(save_path=os.path.join(tgt, shard_file), saved_obj=shard, metadata={"format": "pt"}) + if index is not None: + # Save the index as well + llm_save(save_path=os.path.join(tgt, SAFE_WEIGHTS_INDEX_NAME), saved_obj=index) diff --git a/internlm/model/modeling_qwen2.py b/internlm/model/modeling_qwen2.py new file mode 100644 index 00000000..d3700baa --- /dev/null +++ b/internlm/model/modeling_qwen2.py @@ -0,0 +1,752 @@ +# Copyright (c) InternLM. All rights reserved. 
+import math +import os +from typing import Optional + +import torch +from torch import nn +from tqdm import tqdm + +from internlm.accelerator import get_accelerator +from internlm.core.context import ParallelMode +from internlm.core.context.parallel_context import global_context as gpc +from internlm.initialize.initialize_tensor import ( + normal_, + scaled_init_method_normal, + scaled_init_method_uniform, + uniform_, +) +from internlm.model.base_model import BaseModel +from internlm.model.modules.embedding import Embedding1D +from internlm.model.modules.linear import new_linear +from internlm.model.modules.mha import SWA +from internlm.model.modules.mlp import new_feed_forward +from internlm.model.modules.norm import new_layer_norm +from internlm.model.utils import ( + convert_attn_args_to_kwargs, + convert_attn_kwargs_to_args, +) +from internlm.solver.activation_checkpoint import activation_checkpoint +from internlm.utils.logger import get_logger +from internlm.utils.storage_manager import get_fns, llm_load, llm_save +from transformers.modeling_utils import ( + SAFE_WEIGHTS_INDEX_NAME, + SAFE_WEIGHTS_NAME, + shard_checkpoint, +) + +internlm_accelerator = get_accelerator() +logger = get_logger(__file__) + + +class Qwen2Decoder(nn.Module): + """ + 1D Packed Flash Qwen Layer. + + Args: + hidden_size (int): The hidden size of model. 768 by default. + num_attention_heads (int): The number of attention heads. 12 by default. + mlp_ratio (int): The ratio of MLP layers. 4 by default. + attn_drop_rate (float): The dropout rate of attention module. 0 by default. + drop_rate (float): The dropout rate of the input hidden state. 0.0 by default. + dtype (torch.dtype): Type of data. torch.float by default. + layer_norm_epsilon (float): A value added to the denominator for numerical stability. 1e-5 by default. + checkpoint (bool): Whether to use checkpointing to save VRAM. True by default. + layer_idx (int): The index of current layer. 0 by default. + residual_in_fp32 (bool): Whether to use residual in fp32. False by default. + device (Optional[Union[str, torch.device]]): The device will be used. + norm_type (str): Use RMS norm or layernorm."rmsnorm" by default. + attn_wqkv_init_std (float): std used to init attn_wqkv weight. 0.02 by default, + attn_other_init_std (float): std used to init attn_other weight. 0.02 by default, + ffn_uplayer_init_std (float): std used to init w1, w2 weight in ffn when using glu + otherwise init fc1 weight in ffn. 0.02 by default, + ffn_other_init_std (float): std used to init ffn_other weight. 0.02 by default, + init_type (str): Initialization type. Use uniform or normal. "normal" by default, + rope_base (int): The value of `base` for rotary position embeddings. 10000 by default. + multiple_of (int): The value to make SwiGLU hidden layer size multiple of large power of 2. 
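+        qkv_bias (bool): Whether the q/k/v projections have bias. True by default.
+        o_bias (bool): Whether the attention output projection has bias. False by default.
+        mlp_bias (bool): Whether the feed-forward linears have bias. False by default.
+        use_sliding_window (bool): Whether to use sliding window attention. False by default.
+        sliding_window (int): The sliding window size. None by default.
+        rope_scaling_factor (float): The scaling factor for rotary position embeddings. 1.0 by default.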
+ """ + + def __init__( + self, + hidden_size: int = 768, + num_attention_heads: int = 12, + num_kv_attention_heads: int = 12, + mlp_ratio: int = 4, + attn_drop_rate: float = 0, + drop_rate: float = 0.0, + max_position_embeddings: int = 2048, + dtype: torch.dtype = torch.float, + layer_norm_epsilon: float = 1e-6, + checkpoint: bool = False, + layer_idx: int = 0, + use_dynamic_ntk_rope: bool = False, + residual_in_fp32: bool = False, + device: Optional[torch.device] = None, + apply_post_layer_norm: bool = False, + fused_dropout_add_ln: bool = True, + qkv_bias=True, + o_bias=False, + mlp_bias=False, + norm_type: str = "rmsnorm", + qk_interleaved: bool = False, + dropout_selective_checkpoint: bool = True, + use_scaled_init: bool = True, + use_swiglu: bool = True, + attn_wqkv_init_std: float = 0.02, + attn_other_init_std: float = 0.02, + ffn_uplayer_init_std: float = 0.02, + ffn_other_init_std: float = 0.02, + init_type: str = "normal", + rope_type: str = "normal", + rope_base: int = 10000, + rope_scaling_factor: float = 1.0, + use_sliding_window: bool = False, + sliding_window: int = None, + mlp_layer_fusion: bool = False, + multiple_of: int = 256, + scale_attn_weights: bool = False, # Qwen1 + use_logn_attn: bool = False, # Qwen1 + ): + super().__init__() + self.checkpoint = checkpoint + # dropout selective checkpoint can only be enabled when checkpoint is disabled. + self.dropout_selective_checkpoint = dropout_selective_checkpoint is True and checkpoint is False + self.layer_idx = layer_idx + self.prenorm = not apply_post_layer_norm + assert not fused_dropout_add_ln, "dropout_add_layer_norm can not be used here" + self.fused_dropout_add_ln = fused_dropout_add_ln + self.attn_wqkv_init_std = attn_wqkv_init_std + self.attn_other_init_std = attn_other_init_std + self.ffn_uplayer_init_std = ffn_uplayer_init_std + self.ffn_other_init_std = ffn_other_init_std + + head_dim = hidden_size // num_attention_heads + + if scale_attn_weights: + softmax_scale = None + else: + softmax_scale = 1 / math.sqrt(head_dim) + self.attention = SWA( + embed_dim=hidden_size, + num_heads=num_attention_heads, + num_kv_heads=num_kv_attention_heads, + dropout=attn_drop_rate, + max_position_embeddings=max_position_embeddings, + softmax_scale=softmax_scale, + causal=True, + layer_idx=layer_idx, + use_dynamic_ntk_rope=use_dynamic_ntk_rope, + rotary_emb_dim=head_dim, + rotary_emb_scale_base=0, + device=device, + dtype=dtype, + qk_interleaved=qk_interleaved, + qkv_bias=qkv_bias, + o_bias=o_bias, + rope_type=rope_type, + rope_base=rope_base, + rope_scaling_factor=rope_scaling_factor, + use_sliding_window=use_sliding_window, + sliding_window=sliding_window, + use_logn_attn=use_logn_attn, + ) + + self.dropout1 = nn.Dropout(drop_rate) + self.dropout2 = nn.Dropout(drop_rate) + self.attention_norm = new_layer_norm(norm_type, hidden_size, eps=layer_norm_epsilon) + self.ffn_norm = new_layer_norm(norm_type, hidden_size, eps=layer_norm_epsilon) + + self.feed_forward = new_feed_forward( + hidden_size, + int(hidden_size * mlp_ratio), + out_features=hidden_size, + bias=mlp_bias, + device=device, + dtype=dtype, + mlp_layer_fusion=mlp_layer_fusion, + multiple_of=multiple_of, + activation_type="swiglu" if use_swiglu else "gelu", + ) + + self.use_swiglu = use_swiglu + self.use_scaled_init = use_scaled_init + self.residual_in_fp32 = residual_in_fp32 # only make sense when using prenorm + self.return_residual = False + + if init_type == "normal": + self.init_func = normal_ + self.scaled_init_func = scaled_init_method_normal + else: + 
self.init_func = uniform_ + self.scaled_init_func = scaled_init_method_uniform + + self.reset_parameters() + + def reset_parameters(self): + with torch.no_grad(): + for name, param in self.attention.named_parameters(): + if param.ndim == 1: + param.data.zero_() + elif "wq" in name or "wk" in name or "wv" in name: + self.init_func(std=self.attn_wqkv_init_std)(param.data) + elif self.use_scaled_init: # wo + self.scaled_init_func(sigma=self.attn_other_init_std, num_layers=self.layer_idx + 1)(param.data) + else: + self.init_func(std=self.attn_other_init_std)(param.data) + + for name, param in self.feed_forward.named_parameters(): + if self.use_swiglu: + if self.use_scaled_init and "w2" in name: + self.scaled_init_func(sigma=self.ffn_other_init_std, num_layers=self.layer_idx + 1)(param.data) + else: + # candidate: w1, w3, fused_w1_w3 + self.init_func( + std=self.ffn_uplayer_init_std if "w1" in name or "w3" in name else self.ffn_other_init_std + )(param.data) + else: + if self.use_scaled_init and "fc1" not in name: + self.scaled_init_func(sigma=self.ffn_other_init_std, num_layers=self.layer_idx + 1)(param.data) + else: + self.init_func(std=self.ffn_uplayer_init_std if "fc1" in name else self.ffn_other_init_std)( + param.data + ) + + def forward(self, hidden_states, residual=None, **kwargs): + if self.checkpoint and self.training: + args = convert_attn_kwargs_to_args(kwargs) + return activation_checkpoint(self._forward, False, hidden_states, residual, *args) + else: + return self._forward(hidden_states, residual, **kwargs) + + def _forward(self, hidden_states, residual, *args, **kwargs): + r"""Pass the input through the encoder layer. + + Args: + hidden_states: the sequence to the encoder layer (required). + residual: hidden_states = Attn/MLP(LN(residual)) + cu_seqlens: 1d LongTensor, len(cu_seqlens) = hidden_states + 1 + indexes: the length of index is same as hidden states, which stand for the current position + """ + if self.prenorm: + + def _dropout_and_norm_attn(_residual, _hidden_states): + _dropped = self.dropout1(_hidden_states) + _residual = (_dropped + _residual) if _residual is not None else _dropped + _hidden_states = self.attention_norm(_residual.to(dtype=self.attention_norm.weight.dtype)) + + return _residual, _hidden_states + + if self.dropout_selective_checkpoint: + residual, hidden_states = activation_checkpoint(_dropout_and_norm_attn, False, residual, hidden_states) + else: + residual, hidden_states = _dropout_and_norm_attn(residual, hidden_states) + + if self.residual_in_fp32: + residual = residual.to(torch.float32) + + mixer_kwargs = convert_attn_args_to_kwargs(args, kwargs) + hidden_states = self.attention(hidden_states, **mixer_kwargs) + + if not isinstance(self.feed_forward, nn.Identity): + if not self.fused_dropout_add_ln: + + def _dropout_and_norm_ffn(_residual, _hidden_states): + _dropped = self.dropout2(_hidden_states) + _residual = (_dropped + _residual) if _residual is not None else _dropped + _hidden_states = self.ffn_norm(_residual.to(self.ffn_norm.weight.dtype)) + + return _residual, _hidden_states + + if self.dropout_selective_checkpoint: + residual, hidden_states = activation_checkpoint( + _dropout_and_norm_ffn, False, residual, hidden_states + ) + else: + residual, hidden_states = _dropout_and_norm_ffn(residual, hidden_states) + + if self.residual_in_fp32: + residual = residual.to(torch.float32) + hidden_states = self.feed_forward(hidden_states) + + return hidden_states + residual + else: + assert residual is None + + mixer_out = self.attention(hidden_states, 
**kwargs) + if self.return_residual: # mixer out is actually a pair here + mixer_out, hidden_states = mixer_out + hidden_states = self.attention_norm(self.dropout1(mixer_out) + hidden_states).to( + dtype=self.attention_norm.weight.dtype + ) + if not isinstance(self.feed_forward, nn.Identity): + mlp_out = self.feed_forward(hidden_states) + if self.return_residual: # mlp out is actually a pair here + mlp_out, hidden_states = mlp_out + hidden_states = self.ffn_norm((self.dropout2(mlp_out)) + hidden_states).to( + dtype=self.ffn_norm.weight.dtype + ) + return hidden_states + + +class Qwen2(BaseModel): + """ + 1D Packed Flash Qwen. + + Args: + num_layers (int): The number of layer. 12 by default. + hidden_size (int): The size of hidden state. 768 by default. + num_attention_heads (int): The number of attention head. 12 by default. + vocab_size (int): The size of vocabulary. 50304 by default. + mlp_ratio (int): The ratio of MLP layers. 4 by default. + attn_drop_rate (float): The dropout rate of attention module. 0.0 by default. + drop_rate (float): The dropout rate of input hidden state. 0.0 by default. + dtype (torch.dtype): The type of data. torch.float by default. + checkpoint (bool): Whether to use checkpointing to save VRAM. True by default. + layer_norm_epsilon (float): A value added to the denominator for numerical stability. 1e-6 by default. + first (bool): Whether input embedding layer or not. False by default. + last (bool): Whether output embedding layer or not. False by default. + embed_grad_scale (float): Refer to GLM-130B, for training stability. 0.1 by default. + parallel_output (bool): If it is necessary to collect the output of parallel computing. True by default. + start_layer_idx (int): The index of start layer in the pipeline. 0 by default. + device (Optional[Union[str, torch.device]]): The device will be used. None by default. + residual_in_fp32 (bool): Whether to use residual in fp32. False by default. + norm_type (str): Normalization type. Use RMSNorm or LayerNorm. "rmsnorm" by default. + embedding_init_std (float): std used to init embedding weight. 0.02 by default, + attn_wqkv_init_std (float): std used to init attn_wqkv weight. 0.02 by default, + attn_other_init_std (float): std used to init attn_other weight. 0.02 by default, + ffn_uplayer_init_std (float): std used to init w1, w2 weight in ffn when using glu + otherwise init fc1 weight in ffn. 0.02 by default, + ffn_other_init_std (float): std used to init ffn_other weight. 0.02 by default, + out_head_init_std (float): std used to init output lmhead weight. 0.02 by default, + init_type (str): Initialization type. Use uniform or normal. "normal" by default, + extra_pred_tokens (int): The number of extra output head for multi-token-prediction. 0 by default. + rope_base (int): The value of `base` for rotary position embeddings. 10000 by default. + multiple_of (int): The value to make SwiGLU hidden layer size multiple of large power of 2. 
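+        num_kv_attention_heads (int): The number of key/value attention heads. 12 by default.
+        qkv_bias (bool): Whether the q/k/v projections have bias. True by default.
+        o_bias (bool): Whether the attention output projection has bias. False by default.
+        mlp_bias (bool): Whether the feed-forward linears have bias. False by default.
+        use_sliding_window (bool): Whether to use sliding window attention. False by default.
+        max_window_layers (int): Sliding window attention is only enabled for layers whose index is
+            >= max_window_layers. 0 by default.
+        sliding_window (int): The sliding window size. None by default.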
+ """ + + def __init__( + self, + num_layers: int = 12, + hidden_size: int = 768, + num_attention_heads: int = 12, + num_kv_attention_heads: int = 12, + vocab_size: int = 50304, + mlp_ratio: int = 4, + attn_drop_rate: float = 0.0, + drop_rate: float = 0.0, + max_position_embeddings: int = 2048, + dtype: torch.dtype = torch.float, + checkpoint: float = 1.0, + layer_norm_epsilon: float = 1e-5, + first: bool = False, + last: bool = False, + embed_grad_scale: float = 0.1, + parallel_output: bool = True, + start_layer_idx: int = 0, + use_dynamic_ntk_rope: bool = False, + device: Optional[torch.device] = None, + apply_post_layer_norm=False, + qkv_bias=True, + o_bias=False, + mlp_bias=False, + residual_in_fp32: bool = False, + norm_type: str = "rmsnorm", + qk_interleaved: bool = False, + is_reward: bool = False, + dropout_selective_checkpoint: bool = True, + use_scaled_init: bool = True, + use_swiglu: bool = True, + embedding_init_std: float = 0.02, + attn_wqkv_init_std: float = 0.02, + attn_other_init_std: float = 0.02, + ffn_uplayer_init_std: float = 0.02, + ffn_other_init_std: float = 0.02, + out_head_init_std: float = 0.02, + init_type: str = "normal", + extra_pred_tokens: int = 0, + rope_type: str = "normal", + rope_base: int = 10000, + rope_scaling_factor: float = 1.0, + use_sliding_window: bool = False, + max_window_layers: int = 0, + sliding_window: int = None, + mlp_layer_fusion: bool = False, + multiple_of: int = 256, + scale_attn_weights: bool = False, # Qwen1 + use_logn_attn: bool = False, # Qwen1 + ): + super().__init__() + + self.embed_grad_scale = embed_grad_scale + + checkpoint_layer_num = int(num_layers * checkpoint) + + if first: + self.embed_tokens = Embedding1D(num_embeddings=vocab_size, embedding_dim=hidden_size) + for _, param in self.embed_tokens.named_parameters(): + if init_type == "normal": + normal_(std=embedding_init_std)(param) + else: + uniform_(std=embedding_init_std)(param) + + self.layers = nn.ModuleList( + [ + Qwen2Decoder( + hidden_size=hidden_size, + num_attention_heads=num_attention_heads, + num_kv_attention_heads=num_kv_attention_heads, + mlp_ratio=mlp_ratio, + attn_drop_rate=attn_drop_rate, + drop_rate=drop_rate, + dtype=dtype, + layer_norm_epsilon=layer_norm_epsilon, + checkpoint=lid < checkpoint_layer_num, + layer_idx=lid + start_layer_idx, # This parameter is used for caching during generation + use_dynamic_ntk_rope=use_dynamic_ntk_rope, + residual_in_fp32=residual_in_fp32, + device=device, + apply_post_layer_norm=apply_post_layer_norm, + fused_dropout_add_ln=False, + qkv_bias=qkv_bias, + o_bias=o_bias, + mlp_bias=mlp_bias, + norm_type=norm_type, + dropout_selective_checkpoint=dropout_selective_checkpoint, + use_scaled_init=use_scaled_init, + use_swiglu=use_swiglu, + qk_interleaved=qk_interleaved, + attn_wqkv_init_std=attn_wqkv_init_std, + attn_other_init_std=attn_other_init_std, + ffn_uplayer_init_std=ffn_uplayer_init_std, + ffn_other_init_std=ffn_other_init_std, + init_type=init_type, + rope_type=rope_type, + rope_base=rope_base, + rope_scaling_factor=rope_scaling_factor, + use_sliding_window=use_sliding_window and lid >= max_window_layers, + sliding_window=sliding_window, + mlp_layer_fusion=mlp_layer_fusion, + multiple_of=multiple_of, + max_position_embeddings=max_position_embeddings, + scale_attn_weights=scale_attn_weights, + use_logn_attn=use_logn_attn, + ) + for lid in range(num_layers) + ] + ) + + if last: + if not apply_post_layer_norm: + self.norm = new_layer_norm(norm_type, hidden_size, eps=layer_norm_epsilon) + + self.output = new_linear( + 
name="output", + in_features=hidden_size, + out_features=gpc.get_world_size(ParallelMode.TENSOR) if is_reward else vocab_size, + bias=False, + device=device, + dtype=dtype, + is_reward=is_reward, + weight_scale=embed_grad_scale, + ) + + for _, param in self.output.named_parameters(): + if init_type == "normal": + normal_(std=out_head_init_std)(param) + else: + uniform_(std=out_head_init_std)(param) + + if extra_pred_tokens > 0: + self.extra_pred_tokens = extra_pred_tokens + assert not is_reward, "extra_pred_tokens > 0 means using multi token prediction, not implement for RLHF" + self.extra_outputs = nn.ModuleList( + [ + new_linear( + name="output", + in_features=hidden_size, + out_features=vocab_size, + bias=False, + device=device, + dtype=dtype, + is_reward=is_reward, + weight_scale=embed_grad_scale, + ) + for _ in range(self.extra_pred_tokens) + ] + ) + for _, param in self.extra_outputs.named_parameters(): + if init_type == "normal": + normal_(std=out_head_init_std)(param) + else: + uniform_(std=out_head_init_std)(param) + + self.parallel_output = parallel_output + + def forward(self, hidden_states=None, input_ids=None, **kwargs): + # attention_mask: compute attention on the places where the value is 1 + if hasattr(self, "embed_tokens"): + hidden_states = self.embed_tokens(input_ids) + if self.embed_grad_scale != 1: + hidden_states = ( + self.embed_grad_scale * hidden_states + (1 - self.embed_grad_scale) * hidden_states.detach() + ) + + for _, block in enumerate(self.layers): + hidden_states = block( + hidden_states, + residual=None, + **kwargs, + ) + + if hasattr(self, "norm"): + hidden_states = self.norm(hidden_states.to(self.norm.weight.dtype)) + if hasattr(self, "extra_pred_tokens") and self.extra_pred_tokens > 0: + extra_hidden_states_list = [self.extra_outputs[i](hidden_states) for i in range(self.extra_pred_tokens)] + else: + extra_hidden_states_list = None + if hasattr(self, "output"): + hidden_states = self.output(hidden_states) + + if extra_hidden_states_list is not None: + return (hidden_states, extra_hidden_states_list) + + return hidden_states + + @staticmethod + def load_hf_weights(folder: str, model: nn.Module) -> None: + assert folder is not None, "Please specify the folder of the pretrained model" + if gpc.is_rank_for_log(): + logger.info(f"Loading pretrained model from {folder}") + + fns = get_fns(folder) + model_fns = [ + os.path.join(folder, fn) + for fn in fns + if (fn.endswith(".bin") and fn.startswith("pytorch_model")) + or (fn.endswith(".safetensors") and fn.startswith("model")) + ] + model_fns.sort() + + state_dict = {} + for model_fn in model_fns: + state_dict.update(llm_load(model_fn, map_location="cpu")) + + tp_size = gpc.get_world_size(ParallelMode.TENSOR) + tp_rank = gpc.get_local_rank(ParallelMode.TENSOR) + wp_size = gpc.get_world_size(ParallelMode.WEIGHT) + wp_rank = gpc.get_local_rank(ParallelMode.WEIGHT) + tp_mode = gpc.config.parallel.tensor["mode"] + split_size = wp_size if tp_mode == "isp" else tp_size + local_rank = wp_rank if tp_mode == "isp" else tp_rank + row_dim = 0 if tp_mode == "isp" else 1 + if gpc.config.model.get("embed_split_hidden", True): + embed_concat_dim = 1 + else: + embed_concat_dim = 0 + + new_state_dict = {} + + # embedding + if (gpc.get_local_rank(ParallelMode.PIPELINE) == 0) or (not gpc.is_using_parallel_mode(ParallelMode.PIPELINE)): + new_state_dict["embed_tokens.weight"] = torch.chunk( + state_dict.pop("model.embed_tokens.weight"), + split_size, + dim=embed_concat_dim, + )[local_rank] + + for idx, i in 
enumerate(range(model.first_layer, model.last_layer)): + layer_ids = i + + # attn + state_dict[f"layers.{i}.attention.wq.weight"] = torch.chunk( + state_dict.pop(f"model.layers.{layer_ids}.self_attn.q_proj.weight"), + split_size, + dim=0, + )[local_rank] + state_dict[f"layers.{i}.attention.wq.bias"] = torch.chunk( + state_dict.pop(f"model.layers.{layer_ids}.self_attn.q_proj.bias"), + split_size, + dim=0, + )[local_rank] + state_dict[f"layers.{i}.attention.wk.weight"] = torch.chunk( + state_dict.pop(f"model.layers.{layer_ids}.self_attn.k_proj.weight"), + split_size, + dim=0, + )[local_rank] + state_dict[f"layers.{i}.attention.wk.bias"] = torch.chunk( + state_dict.pop(f"model.layers.{layer_ids}.self_attn.k_proj.bias"), + split_size, + dim=0, + )[local_rank] + state_dict[f"layers.{i}.attention.wv.weight"] = torch.chunk( + state_dict.pop(f"model.layers.{layer_ids}.self_attn.v_proj.weight"), + split_size, + dim=0, + )[local_rank] + state_dict[f"layers.{i}.attention.wv.bias"] = torch.chunk( + state_dict.pop(f"model.layers.{layer_ids}.self_attn.v_proj.bias"), + split_size, + dim=0, + )[local_rank] + state_dict[f"layers.{i}.attention.wo.weight"] = torch.chunk( + state_dict.pop(f"model.layers.{layer_ids}.self_attn.o_proj.weight"), + split_size, + dim=row_dim, + )[local_rank] + + # ffn + state_dict[f"layers.{i}.feed_forward.w1.weight"] = torch.chunk( + state_dict.pop(f"model.layers.{layer_ids}.mlp.gate_proj.weight"), + split_size, + dim=0, + )[local_rank] + state_dict[f"layers.{i}.feed_forward.w3.weight"] = torch.chunk( + state_dict.pop(f"model.layers.{layer_ids}.mlp.up_proj.weight"), + split_size, + dim=0, + )[local_rank] + state_dict[f"layers.{i}.feed_forward.w2.weight"] = torch.chunk( + state_dict.pop(f"model.layers.{layer_ids}.mlp.down_proj.weight"), + split_size, + dim=row_dim, + )[local_rank] + + # attn norm + state_dict[f"layers.{i}.attention_norm.weight"] = state_dict.pop( + f"model.layers.{layer_ids}.input_layernorm.weight" + ) + # ffn norm + state_dict[f"layers.{i}.ffn_norm.weight"] = state_dict.pop( + f"model.layers.{layer_ids}.post_attention_layernorm.weight" + ) + + # replace value within decoder layer + for name in list(state_dict.keys()): + if name.startswith(f"layers.{i}"): + new_state_dict[name.replace(f".{i}.", f".{idx}.")] = state_dict.pop(name) + + # output + if gpc.is_last_rank(ParallelMode.PIPELINE): + new_state_dict["output.weight"] = torch.chunk( + state_dict.pop("lm_head.weight"), + split_size, + dim=0, + )[local_rank] + new_state_dict["norm.weight"] = state_dict.pop("model.norm.weight") + + missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False) + if len(state_dict) > 0: + logger.warning(f"Be cautious, checkpoint state_dict keys={state_dict.keys()} have not beed loaded.") + + if gpc.get_local_rank(ParallelMode.DATA) == 0: + pp_rank = 0 if not gpc.is_initialized(ParallelMode.PIPELINE) else gpc.get_local_rank(ParallelMode.PIPELINE) + logger.info( + f"Missing keys:{missing_keys}, unexpected keys:{unexpected_keys} in " + f"tp:{gpc.get_local_rank(ParallelMode.TENSOR)}, pp:{pp_rank}" + ) + + internlm_accelerator.empty_cache() + + @staticmethod + def convert_internevo2hf_weights(src: str, tgt: str) -> None: + model_config = gpc.config.model + tp_mode = gpc.config.parallel.tensor["mode"] + row_dim = 0 if tp_mode == "isp" else 1 + + # load states + states, num_shards = Qwen2.load_sharded_states(src) + + # convert state_dict + state_dict = {} + embedding_key_list = ["tok_embeddings.weight", "embed_tokens.weight", None] + for layer_i in 
tqdm(range(model_config["num_layers"])): + # attn norm, mlp norm + state_dict.update( + { + f"model.layers.{layer_i}.input_layernorm.weight": states[0][ + f"layers.{layer_i}.attention_norm.weight" + ].clone(), + f"model.layers.{layer_i}.post_attention_layernorm.weight": states[0][ + f"layers.{layer_i}.ffn_norm.weight" + ].clone(), + } + ) + # attn wqkv weight and bias + state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = torch.cat( + [states[i][f"layers.{layer_i}.attention.wq.weight"] for i in range(num_shards)], + dim=0, + ) + state_dict[f"model.layers.{layer_i}.self_attn.q_proj.bias"] = torch.cat( + [states[i][f"layers.{layer_i}.attention.wq.bias"] for i in range(num_shards)], + dim=0, + ) + state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = torch.cat( + [states[i][f"layers.{layer_i}.attention.wk.weight"] for i in range(num_shards)], + dim=0, + ) + state_dict[f"model.layers.{layer_i}.self_attn.k_proj.bias"] = torch.cat( + [states[i][f"layers.{layer_i}.attention.wk.bias"] for i in range(num_shards)], + dim=0, + ) + state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat( + [states[i][f"layers.{layer_i}.attention.wv.weight"] for i in range(num_shards)], + dim=0, + ) + state_dict[f"model.layers.{layer_i}.self_attn.v_proj.bias"] = torch.cat( + [states[i][f"layers.{layer_i}.attention.wv.bias"] for i in range(num_shards)], + dim=0, + ) + # attn wo weight + state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat( + [states[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(num_shards)], dim=row_dim + ) + + # mlp + state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat( + [states[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(num_shards)], dim=0 + ) + state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat( + [states[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(num_shards)], dim=row_dim + ) + state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat( + [states[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(num_shards)], dim=0 + ) + + # embedding, head + for embedding_key in embedding_key_list: + if embedding_key in states[0]: + break + if embedding_key is None: + raise KeyError("Cannot find embedding key!") + if model_config["embed_split_hidden"]: + embed_concat_dim = 1 + tok_emb_list = [states[i][embedding_key] for i in range(num_shards)] + else: + embed_concat_dim = 0 + _, size_1 = states[0][embedding_key].shape + embdim_pertp = size_1 // num_shards + tok_emb_list = [ + torch.concat( + [ + states[tp][embedding_key][:, embdim_pertp * local_rank : embdim_pertp * (local_rank + 1)] + for tp in range(num_shards) + ], + dim=0, + ) + for local_rank in range(num_shards) + ] + state_dict.update( + { + "model.norm.weight": states[0]["norm.weight"], + "model.embed_tokens.weight": torch.cat(tok_emb_list, dim=embed_concat_dim), + "lm_head.weight": torch.cat([states[i]["output.weight"] for i in range(num_shards)], dim=0), + }, + ) + + # save state_dict to hf format + shards, index = shard_checkpoint(state_dict, weights_name=SAFE_WEIGHTS_NAME) + for shard_file, shard in shards.items(): + llm_save(save_path=os.path.join(tgt, shard_file), saved_obj=shard, metadata={"format": "pt"}) + if index is not None: + # Save the index as well + llm_save(save_path=os.path.join(tgt, SAFE_WEIGHTS_INDEX_NAME), saved_obj=index) diff --git a/internlm/model/modules/mha.py b/internlm/model/modules/mha.py index cd8eaff2..8370606b 100644 --- a/internlm/model/modules/mha.py 
+++ b/internlm/model/modules/mha.py @@ -1,5 +1,6 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- +import inspect import math from typing import Callable, Dict, Optional @@ -75,6 +76,7 @@ def __init__( dtype: Optional[torch.dtype] = None, qk_interleaved: Optional[bool] = True, enable_qkv_fusion: bool = True, + out_bias: bool = True, ) -> None: super().__init__() self.layer_idx = layer_idx @@ -83,6 +85,7 @@ def __init__( self.embed_dim = embed_dim self.num_heads = num_heads self.head_dim = self.embed_dim // num_heads + self.kv_dim = self.head_dim * num_heads # num_kv_heads equals to num_heads in MHA self.enable_qkv_fusion = enable_qkv_fusion self.use_dynamic_ntk_rope = use_dynamic_ntk_rope @@ -116,8 +119,8 @@ def __init__( self.inner_attn = SelfAttention(causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout) self.inner_cross_attn = CrossAttention(causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout) - # output projection always have the bias (for now) - self.out_proj = new_linear("out_proj", embed_dim, embed_dim, bias=True, **factory_kwargs) + # output projection always have the bias (for now) (except for baichuan2 model) + self.out_proj = new_linear("out_proj", embed_dim, embed_dim, bias=out_bias, **factory_kwargs) def register_checkpoint_compatibility_hooks( self, pre_load_hook: Optional[Callable] = None, pre_save_hook: Optional[Callable] = None @@ -355,6 +358,7 @@ def __init__( num_heads: int, num_kv_heads: int, max_position_embeddings: int = 2048, + head_dim: int = None, bias: bool = False, dropout: float = 0.0, softmax_scale: float = None, @@ -375,9 +379,15 @@ def __init__( self.embed_dim = embed_dim self.num_heads = num_heads + + if head_dim: + self.head_dim = head_dim + q_dim = head_dim * num_heads + else: + self.head_dim = self.embed_dim // num_heads + q_dim = embed_dim self.num_kv_heads = num_kv_heads self.q_per_kv = num_heads // num_kv_heads - self.head_dim = self.embed_dim // num_heads self.kv_dim = self.head_dim * num_kv_heads self.enable_qkv_fusion = enable_qkv_fusion @@ -405,7 +415,7 @@ def __init__( if enable_qkv_fusion: self.wqkv = new_linear("wqkv", embed_dim, embed_dim + 2 * self.kv_dim, bias, **factory_kwargs) else: - self.wq = new_linear("wq", embed_dim, embed_dim, bias, **factory_kwargs) + self.wq = new_linear("wq", embed_dim, q_dim, bias, **factory_kwargs) self.wk = new_linear("wk", embed_dim, self.kv_dim, bias, **factory_kwargs) self.wv = new_linear("wv", embed_dim, self.kv_dim, bias, **factory_kwargs) @@ -416,7 +426,7 @@ def __init__( causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout, layer_idx=layer_idx ) - self.wo = new_linear("wo", embed_dim, embed_dim, bias, **factory_kwargs) + self.wo = new_linear("wo", q_dim, embed_dim, bias, **factory_kwargs) def register_checkpoint_compatibility_hooks( self, pre_load_hook: Optional[Callable] = None, pre_save_hook: Optional[Callable] = None @@ -624,3 +634,337 @@ def _inference(self, x, inference_params, **kwargs): # pylint: disable=W0613 # wo return self.wo(rearrange(context, "b s h d -> b s (h d)")) + + +try: + from flash_attn import flash_attn_func + + # flash_attn >= v2.3.0 + _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) +except (ModuleNotFoundError, ImportError): + _flash_supports_window_size = False + + +class SWA(nn.Module): + """ + sliding window attention + + Args: + embed_dim (int): The dimention of hidden state. + num_heads (int): The number of attention heads. 
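+        num_kv_heads (int): The number of key/value heads for grouped-query attention.
+        qkv_bias (boolean): Whether the q/k/v projections have bias. True by default.
+        o_bias (boolean): Whether the output projection has bias. False by default.
+        use_sliding_window (bool): Whether to use sliding window attention. False by default.
+        sliding_window (int): The sliding window size; must be set when `use_sliding_window` is True.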
+ process_group (torch.distributed.ProcessGroup): The group of the current device for `parallel_mode`. + sequence_process_group (torch.distributed.ProcessGroup): The process group for attention calculation. + bias (boolean): Whether the bias is needed for linears. Will be used when initializing QKV matrix and + output projection. True by default. + dropout (float): The dropout rate for cross attention and self attention. 0.0 by default. + softmax_scale (float): The temperature to use for the softmax attention. + causal (boolean): Whether to apply causal attention mask. False by default. + layer_idx (int): The index of current layer. None by default. + rotary_emb_dim (int): The dimention of Rotary Embedding. 0 by default. + rotary_emb_scale_base (int): The scaling factor of Rotary Embedding. If scale_base > 0, this implements + XPos(Sun et al., https://arxiv.org/abs/2212.10554). 0 by default. + device (Optional[Union[str, torch.device]]): The device will be used. + dtype (Optional[torch.dtype]): The type of data. + rope_base (int): The value of `base` for rotary position embeddings. 10000 by default. + tp_mode (str): The string value of tensor parallel mode, should be in ["mtp", "msp", "fsp", "isp"], + "mtp" by default. + + """ + + def __init__( + self, + embed_dim: int, + num_heads: int, + num_kv_heads: int, + qkv_bias: bool = True, + o_bias: bool = False, + max_position_embeddings: int = 2048, + dropout: float = 0.0, + softmax_scale: float = None, + causal: bool = False, + layer_idx: int = None, + use_dynamic_ntk_rope: bool = False, + rope_type: str = "normal", + rope_base: int = 10000, + rope_scaling_factor: float = 1.0, + rotary_emb_dim: int = 0, + rotary_emb_scale_base: int = 0, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + use_sliding_window: bool = False, + sliding_window: int = None, + tp_mode: str = "mtp", + qk_interleaved: Optional[bool] = True, + use_logn_attn: bool = False, # Qwen1 + ) -> None: + assert embed_dim % num_heads == 0, "embedding dim must be divisible by num_heads" + assert (not use_sliding_window) or ( + sliding_window is not None + ), "Must set `sliding windows` size when `use_sliding_window` is True." + factory_kwargs = {"device": device, "dtype": dtype} + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + + self.head_dim = self.embed_dim // num_heads + self.num_kv_heads = num_kv_heads + self.kv_dim = self.head_dim * num_kv_heads + self.causal = causal + self.layer_idx = layer_idx + self.use_dynamic_ntk_rope = use_dynamic_ntk_rope + self.rotary_emb_dim = rotary_emb_dim + self.use_sliding_window = use_sliding_window + self.sliding_window = sliding_window + self.dtype = dtype + self.tp_mode = tp_mode + self.rope_type = rope_type + self.use_logn_attn = use_logn_attn + self.interleaved = qk_interleaved + + factory_kwargs = {"device": device, "dtype": dtype} + + assert self.use_dynamic_ntk_rope is False, "Not support dynamic ntk rope yet." 
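+        # When num_kv_heads < num_heads this behaves as grouped-query attention: wk/wv project to
+        # kv_dim = head_dim * num_kv_heads and each key/value head is shared by a group of query heads.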
+ assert self.embed_dim % num_heads == 0, "embedding dim must be divisible by num_heads" + + if self.rotary_emb_dim > 0: + self.rotary_emb = new_rotary_embedding( + self.rotary_emb_dim, + base=rope_base, + scale_base=rotary_emb_scale_base, + device=device, + max_position_embeddings=max_position_embeddings, + scaling_factor=rope_scaling_factor, + rotary_type="dynamic_ntk" if self.use_dynamic_ntk_rope else "native", + ) + + # notice here should change bias=True + self.wq = new_linear( + "wq", + embed_dim, + embed_dim, + qkv_bias, + **factory_kwargs, + ) + self.wk = new_linear( + "wk", + embed_dim, + self.kv_dim, + qkv_bias, + **factory_kwargs, + ) + self.wv = new_linear( + "wv", + embed_dim, + self.kv_dim, + qkv_bias, + **factory_kwargs, + ) + + self.inner_attn = SelfAttention(causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout) + self.inner_cross_attn = CrossAttention(causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout) + + self.inner_cross_attn_causal = causal + self.inner_cross_attn_softmax_scale = softmax_scale + self.inner_cross_attn_dropout = dropout + + self.wo = new_linear( + "wo", + embed_dim, + embed_dim, + o_bias, + **factory_kwargs, + ) + + def forward(self, x, inference_params=None, **kwargs): + if inference_params is None: + return self._training(x=x, **kwargs) + else: + return self._inference(x=x, inference_params=inference_params, **kwargs) + + def _training(self, x, **kwargs): + q, k, v = self.wq(x), self.wk(x), self.wv(x) + q = rearrange(q, "b t (h d) -> b t h d", d=self.head_dim) + k = rearrange(k, "b t (h d) -> b t h d", d=self.head_dim) + v = rearrange(v, "b t (h d) -> b t h d", d=self.head_dim) + + kv_seq_len = k.size(0) + use_window_circumstance = ( + _flash_supports_window_size + and self.use_sliding_window + and self.sliding_window + and kv_seq_len > self.sliding_window + ) + + kwargs = _convert_cu_seqlens_for_qksplited(kwargs) + + # rotary embedding + if self.rotary_emb_dim > 0: + indexes = kwargs.pop("indexes", 0) + max_seqlen_q = kwargs.get("max_seqlen_q", None) + max_seqlen_k = kwargs.get("max_seqlen_k", None) + + q = self.rotary_emb( + q, offsets=indexes, max_seqlen=max_seqlen_q, cache_type="query", interleaved=self.interleaved + ) + k = self.rotary_emb( + k, offsets=indexes, max_seqlen=max_seqlen_k, cache_type="key", interleaved=self.interleaved + ) + + kv = torch.concat([k.unsqueeze(2), v.unsqueeze(2)], dim=2) + + if use_window_circumstance: + kwargs["window_size"] = (self.sliding_window, 0) + + # self attention + context = self.inner_attn(q, kv, **kwargs) + + # wo + return self.wo(rearrange(context, "b s h d -> b s (h d)")) + + def _convert_unpacked_qkv_to_packed( + self, q: torch.Tensor, kv: torch.Tensor, batch_size: int, attention_mask: torch.Tensor + ): + cu_seqlens = torch.concat( + [ + torch.tensor([0], dtype=torch.int32, device=attention_mask.device), + attention_mask.sum(dim=-1).to(dtype=torch.int32), + ], + dim=0, + ).cumsum(dim=0, dtype=torch.int32) + + cu_seqlens_q = cu_seqlens + cu_seqlens_k = cu_seqlens + + max_seqlen_q = attention_mask.shape[-1] + max_seqlen_k = attention_mask.shape[-1] + + q_packed = ( + q.masked_select(attention_mask.view(batch_size, -1, 1, 1)).view(-1, q.shape[-2], q.shape[-1]).unsqueeze(0) + ) + kv_packed = ( + kv.masked_select(attention_mask.view(batch_size, -1, 1, 1, 1)) + .view(-1, kv.shape[-3], kv.shape[-2], kv.shape[-1]) + .unsqueeze(0) + ) + + return q_packed, kv_packed, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k + + def _inference(self, x, inference_params=None, 
**kwargs): # pylint: disable=W0613 + assert inference_params is not None, "inference_params is required for inference" + assert self.layer_idx is not None, "Generation requires layer_idx in the constructor" + attention_mask = inference_params.attention_mask + sequence_len_offset = inference_params.sequence_len_offset + window_size = inference_params.window_size + + bsz = x.shape[0] + + q, k, v = self.wq(x), self.wk(x), self.wv(x) + q = rearrange(q, "b s (h d) -> b s h d", d=self.head_dim) + k = rearrange(k, "b s (h d) -> b s h d", d=self.head_dim) + v = rearrange(v, "b s (h d) -> b s h d", d=self.head_dim) + + kv_seq_len = k.size(0) + use_window_circumstance = ( + _flash_supports_window_size + and self.use_sliding_window + and self.sliding_window + and kv_seq_len > self.sliding_window + ) + + assert self.rotary_emb_dim > 0 + if attention_mask is None: + raise NotImplementedError( + "You should make sure you are aware that you are changing the method of generating." + "According to your generation function instead of inference/seq_generator_module.py, " + "You may implement here for normal running." + ) + else: + if inference_params.sequence_len_offset == 0: + q = self.rotary_emb( + q, offsets=0, cache_type="query", interleaved=self.interleaved, left_padding_mask=attention_mask + ) + k = self.rotary_emb( + k, offsets=0, cache_type="key", interleaved=self.interleaved, left_padding_mask=attention_mask + ) + else: + empties = attention_mask[..., -1].sum(dim=-1) + indexes4q = sequence_len_offset * torch.ones(q.size(0), dtype=torch.int, device=q.device) - empties + indexes4k = sequence_len_offset * torch.ones(k.size(0), dtype=torch.int, device=k.device) - empties + q = self.rotary_emb(q, offsets=indexes4q, cache_type="query", interleaved=self.interleaved) + k = self.rotary_emb(k, offsets=indexes4k, cache_type="key", interleaved=self.interleaved) + + kv = torch.stack([k, v], dim=2) + + if window_size is None or window_size > sequence_len_offset: + kv = update_kv_cache(kv, inference_params, self.layer_idx) + else: # window_size <= sequence_len_offset + assert kv.size(1) == 1, "update kv length more than 1" + + inference_params.key_value_memory_dict[self.layer_idx][ + :, inference_params.keep_first : inference_params.window_size - 1, ... + ] = inference_params.key_value_memory_dict[self.layer_idx][ + :, -(inference_params.window_size - 1 - inference_params.keep_first) :, ... + ].clone() + inference_params.real_sequence_len_offset = inference_params.sequence_len_offset + inference_params.sequence_len_offset = inference_params.window_size - 1 + + kv = update_kv_cache(kv, inference_params, self.layer_idx) + + inference_params.sequence_len_offset = inference_params.real_sequence_len_offset + + # When using FP16, there is a high probability of NAN in the KV. + # Since NAN cannot be removed by multiplying with and 0, it needs + # to be removed manually here. + kv = torch.where(torch.isnan(kv), 0, kv) + + # attention + if attention_mask is None: + context = self.inner_cross_attn(q, kv) + else: + if sequence_len_offset == 0: # First entrance, attnmask (bs*seqlen*seqlen) + attn_mask = attention_mask[:, None, ...] 
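+                # Merge the padding mask with a strict upper-triangular causal mask (True = masked),
+                # then take the last query row per sample and invert it to get the valid-token mask
+                # used to pack q/kv for the flash-attention path below.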
+                attn_mask = torch.logical_or(
+                    torch.ones_like(attn_mask, dtype=torch.bool).triu(diagonal=1), attn_mask
+                )
+                attn_mask4flsh = ~attn_mask[:, :, -1, :].view(bsz, -1)
+
+                if use_window_circumstance:
+                    output = self.inner_attn(
+                        *self._convert_unpacked_qkv_to_packed(q, kv, bsz, attn_mask4flsh),
+                        window_size=(self.sliding_window, 0),
+                    )
+                else:
+                    output = self.inner_attn(*self._convert_unpacked_qkv_to_packed(q, kv, bsz, attn_mask4flsh))
+                output = output.to(x.dtype)
+
+                context = torch.zeros_like(q).masked_scatter_(attn_mask4flsh.view(bsz, -1, 1, 1), output)
+
+            else:
+                attn_mask = attention_mask[:, -1, :].view(bsz, 1, 1, -1)
+                if window_size is not None and window_size <= sequence_len_offset:
+                    attn_mask = torch.concat(
+                        [
+                            attn_mask[..., : inference_params.keep_first],
+                            attn_mask[..., -(window_size - inference_params.keep_first) :],
+                        ],
+                        dim=-1,
+                    )
+
+                k, v = torch.chunk(kv, 2, dim=2)
+                k = k.squeeze(2)
+                v = v.squeeze(2)
+                sp = k.shape
+                expansion = q.size(2) // k.size(2)
+                scores = torch.einsum(
+                    "blhd,bnhd->bhln",
+                    q,
+                    k.unsqueeze(3).expand(-1, -1, -1, expansion, -1).reshape(sp[0], sp[1], q.size(2), sp[3]),
+                ) / math.sqrt(q.size(-1))
+                scores = scores.masked_fill(attn_mask, -65000.0)
+                scores = F.softmax(scores, dim=-1)  # bsz x h x L x L
+                context = torch.einsum(
+                    "bhmn,bnhd->bmhd",
+                    scores,
+                    v.unsqueeze(3).expand(-1, -1, -1, expansion, -1).reshape(sp[0], sp[1], q.size(2), sp[3]),
+                )
+
+        # wo
+        return self.wo(rearrange(context, "b s h d -> b s (h d)"))
diff --git a/internlm/model/modules/mlp.py b/internlm/model/modules/mlp.py
index 897e1363..b836ff3d 100644
--- a/internlm/model/modules/mlp.py
+++ b/internlm/model/modules/mlp.py
@@ -7,8 +7,9 @@
 from torch import nn
 
 from internlm.model.modules.linear import new_linear
-from internlm.model.modules.utils import Silu
+from internlm.model.modules.utils import Gelu, Silu
 from internlm.utils.logger import get_logger
+from internlm.utils.utils import ActivationType
 
 logger = get_logger(__file__)
 
@@ -71,10 +72,13 @@ def __init__(
     ):
         super().__init__()
 
-        # TODO: support gelu...
-        assert activation_type in ("swiglu"), f"Unsupported activation type: {activation_type}"
+        assert activation_type in (
+            ActivationType.swiglu.name,
+            ActivationType.gelu.name,
+        ), f"Unsupported activation type: {activation_type}"
 
         self.mlp_layer_fusion = mlp_layer_fusion
+        self.activation_type = activation_type
 
         hidden_features = multiple_of * ((hidden_features + multiple_of - 1) // multiple_of)
 
@@ -98,7 +102,12 @@ def forward(self, x):
         else:
             fussed_out = self.fused_w1_w3(x)
             w1_o, w3_o = torch.split(fussed_out, fussed_out.shape[-1] // 2, dim=-1)
-        out = self.w2(Silu(w1_o, w3_o))
+
+        if self.activation_type is ActivationType.swiglu.name:
+            out = self.w2(Silu(w1_o, w3_o))
+        else:
+            out = self.w2(Gelu(w1_o, w3_o))
+
         return out
diff --git a/internlm/model/modules/norm.py b/internlm/model/modules/norm.py
index b94cdd43..2a9700f8 100644
--- a/internlm/model/modules/norm.py
+++ b/internlm/model/modules/norm.py
@@ -2,6 +2,7 @@
 layer norm modules
 """
 
+import inspect
 from typing import List, Union
 
 import torch
@@ -12,8 +13,12 @@
 Shape = Union[int, List[int], torch.Size]
 
 
-def new_layer_norm(norm_type: str, normalized_shape: Shape, eps: float = 1e-5):
+def new_layer_norm(norm_type: str, normalized_shape: Shape, eps: float = 1e-5, add_unit_offset=False):
     if norm_type == "rmsnorm":
-        return RMSNorm(normalized_shape, eps)
+        rmsnorm_params = inspect.signature(RMSNorm).parameters
+        if "add_unit_offset" in rmsnorm_params:
+            return RMSNorm(normalized_shape, eps, add_unit_offset)
+        else:
+            return RMSNorm(normalized_shape, eps)
     else:  # default: layernorm
         return nn.LayerNorm(normalized_shape, eps)
diff --git a/internlm/model/modules/utils.py b/internlm/model/modules/utils.py
index dd86cb1c..bf1ae048 100644
--- a/internlm/model/modules/utils.py
+++ b/internlm/model/modules/utils.py
@@ -20,7 +20,12 @@ def Silu(w1_o, w2_o):
     return F.silu(w1_o) * w2_o
 
 
+def Gelu(w1_o, w2_o):
+    return F.gelu(w1_o) * w2_o
+
+
 Silu = torch.jit.script(Silu)
+Gelu = torch.jit.script(Gelu)
 
 
 def update_kv_cache(kv, inference_params, layer_idx):
diff --git a/internlm/model/ops/norm.py b/internlm/model/ops/norm.py
index 3cd43dab..34e7c007 100644
--- a/internlm/model/ops/norm.py
+++ b/internlm/model/ops/norm.py
@@ -35,7 +35,7 @@
 torchnpu_rmsnorm_impl = False
 
 
-def manual_rms_norm(my_input, weight, normalized_shape, eps):
+def manual_rms_norm(my_input, weight, normalized_shape, eps, add_unit_offset=False):
     # layer norm should always be calculated in float32
     dims = tuple(i for i in range(-1, -len(normalized_shape) - 1, -1))
     variance = my_input.to(torch.float32).pow(2).mean(dims, keepdim=True)
@@ -48,13 +48,16 @@ def manual_rms_norm(my_input, weight, normalized_shape, eps):
     if weight.dtype in [torch.float16, torch.bfloat16]:
         my_input = my_input.to(weight.dtype)
 
-    return weight * my_input
+    if add_unit_offset:
+        return (1 + weight) * my_input
+    else:
+        return weight * my_input
 
 
 class _RMSNorm(torch.nn.Module):
     """A generic module for RMS normalization."""
 
-    def __init__(self, normalized_shape, eps=1e-5):
+    def __init__(self, normalized_shape, eps=1e-5, add_unit_offset=False):
         super().__init__()
 
         if isinstance(normalized_shape, numbers.Integral):
@@ -62,18 +65,22 @@ def __init__(self, normalized_shape, eps=1e-5):
         self.normalized_shape = torch.Size(normalized_shape)
         self.eps = eps
         self.weight = Parameter(torch.empty(*normalized_shape))
+        self.add_unit_offset = add_unit_offset
         self.reset_parameters()
 
     def forward(self, _input: torch.Tensor):
         if apex_rmsnorm_impl:
             _norm_func = mixed_dtype_fused_rms_norm_affine
+            return _norm_func(_input, self.weight, self.normalized_shape, self.eps)
         else:
             _norm_func = manual_rms_norm
-
-        return _norm_func(_input, self.weight, self.normalized_shape, self.eps)
+            return _norm_func(_input, self.weight, self.normalized_shape, self.eps, self.add_unit_offset)
 
     def reset_parameters(self):
-        init.ones_(self.weight)
+        if self.add_unit_offset:
+            init.zeros_(self.weight)
+        else:
+            init.ones_(self.weight)
 
     def extra_repr(self):
         return f"{self.normalized_shape}, eps={self.eps}, "
diff --git a/internlm/model/registry.py b/internlm/model/registry.py
index a1921ab6..c923ec20 100644
--- a/internlm/model/registry.py
+++ b/internlm/model/registry.py
@@ -3,11 +3,14 @@
 
 from typing import Callable
 
+from internlm.model.modeling_baichuan2 import Baichuan2
+from internlm.model.modeling_gemma import Gemma
 from internlm.model.modeling_internlm import InternLM1
 from internlm.model.modeling_internlm2 import InternLM2
 from internlm.model.modeling_llama import Llama2
 from internlm.model.modeling_llava import Llava
 from internlm.model.modeling_moe import Internlm1MoE
+from internlm.model.modeling_qwen2 import Qwen2
 from internlm.utils.common import SingletonMeta
 from internlm.utils.utils import ModelType
 
@@ -83,6 +86,9 @@ def register_model_initializer() -> None:
     model_initializer.register_module(ModelType.LLAMA2.name, Llama2)
     model_initializer.register_module(ModelType.INTERNLM_MoE.name, Internlm1MoE)
     model_initializer.register_module(ModelType.LLAVA.name, Llava)
+    model_initializer.register_module(ModelType.QWEN2.name, Qwen2)
+    model_initializer.register_module(ModelType.BAICHUAN2.name, Baichuan2)
+    model_initializer.register_module(ModelType.GEMMA.name, Gemma)
 
 
 register_model_initializer()
diff --git a/internlm/utils/utils.py b/internlm/utils/utils.py
index 03dee6df..ca6b3215 100644
--- a/internlm/utils/utils.py
+++ b/internlm/utils/utils.py
@@ -47,6 +47,9 @@ class ModelType(Enum):
     LLAMA2 = 3
     INTERNLM_MoE = 4
     LLAVA = 5
+    QWEN2 = 6
+    BAICHUAN2 = 7
+    GEMMA = 8
 
 
 class DataType(Enum):
@@ -61,6 +64,11 @@ class TensorParallelMode(Enum):
     isp = 4
 
 
+class ActivationType(Enum):
+    swiglu = 1
+    gelu = 2
+
+
 def check_attention_argument(*args, **kwargs) -> str:
     # self, qkv, ...
     # self, q, kv, ....
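For reference, a minimal sketch (not part of the patch above) of the two behaviours this patch introduces: the `add_unit_offset` variant of RMSNorm, where the weight is zero-initialized and applied as `(1 + weight)` so a fresh layer starts as a pure normalization (the convention used by Gemma-style models), and the GELU-gated MLP activation that mirrors the existing SiLU gating. Function names and the simplified dtype handling below are illustrative only.

import torch
import torch.nn.functional as F


def rms_norm_reference(x, weight, eps=1e-5, add_unit_offset=False):
    # Normalize in float32 over the last dimension (a simplification of manual_rms_norm's dtype handling).
    variance = x.to(torch.float32).pow(2).mean(dim=-1, keepdim=True)
    x_normed = (x.to(torch.float32) * torch.rsqrt(variance + eps)).to(x.dtype)
    # With add_unit_offset=True the zero-initialized weight is applied as (1 + weight).
    return (1 + weight) * x_normed if add_unit_offset else weight * x_normed


def gated_gelu_reference(w1_o, w3_o):
    # Mirrors Gelu(w1_o, w2_o) in internlm/model/modules/utils.py: GELU gating instead of SiLU.
    return F.gelu(w1_o) * w3_o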