Skip to content

Releases: chengzeyi/ParaAttention

Nightly Release 20250122

22 Jan 03:02
Compare
Choose a tag to compare
Pre-release

TODO: Add nightly release notes

v0.3.10

09 Jan 08:52
Compare
Choose a tag to compare
fix cache accuracy

v0.3.9 Fastest FLUX.1-dev Inference

v0.3.8

03 Jan 09:02
1324733
Compare
Choose a tag to compare
Dev first block cache (#12)

* implement first block cache

* fix

* fix

* fix

* add doc

* fix

* make flux work

* fix

* fix

* fix

* fix

* refactor

* fix

* fix

* Update fastest_hunyuan_video.md

* Update fastest_hunyuan_video.md

* fix

* fix

* fix

* fix

v0.3.7

25 Dec 16:10
Compare
Choose a tag to compare
make hunyuan_video more robust

v0.3.6

25 Dec 15:01
1c83112
Compare
Choose a tag to compare
Update README.md

v0.3.5

19 Dec 05:40
Compare
Choose a tag to compare
remove unnecessary assert

v0.3.4

19 Dec 02:09
e91b702
Compare
Choose a tag to compare

Run HunyuanVideo🚀 with Parallel Inference

NOTE: To run HunyuanVideo, you need to install diffusers from its latest master branch.
GPUs with 80GB of memory are recommended for HunyuanVideo; with less memory you might hit OOM errors,
and performance might degrade due to frequent memory re-allocation.

import torch
import torch.distributed as dist
from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
from diffusers.utils import export_to_video

# The cuDNN SDPA backend is switched off; the error being worked around is:
# RuntimeError: Expected mha_graph->execute(handle, variant_pack, workspace_ptr.get()).is_good() to be true, but got false.
torch.backends.cuda.enable_cudnn_sdp(False)

# Every process joins the default process group and drives the GPU whose
# index equals its rank.
dist.init_process_group()
rank = dist.get_rank()
is_main_rank = rank == 0

repo_id = "tencent/HunyuanVideo"
repo_revision = "refs/pr/18"

# The transformer is requested in bfloat16, the pipeline itself in float16.
transformer = HunyuanVideoTransformer3DModel.from_pretrained(
    repo_id,
    subfolder="transformer",
    torch_dtype=torch.bfloat16,
    revision=repo_revision,
)
pipe = HunyuanVideoPipeline.from_pretrained(
    repo_id,
    transformer=transformer,
    torch_dtype=torch.float16,
    revision=repo_revision,
)
pipe = pipe.to(f"cuda:{rank}")

# Make it runnable on GPUs with 48GB memory
vae_tiling_options = {
    "tile_sample_min_height": 128,
    "tile_sample_stride_height": 96,
    "tile_sample_min_width": 128,
    "tile_sample_stride_width": 96,
    "tile_sample_min_num_frames": 32,
    "tile_sample_stride_num_frames": 24,
}
pipe.vae.enable_tiling(**vae_tiling_options)

from para_attn.context_parallel import init_context_parallel_mesh
from para_attn.context_parallel.diffusers_adapters import parallelize_pipe
from para_attn.parallel_vae.diffusers_adapters import parallelize_vae

# The pipeline gets the context-parallel mesh as-is; the VAE gets the same
# mesh after flattening.
cp_mesh = init_context_parallel_mesh(pipe.device.type)
parallelize_pipe(pipe, mesh=cp_mesh)
parallelize_vae(pipe.vae, mesh=cp_mesh._flatten())

# Optional: fix OOM because of awful inductor lowering of attn_bias of
# _scaled_dot_product_efficient_attention
# import para_attn
# para_attn.config.attention.force_dispatch_to_custom_ops = True

# Optional: compile the transformer
# torch._inductor.config.reorder_for_compute_comm_overlap = True
# pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune-no-cudagraphs")

result = pipe(
    prompt="A cat walks on the grass, realistic",
    height=320,
    width=512,
    num_frames=61,
    num_inference_steps=30,
    # Only rank 0 exports the video, so only rank 0 asks for PIL frames.
    output_type="pil" if is_main_rank else "pt",
)
frames = result.frames[0]

if is_main_rank:
    print("Saving video to hunyuan_video.mp4")
    export_to_video(frames, "hunyuan_video.mp4", fps=15)

dist.destroy_process_group()

Save the above code to run_hunyuan_video.py and run it with torchrun:

torchrun --nproc_per_node=2 run_hunyuan_video.py

v0.3.3

26 Nov 09:12
Compare
Choose a tag to compare
set output_type to latent for all ranks except rank 0

v0.3.2

19 Nov 15:42
6668e18
Compare
Choose a tag to compare

🚀Support Multi-GPU Parallel Inference Speedup for CogVideoX

Everything works out of the box!

import torch
import torch.distributed as dist
from diffusers import CogVideoXPipeline
from diffusers.utils import export_to_video

# Every process joins the default process group and drives the GPU whose
# index equals its rank.
dist.init_process_group()
rank = dist.get_rank()

pipe = CogVideoXPipeline.from_pretrained(
    "THUDM/CogVideoX-5b",
    torch_dtype=torch.bfloat16,
)
pipe = pipe.to(f"cuda:{rank}")

# CPU offload variants, left disabled:
# pipe.enable_model_cpu_offload()
# pipe.enable_sequential_cpu_offload()
vae = pipe.vae
vae.enable_slicing()
vae.enable_tiling()

from para_attn.context_parallel import init_context_parallel_mesh
from para_attn.context_parallel.diffusers_adapters import parallelize_pipe

# Build the context-parallel mesh (batch and ring dimensions each capped at 2)
# and hand it to the pipeline adapter.
cp_mesh = init_context_parallel_mesh(
    pipe.device.type,
    max_batch_dim_size=2,
    max_ring_dim_size=2,
)
parallelize_pipe(pipe, mesh=cp_mesh)

torch._inductor.config.reorder_for_compute_comm_overlap = True
pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune-no-cudagraphs")

prompt = "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical atmosphere of this unique musical performance."
result = pipe(
    prompt=prompt,
    num_videos_per_prompt=1,
    num_inference_steps=50,
    num_frames=49,
    guidance_scale=6,
    # generator=torch.Generator(device=pipe.device).manual_seed(42),
)
video = result.frames[0]

# Only rank 0 writes the output file.
if rank == 0:
    print("Saving video to cogvideox.mp4")
    export_to_video(video, "cogvideox.mp4", fps=8)

dist.destroy_process_group()