Skip to content

Commit

Permalink
Benchmarks using Tensor Primitives.
Browse files Browse the repository at this point in the history
  • Loading branch information
thygrrr committed Nov 2, 2024
1 parent ce1c720 commit 0495aed
Show file tree
Hide file tree
Showing 5 changed files with 478 additions and 31 deletions.
86 changes: 63 additions & 23 deletions fennecs.benchmarks/ECS/DorakuBenchmarks.cs
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
using System.Runtime.Intrinsics;
using System.Numerics.Tensors;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.Arm;
using System.Runtime.Intrinsics.X86;
using BenchmarkDotNet.Attributes;
using BenchmarkDotNet.Order;
using CommandLine;
using fennecs;
using fennecs_Components;
using fennecs.pools;
Expand Down Expand Up @@ -88,15 +91,15 @@ public void fennecs_For()


/// <summary>
/// Benchmark method in the "fennecs" category: calls <c>_query.For</c>,
/// passing the <c>Workload</c> method group as its argument.
/// </summary>
// NOTE(review): both an active and a commented-out [Benchmark] attribute are visible here;
// this looks like an old/new pair from a diff view — confirm which one is current.
[BenchmarkCategory("fennecs")]
[Benchmark(Description = "fennecs (For WL)")]
//[Benchmark(Description = "fennecs (For WL)")]
public void fennecs_For_WL()
{
_query.For(Workload);
}


[BenchmarkCategory("fennecs")]
[Benchmark(Description = $"fennecs (Job)")]
//[Benchmark(Description = $"fennecs (Job)")]
public void fennecs_Job()
{
_query.Job(static delegate (ref Component1 c1, ref Component2 c2, ref Component3 c3) { c1.Value = c1.Value + c2.Value + c3.Value; });
Expand All @@ -110,7 +113,7 @@ public void fennecs_Raw()
}

[BenchmarkCategory("fennecs")]
[Benchmark(Description = "fennecs (Raw U4)")]
//[Benchmark(Description = "fennecs (Raw U4)")]
public void fennecs_Raw_Unroll4()
{
// fennecs guarantees contiguous memory access in the form of Query<>.Raw(MemoryAction<>)
Expand All @@ -126,7 +129,7 @@ public void fennecs_Raw_Unroll4()
}

[BenchmarkCategory("fennecs")]
[Benchmark(Description = "fennecs (Raw U8)")]
//[Benchmark(Description = "fennecs (Raw U8)")]
public void fennecs_Raw_Unroll8()
{
// fennecs guarantees contiguous memory access in the form of Query<>.Raw(MemoryAction<>)
Expand Down Expand Up @@ -157,6 +160,23 @@ public void fennecs_Raw_AVX2()
_query.Raw(Raw_Workload_AVX2);
}


/// <summary>
/// Benchmark method in the "fennecs" and "Tensor" categories: hands
/// <c>Raw_Workload_Tensor</c> to <c>_query.Raw</c>.
/// </summary>
// NOTE(review): the method name reads "Tenor"; "Tensor" was presumably intended.
// Renaming would change the public method name, so it is left as-is here.
[BenchmarkCategory("fennecs", "Tensor")]
[Benchmark(Description = "fennecs (Raw Tensor)")]
public void fennecs_Raw_Tenor()
{
// fennecs guarantees contiguous memory access in the form of Query<>.Raw(MemoryAction<>)
// Raw runners are intended to process data or transfer it via the fastest available means.
// Example use cases:
// - transfer buffers to/from GPUs or Game Engines
// - Disk, Database, or Network I/O
// - SIMD calculations
// - snapshotting / copying / rollback / compression / decompression / diffing / permutation

// As example / reference & benchmark, the calculation here is delegated to
// Raw_Workload_Tensor, which uses TensorPrimitives.Add (System.Numerics.Tensors).
_query.Raw(Raw_Workload_Tensor);
}

[BenchmarkCategory("fennecs", nameof(Sse2))]
[Benchmark(Description = "fennecs (Raw SSE2)")]
public void fennecs_Raw_SSE2()
Expand Down Expand Up @@ -268,18 +288,18 @@ private static void Raw_Workload_AVX2(Memory<Component1> c1V, Memory<Component2>

unsafe
{
var p1 = (int*) mem1.Pointer;
var p2 = (int*) mem2.Pointer;
var p3 = (int*) mem3.Pointer;
var p1 = (float*) mem1.Pointer;
var p2 = (float*) mem2.Pointer;
var p3 = (float*) mem3.Pointer;

var vectorSize = Vector256<int>.Count;
var vectorSize = Vector256<float>.Count;
var vectorEnd = count - count % vectorSize;
for (var i = 0; i <= vectorEnd; i += vectorSize)
{
var v1 = Avx.LoadVector256(p1 + i);
var v2 = Avx.LoadVector256(p2 + i);
var v3 = Avx.LoadVector256(p3 + i);
var sum = Avx2.Add(v1, Avx2.Add(v2, v3));
var sum = Avx.Add(v1, Avx.Add(v2, v3));

Avx.Store(p1 + i, sum);
}
Expand All @@ -291,6 +311,26 @@ private static void Raw_Workload_AVX2(Memory<Component1> c1V, Memory<Component2>
}
}

/// <summary>
/// Raw workload that adds the values of <paramref name="c2V"/> and <paramref name="c3V"/>
/// onto <paramref name="c1V"/> element by element, using <see cref="TensorPrimitives"/>.
/// </summary>
/// <param name="c1V">Component1 memory; read as input and overwritten in place with the sums.</param>
/// <param name="c2V">Component2 memory; read only.</param>
/// <param name="c3V">Component3 memory; read only.</param>
private static void Raw_Workload_Tensor(Memory<Component1> c1V, Memory<Component2> c2V, Memory<Component3> c3V)
{
// Reinterpret the component spans as plain float spans (no copy is made).
Span<float> sums = MemoryMarshal.Cast<Component1, float>(c1V.Span);
ReadOnlySpan<float> second = MemoryMarshal.Cast<Component2, float>(c2V.Span);
ReadOnlySpan<float> third = MemoryMarshal.Cast<Component3, float>(c3V.Span);

// Two in-place passes, with the destination aliasing the second operand exactly:
//   sums = second + sums
//   sums = third  + sums
TensorPrimitives.Add(second, sums, sums);
TensorPrimitives.Add(third, sums, sums);
}

private static void Raw_Workload_SSE2(Memory<Component1> c1V, Memory<Component2> c2V, Memory<Component3> c3V)
{
(int Item1, int Item2) range = (0, c1V.Length);
Expand All @@ -301,21 +341,21 @@ private static void Raw_Workload_SSE2(Memory<Component1> c1V, Memory<Component2>

unsafe
{
var p1 = (int*) mem1.Pointer;
var p2 = (int*) mem2.Pointer;
var p3 = (int*) mem3.Pointer;
var p1 = (float*) mem1.Pointer;
var p2 = (float*) mem2.Pointer;
var p3 = (float*) mem3.Pointer;

var vectorSize = Vector128<int>.Count;
var vectorSize = Vector128<float>.Count;
var i = range.Item1;
var vectorEnd = range.Item2 - vectorSize;
for (; i <= vectorEnd; i += vectorSize)
{
var v1 = Sse2.LoadVector128(p1 + i);
var v2 = Sse2.LoadVector128(p2 + i);
var v3 = Sse2.LoadVector128(p3 + i);
var sum = Sse2.Add(v1, Sse2.Add(v2, v3));
var v1 = Sse.LoadVector128(p1 + i);
var v2 = Sse.LoadVector128(p2 + i);
var v3 = Sse.LoadVector128(p3 + i);
var sum = Sse.Add(v1, Sse.Add(v2, v3));

Sse2.Store(p1 + i, sum);
Sse.Store(p1 + i, sum);
}

for (; i < range.Item2; i++) // remaining elements
Expand All @@ -335,11 +375,11 @@ private static void Raw_Workload_AdvSIMD(Memory<Component1> c1V, Memory<Componen

unsafe
{
var p1 = (int*) mem1.Pointer;
var p2 = (int*) mem2.Pointer;
var p3 = (int*) mem3.Pointer;
var p1 = (float*) mem1.Pointer;
var p2 = (float*) mem2.Pointer;
var p3 = (float*) mem3.Pointer;

var vectorSize = Vector128<int>.Count;
var vectorSize = Vector128<float>.Count;
var i = range.Item1;
var vectorEnd = range.Item2 - vectorSize;
for (; i <= vectorEnd; i += vectorSize)
Expand Down
11 changes: 5 additions & 6 deletions fennecs.benchmarks/ECS/FennecsBenchmarkComponents.cs
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
using fennecs;
using System.Numerics;
using fennecs;

namespace fennecs_Components
{
internal record struct Component1(int Value);

internal record struct Component2(int Value);

internal record struct Component3(int Value);
internal record struct Component1(float Value);
internal record struct Component2(float Value);
internal record struct Component3(float Value);
}

Loading

0 comments on commit 0495aed

Please sign in to comment.