Merge pull request #1 from DarthAffe/ParallelConversion

Parallel conversion
This commit is contained in:
DarthAffe 2024-08-11 16:02:53 +02:00 committed by GitHub
commit b9aa24aeec
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 342 additions and 79 deletions

View File

@ -5,6 +5,12 @@ namespace HPPH;
public static unsafe partial class PixelHelper
{
#region Constants
// Minimum number of vector-sized chunks a parallel batch should contain.
// The batch count is computed as Max(1, Min(chunks / MIN_BATCH_SIZE, Environment.ProcessorCount)),
// so inputs with fewer than 2 * MIN_BATCH_SIZE chunks are converted as a single, non-parallel batch.
private const int MIN_BATCH_SIZE = 8;
#endregion
#region Methods
public static Span<TTarget> ConvertInPlace<TSource, TTarget>(this Span<TSource> colors)
@ -105,6 +111,8 @@ public static unsafe partial class PixelHelper
private static void Convert3Bytes(ReadOnlySpan<byte> source, Span<byte> target, IColorFormat sourceFormat, IColorFormat targetFormat)
{
const int BPP = 3;
ReadOnlySpan<byte> sourceMapping = sourceFormat.ByteMapping;
ReadOnlySpan<byte> targetMapping = targetFormat.ByteMapping;
@ -133,12 +141,97 @@ public static unsafe partial class PixelHelper
15
];
Vector128<byte> maskVector = Vector128.LoadUnsafe(ref MemoryMarshal.GetReference(mask));
ConvertSameBpp(source, target, mask, 3);
int elements = source.Length / BPP;
int elementsPerVector = Vector128<byte>.Count / BPP;
int bytesPerVector = elementsPerVector * BPP;
int chunks = elements / elementsPerVector;
int batches = Math.Max(1, Math.Min(chunks / MIN_BATCH_SIZE, Environment.ProcessorCount));
int batchSize = elements / batches;
fixed (byte* fixedSourcePtr = source)
fixed (byte* fixedTargetPtr = target)
{
byte* sourcePtr = fixedSourcePtr;
byte* targetPtr = fixedTargetPtr;
if (batches == 1)
{
byte* src = sourcePtr;
byte* tar = targetPtr;
int chunkCount = Math.Max(0, (batchSize / elementsPerVector) - 1);
int missingElements = batchSize - (chunkCount * elementsPerVector);
for (int i = 0; i < chunkCount; i++)
{
Vector128<byte> vector = Vector128.Load(src);
Vector128.Shuffle(vector, maskVector).Store(tar);
src += bytesPerVector;
tar += bytesPerVector;
}
for (int i = 0; i < missingElements; i++)
{
tar[(i * BPP) + 0] = src[(i * BPP) + maskVector[0]];
tar[(i * BPP) + 1] = src[(i * BPP) + maskVector[1]];
tar[(i * BPP) + 2] = src[(i * BPP) + maskVector[2]];
}
}
else
{
Parallel.For(0, batches, Process);
int missing = elements - (batchSize * batches);
if (missing > 0)
{
byte* missingSrc = sourcePtr + (batches * batchSize * BPP);
byte* missingTar = targetPtr + (batches * batchSize * BPP);
for (int i = 0; i < missing; i++)
{
missingTar[(i * BPP) + 0] = missingSrc[(i * BPP) + maskVector[0]];
missingTar[(i * BPP) + 1] = missingSrc[(i * BPP) + maskVector[1]];
missingTar[(i * BPP) + 2] = missingSrc[(i * BPP) + maskVector[2]];
}
}
// Converts the batch with the given index: batchSize whole pixels starting at pixel (index * batchSize).
// Called once per batch by Parallel.For; batches cover disjoint pixel ranges.
// NOTE(review): src and tar may point at the same memory when this runs as part of an in-place conversion - confirm against the caller.
void Process(int index)
{
    int offset = index * batchSize;
    byte* src = sourcePtr + (offset * BPP);
    byte* tar = targetPtr + (offset * BPP);

    // A Vector128 load/store touches 16 bytes but only bytesPerVector (15) of them are pixel data.
    // The last full vector of the batch is therefore left to the scalar loop (-1) so the extra byte never leaves the batch.
    int chunkCount = Math.Max(0, (batchSize / elementsPerVector) - 1);
    int missingElements = batchSize - (chunkCount * elementsPerVector);

    for (int i = 0; i < chunkCount; i++)
    {
        Vector128<byte> vector = Vector128.Load(src);
        Vector128.Shuffle(vector, maskVector).Store(tar);

        src += bytesPerVector;
        tar += bytesPerVector;
    }

    // The first BPP mask entries describe the byte order of a single pixel; hoisted since they never change.
    int map0 = maskVector[0];
    int map1 = maskVector[1];
    int map2 = maskVector[2];

    for (int i = 0; i < missingElements; i++)
    {
        int pixel = i * BPP;

        // Read the complete pixel before writing any of its bytes.
        // Writing tar[pixel + 0] first would corrupt a later read of the same byte when src and tar alias.
        byte b0 = src[pixel + map0];
        byte b1 = src[pixel + map1];
        byte b2 = src[pixel + map2];

        tar[pixel + 0] = b0;
        tar[pixel + 1] = b1;
        tar[pixel + 2] = b2;
    }
}
}
}
}
private static void Convert4Bytes(ReadOnlySpan<byte> source, Span<byte> target, IColorFormat sourceFormat, IColorFormat targetFormat)
{
const int BPP = 4;
ReadOnlySpan<byte> sourceMapping = sourceFormat.ByteMapping;
ReadOnlySpan<byte> targetMapping = targetFormat.ByteMapping;
@ -166,26 +259,31 @@ public static unsafe partial class PixelHelper
(byte)(mapping[3] + 12),
];
ConvertSameBpp(source, target, mask, 4);
}
private static void ConvertSameBpp(ReadOnlySpan<byte> source, Span<byte> target, ReadOnlySpan<byte> mask, int bpp)
{
int elementsPerVector = Vector128<byte>.Count / bpp;
int bytesPerVector = elementsPerVector * bpp;
int chunks = source.Length / bytesPerVector;
Vector128<byte> maskVector = Vector128.LoadUnsafe(ref MemoryMarshal.GetReference(mask));
int missingElements = (source.Length - (chunks * bytesPerVector)) / bpp;
int elements = source.Length / BPP;
int elementsPerVector = Vector128<byte>.Count / BPP;
int bytesPerVector = elementsPerVector * BPP;
fixed (byte* sourcePtr = source)
fixed (byte* targetPtr = target)
int chunks = elements / elementsPerVector;
int batches = Math.Max(1, Math.Min(chunks / MIN_BATCH_SIZE, Environment.ProcessorCount));
int batchSize = elements / batches;
fixed (byte* fixedSourcePtr = source)
fixed (byte* fixedTargetPtr = target)
{
byte* sourcePtr = fixedSourcePtr;
byte* targetPtr = fixedTargetPtr;
if (batches == 1)
{
byte* src = sourcePtr;
byte* tar = targetPtr;
for (int i = 0; i < chunks; i++)
int chunkCount = batchSize / elementsPerVector;
int missingElements = batchSize - (chunkCount * elementsPerVector);
for (int i = 0; i < chunkCount; i++)
{
Vector128<byte> vector = Vector128.Load(src);
Vector128.Shuffle(vector, maskVector).Store(tar);
@ -194,21 +292,73 @@ public static unsafe partial class PixelHelper
tar += bytesPerVector;
}
Span<byte> buffer = stackalloc byte[missingElements * bpp]; // DarthAffe 08.07.2024: This is fine as it's always < 16 bytes
for (int j = 0; j < buffer.Length; j++)
buffer[j] = src[mask[j]];
for (int i = 0; i < missingElements; i++)
{
tar[(i * BPP) + 0] = src[(i * BPP) + maskVector[0]];
tar[(i * BPP) + 1] = src[(i * BPP) + maskVector[1]];
tar[(i * BPP) + 2] = src[(i * BPP) + maskVector[2]];
tar[(i * BPP) + 3] = src[(i * BPP) + maskVector[3]];
}
}
else
{
Parallel.For(0, batches, Process);
buffer.CopyTo(new Span<byte>(tar, buffer.Length));
int missing = elements - (batchSize * batches);
if (missing > 0)
{
byte* missingSrc = sourcePtr + (batches * batchSize * BPP);
byte* missingTar = targetPtr + (batches * batchSize * BPP);
for (int i = 0; i < missing; i++)
{
missingTar[(i * BPP) + 0] = missingSrc[(i * BPP) + maskVector[0]];
missingTar[(i * BPP) + 1] = missingSrc[(i * BPP) + maskVector[1]];
missingTar[(i * BPP) + 2] = missingSrc[(i * BPP) + maskVector[2]];
missingTar[(i * BPP) + 3] = missingSrc[(i * BPP) + maskVector[3]];
}
}
// Converts the batch with the given index: batchSize whole pixels starting at pixel (index * batchSize).
// Called once per batch by Parallel.For; batches cover disjoint pixel ranges.
// NOTE(review): src and tar may point at the same memory when this runs as part of an in-place conversion - confirm against the caller.
void Process(int index)
{
    int offset = index * batchSize;
    byte* src = sourcePtr + (offset * BPP);
    byte* tar = targetPtr + (offset * BPP);

    // With 4 bytes per pixel a vector holds only whole pixels, so every full vector of the batch can be shuffled directly.
    int chunkCount = batchSize / elementsPerVector;
    int missingElements = batchSize - (chunkCount * elementsPerVector);

    for (int i = 0; i < chunkCount; i++)
    {
        Vector128<byte> vector = Vector128.Load(src);
        Vector128.Shuffle(vector, maskVector).Store(tar);

        src += bytesPerVector;
        tar += bytesPerVector;
    }

    // The first BPP mask entries describe the byte order of a single pixel; hoisted since they never change.
    int map0 = maskVector[0];
    int map1 = maskVector[1];
    int map2 = maskVector[2];
    int map3 = maskVector[3];

    for (int i = 0; i < missingElements; i++)
    {
        int pixel = i * BPP;

        // Read the complete pixel before writing any of its bytes.
        // Writing tar[pixel + 0] first would corrupt a later read of the same byte when src and tar alias.
        byte b0 = src[pixel + map0];
        byte b1 = src[pixel + map1];
        byte b2 = src[pixel + map2];
        byte b3 = src[pixel + map3];

        tar[pixel + 0] = b0;
        tar[pixel + 1] = b1;
        tar[pixel + 2] = b2;
        tar[pixel + 3] = b3;
    }
}
}
}
}
private static void ConvertWiden3To4Bytes(ReadOnlySpan<byte> source, Span<byte> target, IColorFormat sourceFormat, IColorFormat targetFormat)
{
const int SOURCE_BPP = 3;
const int TARGET_BPP = 4;
ReadOnlySpan<byte> sourceMapping = sourceFormat.ByteMapping;
ReadOnlySpan<byte> targetMapping = targetFormat.ByteMapping;
// DarthAffe 08.07.2024: For now alpha is the only thing to be added
Span<byte> isAlpha =
byte[] isAlpha =
[
targetMapping[0] == Color.A ? byte.MaxValue : (byte)0,
targetMapping[1] == Color.A ? byte.MaxValue : (byte)0,
@ -270,26 +420,33 @@ public static unsafe partial class PixelHelper
isAlpha[3],
];
int sourceBpp = sourceFormat.BytesPerPixel;
int targetBpp = targetFormat.BytesPerPixel;
int targetElementsPerVector = Vector128<byte>.Count / targetBpp;
int targetBytesPerVector = targetElementsPerVector * targetBpp;
int sourceBytesPerVector = targetElementsPerVector * sourceBpp;
int chunks = (source.Length / sourceBytesPerVector);
Vector128<byte> maskVector = Vector128.LoadUnsafe(ref MemoryMarshal.GetReference(mask));
Vector128<byte> alphaMaskVector = Vector128.LoadUnsafe(ref MemoryMarshal.GetReference(alphaMask));
int missingElements = (source.Length - (chunks * sourceBytesPerVector)) / sourceBpp;
int elements = source.Length / SOURCE_BPP;
int targetElementsPerVector = Vector128<byte>.Count / TARGET_BPP;
int sourceBytesPerVector = targetElementsPerVector * SOURCE_BPP;
int targetBytesPerVector = targetElementsPerVector * TARGET_BPP;
fixed (byte* sourcePtr = source)
fixed (byte* targetPtr = target)
int chunks = elements / targetElementsPerVector;
int batches = Math.Max(1, Math.Min(chunks / MIN_BATCH_SIZE, Environment.ProcessorCount));
int batchSize = elements / batches;
fixed (byte* fixedSourcePtr = source)
fixed (byte* fixedTargetPtr = target)
{
byte* sourcePtr = fixedSourcePtr;
byte* targetPtr = fixedTargetPtr;
if (batches == 1)
{
byte* src = sourcePtr;
byte* tar = targetPtr;
for (int i = 0; i < chunks; i++)
int chunkCount = batchSize / targetElementsPerVector;
int missingElements = batchSize - (chunkCount * targetElementsPerVector);
for (int i = 0; i < chunkCount; i++)
{
Vector128<byte> vector = Vector128.Load(src);
Vector128<byte> shuffled = Vector128.Shuffle(vector, maskVector);
@ -299,17 +456,69 @@ public static unsafe partial class PixelHelper
tar += targetBytesPerVector;
}
Span<byte> buffer = stackalloc byte[missingElements * targetBpp]; // DarthAffe 08.07.2024: This is fine as it's always < 16 bytes
for (int i = 0; i < missingElements; i++)
for (int j = 0; j < targetBpp; j++)
buffer[(i * targetBpp) + j] = Math.Max(isAlpha[j], src[(i * sourceBpp) + mask[j]]);
{
tar[(i * TARGET_BPP) + 0] = Math.Max(isAlpha[0], src[(i * SOURCE_BPP) + maskVector[0]]);
tar[(i * TARGET_BPP) + 1] = Math.Max(isAlpha[1], src[(i * SOURCE_BPP) + maskVector[1]]);
tar[(i * TARGET_BPP) + 2] = Math.Max(isAlpha[2], src[(i * SOURCE_BPP) + maskVector[2]]);
tar[(i * TARGET_BPP) + 3] = Math.Max(isAlpha[3], src[(i * SOURCE_BPP) + maskVector[3]]);
}
}
else
{
Parallel.For(0, batches, Process);
buffer.CopyTo(new Span<byte>(tar, buffer.Length));
int missing = elements - (batchSize * batches);
if (missing > 0)
{
byte* missingSrc = sourcePtr + (batches * batchSize * SOURCE_BPP);
byte* missingTar = targetPtr + (batches * batchSize * TARGET_BPP);
for (int i = 0; i < missing; i++)
{
missingTar[(i * TARGET_BPP) + 0] = Math.Max(isAlpha[0], missingSrc[(i * SOURCE_BPP) + maskVector[0]]);
missingTar[(i * TARGET_BPP) + 1] = Math.Max(isAlpha[1], missingSrc[(i * SOURCE_BPP) + maskVector[1]]);
missingTar[(i * TARGET_BPP) + 2] = Math.Max(isAlpha[2], missingSrc[(i * SOURCE_BPP) + maskVector[2]]);
missingTar[(i * TARGET_BPP) + 3] = Math.Max(isAlpha[3], missingSrc[(i * SOURCE_BPP) + maskVector[3]]);
}
}
// Converts the batch with the given index from 3 to 4 bytes per pixel:
// batchSize whole pixels starting at pixel (index * batchSize).
// Called once per batch by Parallel.For; batches cover disjoint pixel ranges.
void Process(int index)
{
    int offset = index * batchSize;
    byte* src = sourcePtr + (offset * SOURCE_BPP);
    byte* tar = targetPtr + (offset * TARGET_BPP);

    // Vector128.Load reads 16 bytes, but each chunk only consumes sourceBytesPerVector (12) source bytes.
    // Loading the last full chunk would read 4 bytes past the end of the batch (and, for the final batch,
    // past the end of the source), so that chunk is handled by the scalar loop instead (-1).
    int chunkCount = Math.Max(0, (batchSize / targetElementsPerVector) - 1);
    int missingElements = batchSize - (chunkCount * targetElementsPerVector);

    for (int i = 0; i < chunkCount; i++)
    {
        Vector128<byte> vector = Vector128.Load(src);
        Vector128<byte> shuffled = Vector128.Shuffle(vector, maskVector);
        // OR-ing with the alpha mask forces the added channel to its maximum value.
        Vector128.BitwiseOr(shuffled, alphaMaskVector).Store(tar);

        src += sourceBytesPerVector;
        tar += targetBytesPerVector;
    }

    for (int i = 0; i < missingElements; i++)
    {
        // isAlpha[j] is byte.MaxValue for the added channel and 0 otherwise,
        // so Math.Max yields the same result as the OR in the vector path.
        tar[(i * TARGET_BPP) + 0] = Math.Max(isAlpha[0], src[(i * SOURCE_BPP) + maskVector[0]]);
        tar[(i * TARGET_BPP) + 1] = Math.Max(isAlpha[1], src[(i * SOURCE_BPP) + maskVector[1]]);
        tar[(i * TARGET_BPP) + 2] = Math.Max(isAlpha[2], src[(i * SOURCE_BPP) + maskVector[2]]);
        tar[(i * TARGET_BPP) + 3] = Math.Max(isAlpha[3], src[(i * SOURCE_BPP) + maskVector[3]]);
    }
}
}
}
}
private static void ConvertNarrow4To3Bytes(ReadOnlySpan<byte> source, Span<byte> target, IColorFormat sourceFormat, IColorFormat targetFormat)
{
const int SOURCE_BPP = 4;
const int TARGET_BPP = 3;
ReadOnlySpan<byte> sourceMapping = sourceFormat.ByteMapping;
ReadOnlySpan<byte> targetMapping = targetFormat.ByteMapping;
@ -340,25 +549,32 @@ public static unsafe partial class PixelHelper
15
];
int sourceBpp = sourceFormat.BytesPerPixel;
int targetBpp = targetFormat.BytesPerPixel;
int sourceElementsPerVector = Vector128<byte>.Count / sourceBpp;
int sourceBytesPerVector = sourceElementsPerVector * sourceBpp;
int targetBytesPerVector = sourceElementsPerVector * targetBpp;
int chunks = (source.Length / sourceBytesPerVector) - 1; // DarthAffe 08.07.2024: -1 since we don't have enough space to copy a full target vector for the last set
Vector128<byte> maskVector = Vector128.LoadUnsafe(ref MemoryMarshal.GetReference(mask));
int missingElements = (source.Length - (chunks * sourceBytesPerVector)) / sourceBpp;
int elements = source.Length / SOURCE_BPP;
int sourceElementsPerVector = Vector128<byte>.Count / SOURCE_BPP;
int sourceBytesPerVector = sourceElementsPerVector * SOURCE_BPP;
int targetBytesPerVector = sourceElementsPerVector * TARGET_BPP;
fixed (byte* sourcePtr = source)
fixed (byte* targetPtr = target)
int chunks = elements / sourceElementsPerVector;
int batches = Math.Max(1, Math.Min(chunks / MIN_BATCH_SIZE, Environment.ProcessorCount));
int batchSize = elements / batches;
fixed (byte* fixedSourcePtr = source)
fixed (byte* fixedTargetPtr = target)
{
byte* sourcePtr = fixedSourcePtr;
byte* targetPtr = fixedTargetPtr;
if (batches == 1)
{
byte* src = sourcePtr;
byte* tar = targetPtr;
for (int i = 0; i < chunks; i++)
int chunkCount = Math.Max(0, (batchSize / sourceElementsPerVector) - 1); // DarthAffe 08.07.2024: -1 since we don't have enough space to copy a full target vector for the last set
int missingElements = batchSize - (chunkCount * sourceElementsPerVector);
for (int i = 0; i < chunkCount; i++)
{
Vector128<byte> vector = Vector128.Load(src);
Vector128.Shuffle(vector, maskVector).Store(tar);
@ -367,12 +583,57 @@ public static unsafe partial class PixelHelper
tar += targetBytesPerVector;
}
Span<byte> buffer = stackalloc byte[missingElements * targetBpp]; // DarthAffe 08.07.2024: This is fine as it's always < 24 bytes
for (int i = 0; i < missingElements; i++)
for (int j = 0; j < targetBpp; j++)
buffer[(i * targetBpp) + j] = src[(i * sourceBpp) + mask[j]];
{
tar[(i * TARGET_BPP) + 0] = src[(i * SOURCE_BPP) + mapping[0]];
tar[(i * TARGET_BPP) + 1] = src[(i * SOURCE_BPP) + mapping[1]];
tar[(i * TARGET_BPP) + 2] = src[(i * SOURCE_BPP) + mapping[2]];
}
}
else
{
Parallel.For(0, batches, Process);
buffer.CopyTo(new Span<byte>(tar, buffer.Length));
int missing = elements - (batchSize * batches);
if (missing > 0)
{
byte* missingSrc = sourcePtr + (batches * batchSize * SOURCE_BPP);
byte* missingTar = targetPtr + (batches * batchSize * TARGET_BPP);
for (int i = 0; i < missing; i++)
{
missingTar[(i * TARGET_BPP) + 0] = missingSrc[(i * SOURCE_BPP) + maskVector[0]];
missingTar[(i * TARGET_BPP) + 1] = missingSrc[(i * SOURCE_BPP) + maskVector[1]];
missingTar[(i * TARGET_BPP) + 2] = missingSrc[(i * SOURCE_BPP) + maskVector[2]];
}
}
// Converts the batch with the given index from 4 to 3 bytes per pixel:
// batchSize whole pixels starting at pixel (index * batchSize).
void Process(int index)
{
    int firstElement = index * batchSize;
    byte* input = sourcePtr + (firstElement * SOURCE_BPP);
    byte* output = targetPtr + (firstElement * TARGET_BPP);

    // One vector less than would fit: a vector store writes more bytes than the target advances,
    // so the last set has no room for a full target vector and is converted element by element.
    int vectorCount = Math.Max(0, (batchSize / sourceElementsPerVector) - 1);
    int scalarCount = batchSize - (vectorCount * sourceElementsPerVector);

    for (int remaining = vectorCount; remaining > 0; remaining--)
    {
        Vector128.Shuffle(Vector128.Load(input), maskVector).Store(output);

        input += sourceBytesPerVector;
        output += targetBytesPerVector;
    }

    for (int element = 0; element < scalarCount; element++)
    {
        byte* from = input + (element * SOURCE_BPP);
        byte* to = output + (element * TARGET_BPP);

        to[0] = from[maskVector[0]];
        to[1] = from[maskVector[1]];
        to[2] = from[maskVector[2]];
    }
}
}
}
}

View File

@ -98,8 +98,10 @@ All of the included formats can freely be converted between each other.
Allocation-free in-place conversion is only supported between formats of the same size (both 24-bit or both 32-bit).
| Method | Mean | Error | StdDev | Allocated |
|----------- |---------:|----------:|----------:|----------:|
| RGBToBGR | 6.272 ms | 0.0288 ms | 0.0240 ms | 8.81 MB |
| RGBToBGRA | 8.534 ms | 0.0684 ms | 0.0640 ms | 11.75 MB |
| RGBAToABGR | 8.128 ms | 0.0927 ms | 0.0867 ms | 11.75 MB |
| ARGBToBGR | 8.004 ms | 0.0353 ms | 0.0313 ms | 8.81 MB |
|------------------- |---------:|----------:|----------:|------------:|
| RGBToBGR | 1.487 ms | 0.0221 ms | 0.0196 ms | 9073.58 KB |
| RGBToBGRA | 1.676 ms | 0.0330 ms | 0.0353 ms | 12064.76 KB |
| RGBAToABGR | 1.766 ms | 0.0348 ms | 0.0476 ms | 12084.93 KB |
| ARGBToBGR | 1.533 ms | 0.0072 ms | 0.0064 ms | 9085.36 KB |
| RGBToBGR_InPlace | 1.025 ms | 0.0021 ms | 0.0017 ms | 34.47 KB |
| RGBAToABGR_InPlace | 1.054 ms | 0.0023 ms | 0.0020 ms | 34.16 KB |