using System.Runtime.InteropServices;
using System.Runtime.Intrinsics.X86;
using System.Runtime.Intrinsics;
using System.Buffers;
using System.Runtime.CompilerServices;

namespace HPPH;

public static unsafe partial class PixelHelper
{
    #region Methods

    /// <summary>
    /// Computes the per-channel sum of all pixels of the given image.
    /// </summary>
    /// <remarks>
    /// The pixel data is copied into a temporary buffer first: on the stack when it is at most
    /// 1024 bytes, otherwise into an array rented from <see cref="ArrayPool{T}.Shared"/> which is
    /// returned again even if summing throws.
    /// </remarks>
    /// <param name="image">The image to sum.</param>
    /// <returns>The sum as produced by the image's color format.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="image"/> is <c>null</c>.</exception>
    public static ISum Sum(this IImage image)
    {
        ArgumentNullException.ThrowIfNull(image);

        int dataLength = image.SizeInBytes;

        if (dataLength <= 1024)
        {
            // Small enough to keep the scratch copy on the stack.
            Span<byte> buffer = stackalloc byte[dataLength];

            image.CopyTo(buffer);
            return image.ColorFormat.Sum(buffer);
        }
        else
        {
            byte[] array = ArrayPool<byte>.Shared.Rent(dataLength);

            // Rent may hand out a larger array; only the first dataLength bytes are used.
            Span<byte> buffer = array.AsSpan()[..dataLength];
            try
            {
                image.CopyTo(buffer);
                return image.ColorFormat.Sum(buffer);
            }
            finally
            {
                ArrayPool<byte>.Shared.Return(array);
            }
        }
    }

    /// <summary>
    /// Computes the per-channel sum of all pixels of the given typed image.
    /// </summary>
    /// <typeparam name="T">The color type of the image.</typeparam>
    /// <param name="image">The image to sum.</param>
    /// <returns>The sum as produced by the color format of <typeparamref name="T"/>.</returns>
    public static ISum Sum<T>(this IImage<T> image)
        where T : struct, IColor
        => image.AsRefImage().Sum();

    /// <summary>
    /// Computes the per-channel sum of all pixels of the given ref-image.
    /// </summary>
    /// <remarks>
    /// The pixels are copied into a contiguous temporary buffer first: on the stack when they occupy
    /// at most 1024 bytes, otherwise into an array rented from <see cref="ArrayPool{T}.Shared"/>.
    /// </remarks>
    /// <typeparam name="T">The color type of the image.</typeparam>
    /// <param name="image">The image to sum.</param>
    /// <returns>The sum as produced by the color format of <typeparamref name="T"/>.</returns>
    public static ISum Sum<T>(this RefImage<T> image)
        where T : struct, IColor
    {
        int dataLength = image.Width * image.Height;
        int sizeInBytes = dataLength * T.ColorFormat.BytesPerPixel;

        if (sizeInBytes <= 1024)
        {
            // Stack memory is allocated as bytes and reinterpreted as T.
            Span<T> buffer = MemoryMarshal.Cast<byte, T>(stackalloc byte[sizeInBytes]);

            image.CopyTo(buffer);
            return Sum(buffer);
        }
        else
        {
            T[] array = ArrayPool<T>.Shared.Rent(dataLength);

            // Rent may hand out a larger array; only the first dataLength elements are used.
            Span<T> buffer = array.AsSpan()[..dataLength];
            try
            {
                image.CopyTo(buffer);
                return Sum(buffer);
            }
            finally
            {
                ArrayPool<T>.Shared.Return(array);
            }
        }
    }

    /// <summary>
    /// Computes the per-channel sum of the given colors.
    /// </summary>
    /// <typeparam name="T">The color type.</typeparam>
    /// <param name="colors">The colors to sum.</param>
    /// <returns>The sum as produced by the color format of <typeparamref name="T"/>.</returns>
    public static ISum Sum<T>(this ReadOnlySpan<T> colors)
        where T : struct, IColor
        => T.ColorFormat.Sum(MemoryMarshal.AsBytes(colors));

    /// <summary>
    /// Computes the per-channel sum of the given colors.
    /// </summary>
    /// <typeparam name="T">The color type.</typeparam>
    /// <param name="colors">The colors to sum.</param>
    /// <returns>The sum as produced by the color format of <typeparamref name="T"/>.</returns>
    public static ISum Sum<T>(this Span<T> colors)
        where T : struct, IColor
        => T.ColorFormat.Sum(MemoryMarshal.AsBytes(colors));

    /// <summary>
    /// Sums the given colors by reinterpreting them as generic 3- or 4-byte data, depending on the
    /// bytes per pixel of the color format of <typeparamref name="T"/>, and bit-casts the resulting
    /// four 64-bit sums to <typeparamref name="TSum"/>.
    /// </summary>
    /// <typeparam name="T">The color type.</typeparam>
    /// <typeparam name="TSum">The sum type the result is bit-cast to.</typeparam>
    /// <param name="colors">The colors to sum.</param>
    /// <returns>The sum, boxed as <see cref="ISum"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="colors"/> equals the default span.</exception>
    /// <exception cref="NotSupportedException">The color format is neither 3 nor 4 bytes per pixel.</exception>
    internal static ISum Sum<T, TSum>(ReadOnlySpan<T> colors)
        where T : struct, IColor
        where TSum : struct, ISum
    {
        // NOTE(review): a span is never null; 'null' converts to the default (empty) span here, so this
        // only triggers for a default span. Kept unchanged to preserve the exception callers may observe.
        if (colors == null) throw new ArgumentNullException(nameof(colors));

        return T.ColorFormat.BytesPerPixel switch
        {
            // DarthAffe 05.07.2024: Important: The sum of 3-byte colors result in 4 byte data!
            3 => Unsafe.BitCast<Generic4LongData, TSum>(Sum(MemoryMarshal.Cast<T, Generic3ByteData>(colors))),
            4 => Unsafe.BitCast<Generic4LongData, TSum>(Sum(MemoryMarshal.Cast<T, Generic4ByteData>(colors))),
            _ => throw new NotSupportedException("Data is not of a supported valid color-type.")
        };
    }

    /// <summary>
    /// Sums the three bytes of every element separately.
    /// </summary>
    /// <remarks>
    /// When AVX2 is available and at least 32 elements are present, 32 elements (96 bytes) are processed
    /// per iteration; the remaining elements are summed scalar. The fourth component of the result is
    /// <c>data.Length * 255</c>, i.e. as if every element carried a fourth byte of 255
    /// (presumably a fully opaque alpha channel).
    /// </remarks>
    /// <param name="data">The 3-byte elements to sum.</param>
    /// <returns>The three per-byte sums plus the synthesized fourth component.</returns>
    private static Generic4LongData Sum(ReadOnlySpan<Generic3ByteData> data)
    {
        long b1Sum = 0, b2Sum = 0, b3Sum = 0;

        const int ELEMENTS_PER_VECTOR = 32;
        const int BYTES_PER_CHUNK = ELEMENTS_PER_VECTOR * 3;

        int chunks;
        if (Avx2.IsSupported && ((chunks = data.Length / ELEMENTS_PER_VECTOR) > 0))
        {
            // Each mask selects every third byte, starting at offset 0, 1 and 2 respectively.
            ReadOnlySpan<byte> blendMask1 = [255, 0, 0, 255, 0, 0, 255, 0, 0, 255, 0, 0, 255, 0, 0, 255, 0, 0, 255, 0, 0, 255, 0, 0, 255, 0, 0, 255, 0, 0, 255, 0];
            ReadOnlySpan<byte> blendMask2 = [0, 255, 0, 0, 255, 0, 0, 255, 0, 0, 255, 0, 0, 255, 0, 0, 255, 0, 0, 255, 0, 0, 255, 0, 0, 255, 0, 0, 255, 0, 0, 255];
            ReadOnlySpan<byte> blendMask3 = [0, 0, 255, 0, 0, 255, 0, 0, 255, 0, 0, 255, 0, 0, 255, 0, 0, 255, 0, 0, 255, 0, 0, 255, 0, 0, 255, 0, 0, 255, 0, 0];

            Vector256<byte> blend1MaskVector = Vector256.LoadUnsafe(ref MemoryMarshal.GetReference(blendMask1));
            Vector256<byte> blend2MaskVector = Vector256.LoadUnsafe(ref MemoryMarshal.GetReference(blendMask2));
            Vector256<byte> blend3MaskVector = Vector256.LoadUnsafe(ref MemoryMarshal.GetReference(blendMask3));

            Vector256<long> b1SumVector = Vector256<long>.Zero;
            Vector256<long> b2SumVector = Vector256<long>.Zero;
            Vector256<long> b3SumVector = Vector256<long>.Zero;

            int missingElements = data.Length - (chunks * ELEMENTS_PER_VECTOR);

            ReadOnlySpan<byte> dataBytes = MemoryMarshal.AsBytes(data);
            fixed (byte* bytePtr = dataBytes)
            {
                for (int i = 0; i < chunks; i++)
                {
                    byte* basePtr = bytePtr + (i * BYTES_PER_CHUNK);

                    Vector256<byte> data1 = Vector256.Load(basePtr);
                    Vector256<byte> data2 = Vector256.Load(basePtr + 32);
                    Vector256<byte> data3 = Vector256.Load(basePtr + 64);

                    // Since 32 is not a multiple of 3, the byte belonging to a given channel sits at a
                    // different offset (mod 3) in each of the three vectors. Two blends per channel pick
                    // the matching positions from data1/data2/data3 so each result holds one channel only.
                    Vector256<byte> vectorB1Blend1 = Avx2.BlendVariable(data2, data1, blend1MaskVector);
                    Vector256<byte> vectorB2Blend1 = Avx2.BlendVariable(data2, data1, blend2MaskVector);
                    Vector256<byte> vectorB3Blend1 = Avx2.BlendVariable(data2, data1, blend3MaskVector);

                    Vector256<byte> vectorB1Blend2 = Avx2.BlendVariable(vectorB1Blend1, data3, blend3MaskVector);
                    Vector256<byte> vectorB2Blend2 = Avx2.BlendVariable(vectorB2Blend1, data3, blend1MaskVector);
                    Vector256<byte> vectorB3Blend2 = Avx2.BlendVariable(vectorB3Blend1, data3, blend2MaskVector);

                    // Sum of absolute differences against zero adds up each group of 8 bytes into a 64-bit lane.
                    Vector256<long> sumB1 = Avx2.SumAbsoluteDifferences(vectorB1Blend2, Vector256<byte>.Zero).AsInt64();
                    Vector256<long> sumB2 = Avx2.SumAbsoluteDifferences(vectorB2Blend2, Vector256<byte>.Zero).AsInt64();
                    Vector256<long> sumB3 = Avx2.SumAbsoluteDifferences(vectorB3Blend2, Vector256<byte>.Zero).AsInt64();

                    b1SumVector = Avx2.Add(b1SumVector, sumB1);
                    b2SumVector = Avx2.Add(b2SumVector, sumB2);
                    b3SumVector = Avx2.Add(b3SumVector, sumB3);
                }
            }

            b1Sum = b1SumVector[0] + b1SumVector[1] + b1SumVector[2] + b1SumVector[3];
            b2Sum = b2SumVector[0] + b2SumVector[1] + b2SumVector[2] + b2SumVector[3];
            b3Sum = b3SumVector[0] + b3SumVector[1] + b3SumVector[2] + b3SumVector[3];

            // The elements that did not fill a whole chunk are at the end of the span.
            for (int i = 0; i < missingElements; i++)
            {
                Generic3ByteData d = data[^(i + 1)];

                b1Sum += d.B1;
                b2Sum += d.B2;
                b3Sum += d.B3;
            }
        }
        else
        {
            foreach (Generic3ByteData d in data)
            {
                b1Sum += d.B1;
                b2Sum += d.B2;
                b3Sum += d.B3;
            }
        }

        // 255L forces 64-bit arithmetic: as a 32-bit product this wraps for more than 8,421,504 elements.
        return new Generic4LongData(b1Sum, b2Sum, b3Sum, data.Length * 255L);
    }

    /// <summary>
    /// Sums the four bytes of every element separately.
    /// </summary>
    /// <remarks>
    /// When AVX2 is available and at least 8 elements are present, 8 elements (32 bytes) are processed
    /// per iteration; the remaining elements are summed scalar.
    /// </remarks>
    /// <param name="data">The 4-byte elements to sum.</param>
    /// <returns>The four per-byte sums.</returns>
    private static Generic4LongData Sum(ReadOnlySpan<Generic4ByteData> data)
    {
        long b1Sum, b2Sum, b3Sum, b4Sum;

        // Index of the next element that has not been summed yet; shared by the vector and scalar loop.
        int i = 0;

        if (Avx2.IsSupported && (data.Length >= 8))
        {
            // Applied to each 128-bit lane: gathers the bytes of the same channel into one 32-bit group.
            ReadOnlySpan<byte> avx2ShuffleMask =
            [
                // Byte 1
                15, 11, 7, 3,
                // Byte 2
                14, 10, 6, 2,
                // Byte 3
                13, 9, 5, 1,
                // Byte 4
                12, 8, 4, 0
            ];

            // Moves the two 32-bit groups of each channel (one per lane) next to each other,
            // so that every 64-bit lane holds exactly one channel.
            ReadOnlySpan<int> avx2ControlData =
            [
                // Byte 1
                7, 3,
                // Byte 2
                6, 2,
                // Byte 3
                5, 1,
                // Byte 4
                4, 0
            ];

            Vector256<int> controlVector = Vector256.LoadUnsafe(ref MemoryMarshal.GetReference(avx2ControlData));
            Vector256<long> rgbaSum64 = Vector256<long>.Zero;

            ReadOnlySpan<byte> dataBytes = MemoryMarshal.AsBytes(data);
            fixed (byte* bytePtr = dataBytes)
            fixed (byte* maskPtr = avx2ShuffleMask)
            {
                Vector256<byte> avx2ShuffleMaskVector = Avx2.BroadcastVector128ToVector256(maskPtr);

                for (int j = 0; j < (data.Length / 8); j++, i += 8)
                {
                    Vector256<byte> chunk = Vector256.Load(bytePtr + (i * 4));
                    Vector256<byte> deinterleaved = Avx2.Shuffle(chunk, avx2ShuffleMaskVector);
                    Vector256<int> deinterleaved2 = Avx2.PermuteVar8x32(deinterleaved.AsInt32(), controlVector);

                    // Sum of absolute differences against zero adds up each group of 8 bytes into a 64-bit lane.
                    Vector256<long> sum = Avx2.SumAbsoluteDifferences(deinterleaved2.AsByte(), Vector256<byte>.Zero).AsInt64();
                    rgbaSum64 = Avx2.Add(rgbaSum64, sum);
                }
            }

            Vector128<long> b1B2Sum = rgbaSum64.GetLower();
            Vector128<long> b3B4Sum = rgbaSum64.GetUpper();
            b1Sum = b1B2Sum.GetLower()[0];
            b2Sum = b1B2Sum.GetUpper()[0];
            b3Sum = b3B4Sum.GetLower()[0];
            b4Sum = b3B4Sum.GetUpper()[0];
        }
        else
        {
            b1Sum = b2Sum = b3Sum = b4Sum = 0;
        }

        // Scalar tail (or the whole span if the vector path was not taken).
        for (; i < data.Length; i++)
        {
            b1Sum += data[i].B1;
            b2Sum += data[i].B2;
            b3Sum += data[i].B3;
            b4Sum += data[i].B4;
        }

        return new Generic4LongData(b1Sum, b2Sum, b3Sum, b4Sum);
    }

    #endregion
}