2026-06-24 23:26:43 +00:00
13 changed files with 57 additions and 317 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -4,36 +4,17 @@ on:
  workflow_dispatch: # allows manual triggering
    inputs:
      create_release:
-        description: "Create new release"
+        description: 'Create new release'
        required: true
        type: boolean
  push:
    branches:
      - master
      - ci
-    paths:
-      [
-        ".github/workflows/**",
-        "**/CMakeLists.txt",
-        "**/Makefile",
-        "**/*.h",
-        "**/*.hpp",
-        "**/*.c",
-        "**/*.cpp",
-        "**/*.cu",
-      ]
+    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
  pull_request:
    types: [opened, synchronize, reopened]
-    paths:
-      [
-        "**/CMakeLists.txt",
-        "**/Makefile",
-        "**/*.h",
-        "**/*.hpp",
-        "**/*.c",
-        "**/*.cpp",
-        "**/*.cu",
-      ]
+    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']

 env:
  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
@ -86,12 +67,12 @@ jobs:

      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v3
        with:
-          name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
          path: |
            sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip

+
  macOS-latest-cmake:
    runs-on: macos-latest

@ -139,29 +120,29 @@ jobs:

      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v3
        with:
-          name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
          path: |
            sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip

+
  windows-latest-cmake:
    runs-on: windows-latest

    strategy:
      matrix:
        include:
-          - build: "noavx"
-            defines: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DSD_BUILD_SHARED_LIBS=ON"
-          - build: "avx2"
-            defines: "-DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
-          - build: "avx"
-            defines: "-DGGML_AVX2=OFF -DSD_BUILD_SHARED_LIBS=ON"
-          - build: "avx512"
-            defines: "-DGGML_AVX512=ON -DSD_BUILD_SHARED_LIBS=ON"
-          - build: "cuda12"
-            defines: "-DSD_CUBLAS=ON -DSD_BUILD_SHARED_LIBS=ON"
-          - build: "rocm5.5"
+          - build: 'noavx'
+            defines: '-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DSD_BUILD_SHARED_LIBS=ON'
+          - build: 'avx2'
+            defines: '-DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON'
+          - build: 'avx'
+            defines: '-DGGML_AVX2=OFF -DSD_BUILD_SHARED_LIBS=ON'
+          - build: 'avx512'
+            defines: '-DGGML_AVX512=ON -DSD_BUILD_SHARED_LIBS=ON'
+          - build: 'cuda12'
+            defines: '-DSD_CUBLAS=ON -DSD_BUILD_SHARED_LIBS=ON'
+          - build: 'rocm5.5'
            defines: '-G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx1100;gfx1102;gfx1030" -DSD_BUILD_SHARED_LIBS=ON'
    steps:
      - name: Clone
@ -175,8 +156,8 @@ jobs:
        if: ${{ matrix.build == 'cuda12' }}
        uses: Jimver/cuda-toolkit@v0.2.11
        with:
-          cuda: "12.2.0"
-          method: "network"
+          cuda: '12.2.0'
+          method: 'network'
          sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'

      - name: Install rocm-toolkit
@ -184,7 +165,7 @@ jobs:
        if: ${{ matrix.build == 'rocm5.5' }}
        uses: Cyberhan123/rocm-toolkit@v0.1.0
        with:
-          rocm: "5.5.0"
+          rocm: '5.5.0'

      - name: Install Ninja
        id: install-ninja
@ -250,17 +231,15 @@ jobs:

      - name: Upload Cuda runtime
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' && matrix.build == 'cuda12' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v3
        with:
-          name: sd-cudart-sd-bin-win-cu12-x64.zip
          path: |
            cudart-sd-bin-win-cu12-x64.zip

      - name: Upload artifacts
        if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v3
        with:
-          name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
          path: |
            sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip

@ -277,11 +256,7 @@ jobs:
    steps:
      - name: Download artifacts
        id: download-artifact
-        uses: actions/download-artifact@v4
-        with:
-          path: ./artifact
-          pattern: sd-*
-          merge-multiple: true
+        uses: actions/download-artifact@v3

      - name: Get commit hash
        id: commit
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -34,7 +34,7 @@ option(SD_BUILD_SHARED_LIBS          "sd: build shared libs" OFF)

 if(SD_CUBLAS)
 	message("Use CUBLAS as backend stable-diffusion")
-    set(GGML_CUDA ON)
+    set(GGML_CUBLAS ON)
    add_definitions(-DSD_USE_CUBLAS)
 endif()

--- a/clip.hpp
+++ b/clip.hpp
@ -679,8 +679,8 @@ public:
        class_embedding                     = ggml_repeat(ctx, class_embed_weight, class_embedding);      // [N, embed_dim]
        class_embedding                     = ggml_reshape_4d(ctx, class_embedding, 1, embed_dim, 1, N);  // [N, 1, embed_dim, 1]

-        struct ggml_tensor* x = ggml_concat(ctx, class_embedding, patch_embedding, 2);  // [N, num_positions, embed_dim, 1]
-        x                     = ggml_reshape_3d(ctx, x, embed_dim, num_positions, N);   // [N, num_positions, embed_dim]
+        struct ggml_tensor* x = ggml_concat(ctx, class_embedding, patch_embedding);    // [N, num_positions, embed_dim, 1]
+        x                     = ggml_reshape_3d(ctx, x, embed_dim, num_positions, N);  // [N, num_positions, embed_dim]
        x                     = ggml_add(ctx, x, position_embed_weight);
        return x;  // [N, num_positions, embed_dim]
    }
@ -1036,7 +1036,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
                                             hidden_states2->ne[3]);
            hidden_states2 = ggml_cont(ctx, ggml_permute(ctx, hidden_states2, 2, 0, 1, 3));

-            hidden_states = ggml_concat(ctx, hidden_states, hidden_states2, 2);  // [N, n_token, hidden_size + hidden_size2]
+            hidden_states = ggml_concat(ctx, hidden_states, hidden_states2);  // [N, n_token, hidden_size + hidden_size2]

            hidden_states = ggml_cont(ctx, ggml_permute(ctx, hidden_states, 1, 2, 0, 3));
        }
@ -1069,7 +1069,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
            auto token_embed_weight = text_model.get_token_embed_weight();
            token_embed_weight      = ggml_reshape_3d(compute_ctx, token_embed_weight, token_embed_weight->ne[0], 1, token_embed_weight->ne[1]);
            // concatenate custom embeddings
-            embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 2);
+            embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings);
            embeddings = ggml_reshape_2d(compute_ctx, embeddings, embeddings->ne[0], embeddings->ne[2]);
        }

--- a/esrgan.hpp
+++ b/esrgan.hpp
@ -42,13 +42,13 @@ public:
        auto conv5 = std::dynamic_pointer_cast<Conv2d>(blocks["conv5"]);

        auto x1    = lrelu(ctx, conv1->forward(ctx, x));
-        auto x_cat = ggml_concat(ctx, x, x1, 2);
+        auto x_cat = ggml_concat(ctx, x, x1);
        auto x2    = lrelu(ctx, conv2->forward(ctx, x_cat));
-        x_cat      = ggml_concat(ctx, x_cat, x2, 2);
+        x_cat      = ggml_concat(ctx, x_cat, x2);
        auto x3    = lrelu(ctx, conv3->forward(ctx, x_cat));
-        x_cat      = ggml_concat(ctx, x_cat, x3, 2);
+        x_cat      = ggml_concat(ctx, x_cat, x3);
        auto x4    = lrelu(ctx, conv4->forward(ctx, x_cat));
-        x_cat      = ggml_concat(ctx, x_cat, x4, 2);
+        x_cat      = ggml_concat(ctx, x_cat, x4);
        auto x5    = conv5->forward(ctx, x_cat);

        x5 = ggml_add(ctx, ggml_scale(ctx, x5, 0.2f), x);
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit 2aae01fd9b8f9399f343cf18f46f38996ef52e2c
+Subproject commit 57869ad3b7b1f49ae18e3238b0d69a9467a8f068
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@ -917,7 +917,7 @@ public:
            return NULL;
        }
        // it's performing a compute, check if backend isn't cpu
-        if (!ggml_backend_is_cpu(backend) && (tensor->buffer == NULL || ggml_backend_buffer_is_host(tensor->buffer))) {
+        if (!ggml_backend_is_cpu(backend) && tensor->backend == GGML_BACKEND_TYPE_CPU) {
            // pass input tensors to gpu memory
            auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor);

--- a/model.cpp
+++ b/model.cpp
@ -571,9 +571,10 @@ void convert_tensor(void* src,
        if (dst_type == GGML_TYPE_F16) {
            ggml_fp32_to_fp16_row((float*)src, (ggml_fp16_t*)dst, n);
        } else {
+            int64_t hist[16];
            std::vector<float> imatrix(n_per_row, 1.0f);  // dummy importance matrix
            const float* im = imatrix.data();
-            ggml_quantize_chunk(dst_type, (float*)src, dst, 0, nrows, n_per_row, im);
+            ggml_quantize_chunk(dst_type, (float*)src, dst, 0, nrows, n_per_row, hist, im);
        }
    } else if (dst_type == GGML_TYPE_F32) {
        if (src_type == GGML_TYPE_F16) {
@ -601,9 +602,10 @@ void convert_tensor(void* src,
        if (dst_type == GGML_TYPE_F16) {
            ggml_fp32_to_fp16_row((float*)src_data_f32, (ggml_fp16_t*)dst, n);
        } else {
+            int64_t hist[16];
            std::vector<float> imatrix(n_per_row, 1.0f);  // dummy importance matrix
            const float* im = imatrix.data();
-            ggml_quantize_chunk(dst_type, (float*)src_data_f32, dst, 0, nrows, n_per_row, im);
+            ggml_quantize_chunk(dst_type, (float*)src_data_f32, dst, 0, nrows, n_per_row, hist, im);
        }
    }
 }
--- a/pmid.hpp
+++ b/pmid.hpp
@ -64,7 +64,7 @@ public:
        auto prompt_embeds0 = ggml_cont(ctx, ggml_permute(ctx, prompt_embeds, 2, 0, 1, 3));
        auto id_embeds0     = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 2, 0, 1, 3));
        // concat is along dim 2
-        auto stacked_id_embeds = ggml_concat(ctx, prompt_embeds0, id_embeds0, 2);
+        auto stacked_id_embeds = ggml_concat(ctx, prompt_embeds0, id_embeds0);
        stacked_id_embeds      = ggml_cont(ctx, ggml_permute(ctx, stacked_id_embeds, 1, 2, 0, 3));

        // stacked_id_embeds = mlp1.forward(ctx, stacked_id_embeds);
@ -102,12 +102,12 @@ public:

        stacked_id_embeds = ggml_cont(ctx, ggml_permute(ctx, stacked_id_embeds, 0, 2, 1, 3));
        if (left && right) {
-            stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds, 2);
-            stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right, 2);
+            stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds);
+            stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right);
        } else if (left) {
-            stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds, 2);
+            stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds);
        } else if (right) {
-            stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right, 2);
+            stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right);
        }
        stacked_id_embeds                         = ggml_cont(ctx, ggml_permute(ctx, stacked_id_embeds, 0, 2, 1, 3));
        class_tokens_mask                         = ggml_cont(ctx, ggml_transpose(ctx, class_tokens_mask));
@ -146,7 +146,7 @@ struct PhotoMakerIDEncoderBlock : public CLIPVisionModelProjection {
        id_embeds   = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 2, 0, 1, 3));
        id_embeds_2 = ggml_cont(ctx, ggml_permute(ctx, id_embeds_2, 2, 0, 1, 3));

-        id_embeds = ggml_concat(ctx, id_embeds, id_embeds_2, 2);  // [batch_size, seq_length, 1, 2048] check whether concat at dim 2 is right
+        id_embeds = ggml_concat(ctx, id_embeds, id_embeds_2);  // [batch_size, seq_length, 1, 2048] check whether concat at dim 2 is right
        id_embeds = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 1, 2, 0, 3));

        struct ggml_tensor* updated_prompt_embeds = fuse_module->forward(ctx,
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@ -1717,7 +1717,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
    for (int b = 0; b < batch_count; b++) {
        int64_t sampling_start = ggml_time_ms();
        int64_t cur_seed       = seed + b;
-        LOG_INFO("generating image: %i/%i - seed %" PRId64, b + 1, batch_count, cur_seed);
+        LOG_INFO("generating image: %i/%i - seed %i", b + 1, batch_count, cur_seed);

        sd_ctx->sd->rng->manual_seed(cur_seed);
        struct ggml_tensor* x_t   = NULL;
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@ -60,11 +60,12 @@ enum sd_type_t {
    SD_TYPE_Q4_0 = 2,
    SD_TYPE_Q4_1 = 3,
    // SD_TYPE_Q4_2 = 4, support has been removed
-    // SD_TYPE_Q4_3 = 5, support has been removed
-    SD_TYPE_Q5_0    = 6,
-    SD_TYPE_Q5_1    = 7,
-    SD_TYPE_Q8_0    = 8,
-    SD_TYPE_Q8_1    = 9,
+    // SD_TYPE_Q4_3 (5) support has been removed
+    SD_TYPE_Q5_0 = 6,
+    SD_TYPE_Q5_1 = 7,
+    SD_TYPE_Q8_0 = 8,
+    SD_TYPE_Q8_1 = 9,
+    // k-quantizations
    SD_TYPE_Q2_K    = 10,
    SD_TYPE_Q3_K    = 11,
    SD_TYPE_Q4_K    = 12,
@ -79,13 +80,9 @@ enum sd_type_t {
    SD_TYPE_IQ3_S   = 21,
    SD_TYPE_IQ2_S   = 22,
    SD_TYPE_IQ4_XS  = 23,
-    SD_TYPE_I8      = 24,
-    SD_TYPE_I16     = 25,
-    SD_TYPE_I32     = 26,
-    SD_TYPE_I64     = 27,
-    SD_TYPE_F64     = 28,
-    SD_TYPE_IQ1_M   = 29,
-    SD_TYPE_BF16    = 30,
+    SD_TYPE_I8,
+    SD_TYPE_I16,
+    SD_TYPE_I32,
    SD_TYPE_COUNT,
 };

--- a/thirdparty/.clang-format
+++ b/thirdparty/.clang-format
@ -1,2 +0,0 @@
-DisableFormat: true
-SortIncludes: Never
--- a/thirdparty/zip.c
+++ b/thirdparty/zip.c
@ -36,7 +36,6 @@
 #include <unistd.h>
 #endif

-#define USE_EXTERNAL_MZCRC
 #include "miniz.h"
 #include "zip.h"

@ -1835,234 +1834,3 @@ int zip_extract(const char *zipname, const char *dir,

  return zip_archive_extract(&zip_archive, dir, on_extract, arg);
 }
-
-#if defined(__SSE4_2__) || defined(__AVX512F__)
-#include <immintrin.h>
-#endif
-
-// Phil Katz 32-Bit Cyclic Redundancy Check Uber Alles
-// Goes 73 GiB/s on an AMD Ryzen Threadripper PRO 7995WX
-// "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
-//  V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
-mz_ulong mz_crc32(mz_ulong init, const uint8_t *buf, size_t len) {
-  uint32_t crc = ~init;
-#if defined(__AVX512F__) && defined(__VPCLMULQDQ__) && defined(__PCLMUL__)
-  if (len >= 256) {
-    _Alignas(__m512) static const uint64_t k1k2[] = {
-        0x011542778a, 0x01322d1430, 0x011542778a, 0x01322d1430,
-        0x011542778a, 0x01322d1430, 0x011542778a, 0x01322d1430,
-    };
-    _Alignas(__m512) static const uint64_t k3k4[] = {
-        0x0154442bd4, 0x01c6e41596, 0x0154442bd4, 0x01c6e41596,
-        0x0154442bd4, 0x01c6e41596, 0x0154442bd4, 0x01c6e41596,
-    };
-    _Alignas(__m512) static const uint64_t k5k6[] = {
-        0x01751997d0,
-        0x00ccaa009e,
-    };
-    _Alignas(__m512) static const uint64_t k7k8[] = {
-        0x0163cd6124,
-        0x0000000000,
-    };
-    _Alignas(__m512) static const uint64_t poly[] = {
-        0x01db710641,
-        0x01f7011641,
-    };
-    __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
-    __m128i a0, a1, a2, a3;
-    x1 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
-    x2 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
-    x3 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
-    x4 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
-    x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
-    x0 = _mm512_load_si512((__m512i *)k1k2);
-    buf += 256;
-    len -= 256;
-    while (len >= 256) {
-      x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
-      x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
-      x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
-      x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00);
-      x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
-      x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
-      x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
-      x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11);
-      y5 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
-      y6 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
-      y7 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
-      y8 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
-      x1 = _mm512_xor_si512(x1, x5);
-      x2 = _mm512_xor_si512(x2, x6);
-      x3 = _mm512_xor_si512(x3, x7);
-      x4 = _mm512_xor_si512(x4, x8);
-      x1 = _mm512_xor_si512(x1, y5);
-      x2 = _mm512_xor_si512(x2, y6);
-      x3 = _mm512_xor_si512(x3, y7);
-      x4 = _mm512_xor_si512(x4, y8);
-      buf += 256;
-      len -= 256;
-    }
-    x0 = _mm512_load_si512((__m512i *)k3k4);
-    x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
-    x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
-    x1 = _mm512_xor_si512(x1, x2);
-    x1 = _mm512_xor_si512(x1, x5);
-    x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
-    x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
-    x1 = _mm512_xor_si512(x1, x3);
-    x1 = _mm512_xor_si512(x1, x5);
-    x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
-    x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
-    x1 = _mm512_xor_si512(x1, x4);
-    x1 = _mm512_xor_si512(x1, x5);
-    while (len >= 64) {
-      x2 = _mm512_loadu_si512((__m512i *)buf);
-      x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
-      x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
-      x1 = _mm512_xor_si512(x1, x2);
-      x1 = _mm512_xor_si512(x1, x5);
-      buf += 64;
-      len -= 64;
-    }
-    a0 = _mm_load_si128((__m128i *)k5k6);
-    a1 = _mm512_extracti32x4_epi32(x1, 0);
-    a2 = _mm512_extracti32x4_epi32(x1, 1);
-    a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
-    a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
-    a1 = _mm_xor_si128(a1, a3);
-    a1 = _mm_xor_si128(a1, a2);
-    a2 = _mm512_extracti32x4_epi32(x1, 2);
-    a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
-    a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
-    a1 = _mm_xor_si128(a1, a3);
-    a1 = _mm_xor_si128(a1, a2);
-    a2 = _mm512_extracti32x4_epi32(x1, 3);
-    a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
-    a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
-    a1 = _mm_xor_si128(a1, a3);
-    a1 = _mm_xor_si128(a1, a2);
-    a2 = _mm_clmulepi64_si128(a1, a0, 0x10);
-    a3 = _mm_setr_epi32(~0, 0, ~0, 0);
-    a1 = _mm_srli_si128(a1, 8);
-    a1 = _mm_xor_si128(a1, a2);
-    a0 = _mm_loadl_epi64((__m128i *)k7k8);
-    a2 = _mm_srli_si128(a1, 4);
-    a1 = _mm_and_si128(a1, a3);
-    a1 = _mm_clmulepi64_si128(a1, a0, 0x00);
-    a1 = _mm_xor_si128(a1, a2);
-    a0 = _mm_load_si128((__m128i *)poly);
-    a2 = _mm_and_si128(a1, a3);
-    a2 = _mm_clmulepi64_si128(a2, a0, 0x10);
-    a2 = _mm_and_si128(a2, a3);
-    a2 = _mm_clmulepi64_si128(a2, a0, 0x00);
-    a1 = _mm_xor_si128(a1, a2);
-    crc = _mm_extract_epi32(a1, 1);
-  }
-#endif
-#if defined(__SSE4_2__) && defined(__PCLMUL__)
-  if (len >= 64) {
-    _Alignas(__m128) static const uint64_t k1k2[] = {
-        0x0154442bd4,
-        0x01c6e41596,
-    };
-    _Alignas(__m128) static const uint64_t k3k4[] = {
-        0x01751997d0,
-        0x00ccaa009e,
-    };
-    _Alignas(__m128) static const uint64_t k5k0[] = {
-        0x0163cd6124,
-        0x0000000000,
-    };
-    _Alignas(__m128) static const uint64_t poly[] = {
-        0x01db710641,
-        0x01f7011641,
-    };
-    __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
-    x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
-    x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
-    x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
-    x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));
-    x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));
-    x0 = _mm_load_si128((__m128i *)k1k2);
-    buf += 64;
-    len -= 64;
-    while (len >= 64) {
-      x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
-      x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
-      x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
-      x8 = _mm_clmulepi64_si128(x4, x0, 0x00);
-      x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
-      x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
-      x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
-      x4 = _mm_clmulepi64_si128(x4, x0, 0x11);
-      y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
-      y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
-      y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
-      y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));
-      x1 = _mm_xor_si128(x1, x5);
-      x2 = _mm_xor_si128(x2, x6);
-      x3 = _mm_xor_si128(x3, x7);
-      x4 = _mm_xor_si128(x4, x8);
-      x1 = _mm_xor_si128(x1, y5);
-      x2 = _mm_xor_si128(x2, y6);
-      x3 = _mm_xor_si128(x3, y7);
-      x4 = _mm_xor_si128(x4, y8);
-      buf += 64;
-      len -= 64;
-    }
-    x0 = _mm_load_si128((__m128i *)k3k4);
-    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
-    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
-    x1 = _mm_xor_si128(x1, x2);
-    x1 = _mm_xor_si128(x1, x5);
-    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
-    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
-    x1 = _mm_xor_si128(x1, x3);
-    x1 = _mm_xor_si128(x1, x5);
-    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
-    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
-    x1 = _mm_xor_si128(x1, x4);
-    x1 = _mm_xor_si128(x1, x5);
-    while (len >= 16) {
-      x2 = _mm_loadu_si128((__m128i *)buf);
-      x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
-      x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
-      x1 = _mm_xor_si128(x1, x2);
-      x1 = _mm_xor_si128(x1, x5);
-      buf += 16;
-      len -= 16;
-    }
-    x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
-    x3 = _mm_setr_epi32(~0, 0, ~0, 0);
-    x1 = _mm_srli_si128(x1, 8);
-    x1 = _mm_xor_si128(x1, x2);
-    x0 = _mm_loadl_epi64((__m128i *)k5k0);
-    x2 = _mm_srli_si128(x1, 4);
-    x1 = _mm_and_si128(x1, x3);
-    x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
-    x1 = _mm_xor_si128(x1, x2);
-    x0 = _mm_load_si128((__m128i *)poly);
-    x2 = _mm_and_si128(x1, x3);
-    x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
-    x2 = _mm_and_si128(x2, x3);
-    x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
-    x1 = _mm_xor_si128(x1, x2);
-    crc = _mm_extract_epi32(x1, 1);
-  }
-#endif
-  static uint32_t tab[256];
-  if (!tab[255]) {
-    // generates table for byte-wise crc calculation on the polynomial
-    // x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1
-    uint32_t polynomial = 0xedb88320;  // bits are reversed
-    for (int d = 0; d < 256; ++d) {
-      uint32_t r = d;
-      for (int i = 0; i < 8; ++i)
-        r = r >> 1 ^ (r & 1 ? polynomial : 0);
-      tab[d] = r;
-    }
-  }
-  for (size_t i = 0; i < len; ++i)
-    crc = crc >> 8 ^ tab[(crc & 255) ^ buf[i]];
-  return ~crc & 0xffffffff;
-}
--- a/unet.hpp
+++ b/unet.hpp
@ -396,7 +396,7 @@ public:
            if (c_concat->ne[3] != x->ne[3]) {
                c_concat = ggml_repeat(ctx, c_concat, x);
            }
-            x = ggml_concat(ctx, x, c_concat, 2);
+            x = ggml_concat(ctx, x, c_concat);
        }

        if (y != NULL) {
@ -491,7 +491,7 @@ public:
                    control_offset--;
                }

-                h = ggml_concat(ctx, h, h_skip, 2);
+                h = ggml_concat(ctx, h, h_skip);

                std::string name = "output_blocks." + std::to_string(output_block_idx) + ".0";