Compare commits

...

4 Commits

Author SHA1 Message Date
Eugene b6daf5c55b fix: use PRI64 instead of %i for some log (#269) 2024-06-01 14:01:58 +08:00
leejet be6cd1a4bf sync: update ggml 2024-06-01 13:44:09 +08:00
Justine Tunney e1384defca perf: make crc32 100x faster on x86-64 (#278) 2024-06-01 12:58:30 +08:00
    This change makes checkpoints load significantly faster by optimizing
    pkzip's cyclic redundancy check. This code was developed by Intel,
    Google, and Mozilla. See Chromium's zlib codebase for further details.
Phu Tran 814280343c chore: update artifact actions (#267) 2024-06-01 12:33:13 +08:00
13 changed files with 317 additions and 57 deletions

View File

@@ -4,17 +4,36 @@ on:
workflow_dispatch: # allows manual triggering
inputs:
create_release:
description: 'Create new release'
description: "Create new release"
required: true
type: boolean
push:
branches:
- master
- ci
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
paths:
[
".github/workflows/**",
"**/CMakeLists.txt",
"**/Makefile",
"**/*.h",
"**/*.hpp",
"**/*.c",
"**/*.cpp",
"**/*.cu",
]
pull_request:
types: [opened, synchronize, reopened]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
paths:
[
"**/CMakeLists.txt",
"**/Makefile",
"**/*.h",
"**/*.hpp",
"**/*.c",
"**/*.cpp",
"**/*.cu",
]
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
@@ -67,12 +86,12 @@ jobs:
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
macOS-latest-cmake:
runs-on: macos-latest
@@ -120,29 +139,29 @@ jobs:
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
windows-latest-cmake:
runs-on: windows-latest
strategy:
matrix:
include:
- build: 'noavx'
defines: '-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DSD_BUILD_SHARED_LIBS=ON'
- build: 'avx2'
defines: '-DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON'
- build: 'avx'
defines: '-DGGML_AVX2=OFF -DSD_BUILD_SHARED_LIBS=ON'
- build: 'avx512'
defines: '-DGGML_AVX512=ON -DSD_BUILD_SHARED_LIBS=ON'
- build: 'cuda12'
defines: '-DSD_CUBLAS=ON -DSD_BUILD_SHARED_LIBS=ON'
- build: 'rocm5.5'
- build: "noavx"
defines: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DSD_BUILD_SHARED_LIBS=ON"
- build: "avx2"
defines: "-DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
- build: "avx"
defines: "-DGGML_AVX2=OFF -DSD_BUILD_SHARED_LIBS=ON"
- build: "avx512"
defines: "-DGGML_AVX512=ON -DSD_BUILD_SHARED_LIBS=ON"
- build: "cuda12"
defines: "-DSD_CUBLAS=ON -DSD_BUILD_SHARED_LIBS=ON"
- build: "rocm5.5"
defines: '-G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx1100;gfx1102;gfx1030" -DSD_BUILD_SHARED_LIBS=ON'
steps:
- name: Clone
@@ -156,8 +175,8 @@ jobs:
if: ${{ matrix.build == 'cuda12' }}
uses: Jimver/cuda-toolkit@v0.2.11
with:
cuda: '12.2.0'
method: 'network'
cuda: "12.2.0"
method: "network"
sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
- name: Install rocm-toolkit
@@ -165,7 +184,7 @@ jobs:
if: ${{ matrix.build == 'rocm5.5' }}
uses: Cyberhan123/rocm-toolkit@v0.1.0
with:
rocm: '5.5.0'
rocm: "5.5.0"
- name: Install Ninja
id: install-ninja
@@ -231,15 +250,17 @@ jobs:
- name: Upload Cuda runtime
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' && matrix.build == 'cuda12' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: sd-cudart-sd-bin-win-cu12-x64.zip
path: |
cudart-sd-bin-win-cu12-x64.zip
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
@@ -256,7 +277,11 @@ jobs:
steps:
- name: Download artifacts
id: download-artifact
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
with:
path: ./artifact
pattern: sd-*
merge-multiple: true
- name: Get commit hash
id: commit

View File

@@ -34,7 +34,7 @@ option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
if(SD_CUBLAS)
message("Use CUBLAS as backend stable-diffusion")
set(GGML_CUBLAS ON)
set(GGML_CUDA ON)
add_definitions(-DSD_USE_CUBLAS)
endif()

View File

@@ -679,8 +679,8 @@ public:
class_embedding = ggml_repeat(ctx, class_embed_weight, class_embedding); // [N, embed_dim]
class_embedding = ggml_reshape_4d(ctx, class_embedding, 1, embed_dim, 1, N); // [N, 1, embed_dim, 1]
struct ggml_tensor* x = ggml_concat(ctx, class_embedding, patch_embedding); // [N, num_positions, embed_dim, 1]
x = ggml_reshape_3d(ctx, x, embed_dim, num_positions, N); // [N, num_positions, embed_dim]
struct ggml_tensor* x = ggml_concat(ctx, class_embedding, patch_embedding, 2); // [N, num_positions, embed_dim, 1]
x = ggml_reshape_3d(ctx, x, embed_dim, num_positions, N); // [N, num_positions, embed_dim]
x = ggml_add(ctx, x, position_embed_weight);
return x; // [N, num_positions, embed_dim]
}
@@ -1036,7 +1036,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
hidden_states2->ne[3]);
hidden_states2 = ggml_cont(ctx, ggml_permute(ctx, hidden_states2, 2, 0, 1, 3));
hidden_states = ggml_concat(ctx, hidden_states, hidden_states2); // [N, n_token, hidden_size + hidden_size2]
hidden_states = ggml_concat(ctx, hidden_states, hidden_states2, 2); // [N, n_token, hidden_size + hidden_size2]
hidden_states = ggml_cont(ctx, ggml_permute(ctx, hidden_states, 1, 2, 0, 3));
}
@@ -1069,7 +1069,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
auto token_embed_weight = text_model.get_token_embed_weight();
token_embed_weight = ggml_reshape_3d(compute_ctx, token_embed_weight, token_embed_weight->ne[0], 1, token_embed_weight->ne[1]);
// concatenate custom embeddings
embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings);
embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 2);
embeddings = ggml_reshape_2d(compute_ctx, embeddings, embeddings->ne[0], embeddings->ne[2]);
}
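The clip.hpp hunks above (and the matching ones in esrgan.hpp, pmid.hpp, and unet.hpp below) track an upstream ggml API change: ggml_concat now takes the concatenation dimension as an explicit argument instead of always joining along dim 2. A minimal sketch of the new call against the updated ggml headers; tensor sizes are illustrative:

#include "ggml.h"

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,  // small scratch pool, plenty for this sketch
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context* ctx = ggml_init(params);

    // two [8, 4, 3] tensors; every dimension except the concat dim must match
    struct ggml_tensor* a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 8, 4, 3);
    struct ggml_tensor* b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 8, 4, 3);

    // old API: ggml_concat(ctx, a, b) always concatenated along dim 2
    // new API: the dim is explicit; the result here is [8, 4, 6]
    struct ggml_tensor* c = ggml_concat(ctx, a, b, 2);

    struct ggml_cgraph* gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, c);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/1);

    ggml_free(ctx);
    return 0;
}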

View File

@@ -42,13 +42,13 @@ public:
auto conv5 = std::dynamic_pointer_cast<Conv2d>(blocks["conv5"]);
auto x1 = lrelu(ctx, conv1->forward(ctx, x));
auto x_cat = ggml_concat(ctx, x, x1);
auto x_cat = ggml_concat(ctx, x, x1, 2);
auto x2 = lrelu(ctx, conv2->forward(ctx, x_cat));
x_cat = ggml_concat(ctx, x_cat, x2);
x_cat = ggml_concat(ctx, x_cat, x2, 2);
auto x3 = lrelu(ctx, conv3->forward(ctx, x_cat));
x_cat = ggml_concat(ctx, x_cat, x3);
x_cat = ggml_concat(ctx, x_cat, x3, 2);
auto x4 = lrelu(ctx, conv4->forward(ctx, x_cat));
x_cat = ggml_concat(ctx, x_cat, x4);
x_cat = ggml_concat(ctx, x_cat, x4, 2);
auto x5 = conv5->forward(ctx, x_cat);
x5 = ggml_add(ctx, ggml_scale(ctx, x5, 0.2f), x);

2
ggml

@@ -1 +1 @@
Subproject commit 57869ad3b7b1f49ae18e3238b0d69a9467a8f068
Subproject commit 2aae01fd9b8f9399f343cf18f46f38996ef52e2c

View File

@@ -917,7 +917,7 @@ public:
return NULL;
}
// it's performing a compute, check if backend isn't cpu
if (!ggml_backend_is_cpu(backend) && tensor->backend == GGML_BACKEND_TYPE_CPU) {
if (!ggml_backend_is_cpu(backend) && (tensor->buffer == NULL || ggml_backend_buffer_is_host(tensor->buffer))) {
// pass input tensors to gpu memory
auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor);

View File

@@ -571,10 +571,9 @@ void convert_tensor(void* src,
if (dst_type == GGML_TYPE_F16) {
ggml_fp32_to_fp16_row((float*)src, (ggml_fp16_t*)dst, n);
} else {
int64_t hist[16];
std::vector<float> imatrix(n_per_row, 1.0f); // dummy importance matrix
const float* im = imatrix.data();
ggml_quantize_chunk(dst_type, (float*)src, dst, 0, nrows, n_per_row, hist, im);
ggml_quantize_chunk(dst_type, (float*)src, dst, 0, nrows, n_per_row, im);
}
} else if (dst_type == GGML_TYPE_F32) {
if (src_type == GGML_TYPE_F16) {
@@ -602,10 +601,9 @@ void convert_tensor(void* src,
if (dst_type == GGML_TYPE_F16) {
ggml_fp32_to_fp16_row((float*)src_data_f32, (ggml_fp16_t*)dst, n);
} else {
int64_t hist[16];
std::vector<float> imatrix(n_per_row, 1.0f); // dummy importance matrix
const float* im = imatrix.data();
ggml_quantize_chunk(dst_type, (float*)src_data_f32, dst, 0, nrows, n_per_row, hist, im);
ggml_quantize_chunk(dst_type, (float*)src_data_f32, dst, 0, nrows, n_per_row, im);
}
}
}
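Likewise, ggml_quantize_chunk no longer takes the int64 histogram array; it now accepts an optional importance matrix, which convert_tensor fills with a dummy all-ones vector. A hedged sketch of the new call; the helper name and the Q8_0 choice are illustrative, not project code:

#include "ggml.h"
#include <vector>

// Quantize nrows rows of n_per_row floats into dst with the post-change API.
// n_per_row should be a multiple of the type's block size (32 for Q8_0) and
// dst must be large enough for the quantized rows; returns the bytes written.
size_t quantize_rows_q8_0(const float* src, void* dst, int64_t nrows, int64_t n_per_row) {
    std::vector<float> imatrix(n_per_row, 1.0f);  // dummy importance matrix, as in convert_tensor
    return ggml_quantize_chunk(GGML_TYPE_Q8_0, src, dst,
                               /*start=*/0, nrows, n_per_row,
                               imatrix.data());   // the old hist argument is gone
}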

View File

@@ -64,7 +64,7 @@ public:
auto prompt_embeds0 = ggml_cont(ctx, ggml_permute(ctx, prompt_embeds, 2, 0, 1, 3));
auto id_embeds0 = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 2, 0, 1, 3));
// concat is along dim 2
auto stacked_id_embeds = ggml_concat(ctx, prompt_embeds0, id_embeds0);
auto stacked_id_embeds = ggml_concat(ctx, prompt_embeds0, id_embeds0, 2);
stacked_id_embeds = ggml_cont(ctx, ggml_permute(ctx, stacked_id_embeds, 1, 2, 0, 3));
// stacked_id_embeds = mlp1.forward(ctx, stacked_id_embeds);
@@ -102,12 +102,12 @@ public:
stacked_id_embeds = ggml_cont(ctx, ggml_permute(ctx, stacked_id_embeds, 0, 2, 1, 3));
if (left && right) {
stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds);
stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right);
stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds, 2);
stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right, 2);
} else if (left) {
stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds);
stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds, 2);
} else if (right) {
stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right);
stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right, 2);
}
stacked_id_embeds = ggml_cont(ctx, ggml_permute(ctx, stacked_id_embeds, 0, 2, 1, 3));
class_tokens_mask = ggml_cont(ctx, ggml_transpose(ctx, class_tokens_mask));
@@ -146,7 +146,7 @@ struct PhotoMakerIDEncoderBlock : public CLIPVisionModelProjection {
id_embeds = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 2, 0, 1, 3));
id_embeds_2 = ggml_cont(ctx, ggml_permute(ctx, id_embeds_2, 2, 0, 1, 3));
id_embeds = ggml_concat(ctx, id_embeds, id_embeds_2); // [batch_size, seq_length, 1, 2048] check whether concat at dim 2 is right
id_embeds = ggml_concat(ctx, id_embeds, id_embeds_2, 2); // [batch_size, seq_length, 1, 2048] check whether concat at dim 2 is right
id_embeds = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 1, 2, 0, 3));
struct ggml_tensor* updated_prompt_embeds = fuse_module->forward(ctx,

View File

@@ -1717,7 +1717,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
for (int b = 0; b < batch_count; b++) {
int64_t sampling_start = ggml_time_ms();
int64_t cur_seed = seed + b;
LOG_INFO("generating image: %i/%i - seed %i", b + 1, batch_count, cur_seed);
LOG_INFO("generating image: %i/%i - seed %" PRId64, b + 1, batch_count, cur_seed);
sd_ctx->sd->rng->manual_seed(cur_seed);
struct ggml_tensor* x_t = NULL;
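For context on this fix: %i expects an int, so handing it the 64-bit cur_seed is undefined behavior and typically prints a truncated seed. The <inttypes.h> / <cinttypes> macros expand to the correct conversion specifier for the platform. A minimal sketch, not project code:

#include <cinttypes>  // PRId64 (also pulls in int64_t)
#include <cstdio>

int main() {
    int64_t cur_seed = 4294967296LL;                 // 2^32, does not fit in a 32-bit int
    // wrong: std::printf("seed %i\n", cur_seed);    // undefined behavior: %i reads only 32 bits
    std::printf("seed %" PRId64 "\n", cur_seed);     // portable 64-bit format, as in the LOG_INFO above
    return 0;
}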

View File

@@ -60,12 +60,11 @@ enum sd_type_t {
SD_TYPE_Q4_0 = 2,
SD_TYPE_Q4_1 = 3,
// SD_TYPE_Q4_2 = 4, support has been removed
// SD_TYPE_Q4_3 (5) support has been removed
SD_TYPE_Q5_0 = 6,
SD_TYPE_Q5_1 = 7,
SD_TYPE_Q8_0 = 8,
SD_TYPE_Q8_1 = 9,
// k-quantizations
// SD_TYPE_Q4_3 = 5, support has been removed
SD_TYPE_Q5_0 = 6,
SD_TYPE_Q5_1 = 7,
SD_TYPE_Q8_0 = 8,
SD_TYPE_Q8_1 = 9,
SD_TYPE_Q2_K = 10,
SD_TYPE_Q3_K = 11,
SD_TYPE_Q4_K = 12,
@@ -80,9 +79,13 @@ enum sd_type_t {
SD_TYPE_IQ3_S = 21,
SD_TYPE_IQ2_S = 22,
SD_TYPE_IQ4_XS = 23,
SD_TYPE_I8,
SD_TYPE_I16,
SD_TYPE_I32,
SD_TYPE_I8 = 24,
SD_TYPE_I16 = 25,
SD_TYPE_I32 = 26,
SD_TYPE_I64 = 27,
SD_TYPE_F64 = 28,
SD_TYPE_IQ1_M = 29,
SD_TYPE_BF16 = 30,
SD_TYPE_COUNT,
};
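The explicit enumerator values matter because sd_type_t is meant to stay numerically identical to ggml's ggml_type; without them, the removed Q4_2/Q4_3 slots would shift every later value, and the new I64/F64/IQ1_M/BF16 entries would not line up. A hedged sketch of a compile-time guard for that invariant (illustrative only, not part of the project):

#include "ggml.h"
#include "stable-diffusion.h"

// If these ever fire, the two enums have drifted apart and weight-type
// round-tripping between the public API and ggml would silently break.
static_assert((int)SD_TYPE_Q8_0 == (int)GGML_TYPE_Q8_0, "sd_type_t must mirror ggml_type");
static_assert((int)SD_TYPE_I64  == (int)GGML_TYPE_I64,  "sd_type_t must mirror ggml_type");
static_assert((int)SD_TYPE_BF16 == (int)GGML_TYPE_BF16, "sd_type_t must mirror ggml_type");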

2
thirdparty/.clang-format vendored Normal file
View File

@@ -0,0 +1,2 @@
DisableFormat: true
SortIncludes: Never

232
thirdparty/zip.c vendored
View File

@@ -36,6 +36,7 @@
#include <unistd.h>
#endif
#define USE_EXTERNAL_MZCRC
#include "miniz.h"
#include "zip.h"
@@ -1834,3 +1835,234 @@ int zip_extract(const char *zipname, const char *dir,
return zip_archive_extract(&zip_archive, dir, on_extract, arg);
}
#if defined(__SSE4_2__) || defined(__AVX512F__)
#include <immintrin.h>
#endif
// Phil Katz 32-Bit Cyclic Redundancy Check Uber Alles
// Goes 73 GiB/s on an AMD Ryzen Threadripper PRO 7995WX
// "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
// V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
mz_ulong mz_crc32(mz_ulong init, const uint8_t *buf, size_t len) {
uint32_t crc = ~init;
#if defined(__AVX512F__) && defined(__VPCLMULQDQ__) && defined(__PCLMUL__)
if (len >= 256) {
_Alignas(__m512) static const uint64_t k1k2[] = {
0x011542778a, 0x01322d1430, 0x011542778a, 0x01322d1430,
0x011542778a, 0x01322d1430, 0x011542778a, 0x01322d1430,
};
_Alignas(__m512) static const uint64_t k3k4[] = {
0x0154442bd4, 0x01c6e41596, 0x0154442bd4, 0x01c6e41596,
0x0154442bd4, 0x01c6e41596, 0x0154442bd4, 0x01c6e41596,
};
_Alignas(__m512) static const uint64_t k5k6[] = {
0x01751997d0,
0x00ccaa009e,
};
_Alignas(__m512) static const uint64_t k7k8[] = {
0x0163cd6124,
0x0000000000,
};
_Alignas(__m512) static const uint64_t poly[] = {
0x01db710641,
0x01f7011641,
};
__m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
__m128i a0, a1, a2, a3;
x1 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
x2 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
x3 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
x4 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
x0 = _mm512_load_si512((__m512i *)k1k2);
buf += 256;
len -= 256;
while (len >= 256) {
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11);
y5 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
y6 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
y7 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
y8 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
x1 = _mm512_xor_si512(x1, x5);
x2 = _mm512_xor_si512(x2, x6);
x3 = _mm512_xor_si512(x3, x7);
x4 = _mm512_xor_si512(x4, x8);
x1 = _mm512_xor_si512(x1, y5);
x2 = _mm512_xor_si512(x2, y6);
x3 = _mm512_xor_si512(x3, y7);
x4 = _mm512_xor_si512(x4, y8);
buf += 256;
len -= 256;
}
x0 = _mm512_load_si512((__m512i *)k3k4);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x2);
x1 = _mm512_xor_si512(x1, x5);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x3);
x1 = _mm512_xor_si512(x1, x5);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x4);
x1 = _mm512_xor_si512(x1, x5);
while (len >= 64) {
x2 = _mm512_loadu_si512((__m512i *)buf);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x2);
x1 = _mm512_xor_si512(x1, x5);
buf += 64;
len -= 64;
}
a0 = _mm_load_si128((__m128i *)k5k6);
a1 = _mm512_extracti32x4_epi32(x1, 0);
a2 = _mm512_extracti32x4_epi32(x1, 1);
a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
a1 = _mm_xor_si128(a1, a3);
a1 = _mm_xor_si128(a1, a2);
a2 = _mm512_extracti32x4_epi32(x1, 2);
a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
a1 = _mm_xor_si128(a1, a3);
a1 = _mm_xor_si128(a1, a2);
a2 = _mm512_extracti32x4_epi32(x1, 3);
a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
a1 = _mm_xor_si128(a1, a3);
a1 = _mm_xor_si128(a1, a2);
a2 = _mm_clmulepi64_si128(a1, a0, 0x10);
a3 = _mm_setr_epi32(~0, 0, ~0, 0);
a1 = _mm_srli_si128(a1, 8);
a1 = _mm_xor_si128(a1, a2);
a0 = _mm_loadl_epi64((__m128i *)k7k8);
a2 = _mm_srli_si128(a1, 4);
a1 = _mm_and_si128(a1, a3);
a1 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_xor_si128(a1, a2);
a0 = _mm_load_si128((__m128i *)poly);
a2 = _mm_and_si128(a1, a3);
a2 = _mm_clmulepi64_si128(a2, a0, 0x10);
a2 = _mm_and_si128(a2, a3);
a2 = _mm_clmulepi64_si128(a2, a0, 0x00);
a1 = _mm_xor_si128(a1, a2);
crc = _mm_extract_epi32(a1, 1);
}
#endif
#if defined(__SSE4_2__) && defined(__PCLMUL__)
if (len >= 64) {
_Alignas(__m128) static const uint64_t k1k2[] = {
0x0154442bd4,
0x01c6e41596,
};
_Alignas(__m128) static const uint64_t k3k4[] = {
0x01751997d0,
0x00ccaa009e,
};
_Alignas(__m128) static const uint64_t k5k0[] = {
0x0163cd6124,
0x0000000000,
};
_Alignas(__m128) static const uint64_t poly[] = {
0x01db710641,
0x01f7011641,
};
__m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));
x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));
x0 = _mm_load_si128((__m128i *)k1k2);
buf += 64;
len -= 64;
while (len >= 64) {
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
x8 = _mm_clmulepi64_si128(x4, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
x4 = _mm_clmulepi64_si128(x4, x0, 0x11);
y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));
x1 = _mm_xor_si128(x1, x5);
x2 = _mm_xor_si128(x2, x6);
x3 = _mm_xor_si128(x3, x7);
x4 = _mm_xor_si128(x4, x8);
x1 = _mm_xor_si128(x1, y5);
x2 = _mm_xor_si128(x2, y6);
x3 = _mm_xor_si128(x3, y7);
x4 = _mm_xor_si128(x4, y8);
buf += 64;
len -= 64;
}
x0 = _mm_load_si128((__m128i *)k3k4);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x2);
x1 = _mm_xor_si128(x1, x5);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x3);
x1 = _mm_xor_si128(x1, x5);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x4);
x1 = _mm_xor_si128(x1, x5);
while (len >= 16) {
x2 = _mm_loadu_si128((__m128i *)buf);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x2);
x1 = _mm_xor_si128(x1, x5);
buf += 16;
len -= 16;
}
x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
x3 = _mm_setr_epi32(~0, 0, ~0, 0);
x1 = _mm_srli_si128(x1, 8);
x1 = _mm_xor_si128(x1, x2);
x0 = _mm_loadl_epi64((__m128i *)k5k0);
x2 = _mm_srli_si128(x1, 4);
x1 = _mm_and_si128(x1, x3);
x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_xor_si128(x1, x2);
x0 = _mm_load_si128((__m128i *)poly);
x2 = _mm_and_si128(x1, x3);
x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
x2 = _mm_and_si128(x2, x3);
x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
x1 = _mm_xor_si128(x1, x2);
crc = _mm_extract_epi32(x1, 1);
}
#endif
static uint32_t tab[256];
if (!tab[255]) {
// generates table for byte-wise crc calculation on the polynomial
// x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1
uint32_t polynomial = 0xedb88320; // bits are reversed
for (int d = 0; d < 256; ++d) {
uint32_t r = d;
for (int i = 0; i < 8; ++i)
r = r >> 1 ^ (r & 1 ? polynomial : 0);
tab[d] = r;
}
}
for (size_t i = 0; i < len; ++i)
crc = crc >> 8 ^ tab[(crc & 255) ^ buf[i]];
return ~crc & 0xffffffff;
}
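As the commit message says, this reimplements pkzip's CRC-32 with carry-less multiplication folding (256 bytes per AVX-512 iteration, 64 bytes per SSE4.2+PCLMUL iteration), falling back to the table-driven loop for short inputs and tails. A quick hedged self-check of the exported routine: the mz_ulong typedef and prototype below mirror what the real build gets from miniz.h, 0xCBF43926 is the standard CRC-32 check value for the ASCII string "123456789", and the zero seed matches the ~init / ~crc convention visible above:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef unsigned long mz_ulong;  // as in miniz.h
mz_ulong mz_crc32(mz_ulong init, const uint8_t *buf, size_t len);

int main(void) {
    const char *msg = "123456789";
    mz_ulong crc = mz_crc32(0, (const uint8_t *)msg, 9);
    assert(crc == 0xCBF43926);   // standard CRC-32 check value

    // the ~init / ~crc convention also allows streaming:
    // feed the data in chunks and pass the running value back in
    mz_ulong part = mz_crc32(0, (const uint8_t *)msg, 4);
    part = mz_crc32(part, (const uint8_t *)msg + 4, 5);
    assert(part == crc);

    printf("crc32 ok: 0x%08lX\n", (unsigned long)crc);
    return 0;
}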

View File

@@ -396,7 +396,7 @@ public:
if (c_concat->ne[3] != x->ne[3]) {
c_concat = ggml_repeat(ctx, c_concat, x);
}
x = ggml_concat(ctx, x, c_concat);
x = ggml_concat(ctx, x, c_concat, 2);
}
if (y != NULL) {
@@ -491,7 +491,7 @@ public:
control_offset--;
}
h = ggml_concat(ctx, h, h_skip);
h = ggml_concat(ctx, h, h_skip, 2);
std::string name = "output_blocks." + std::to_string(output_block_idx) + ".0";