Compare commits

...

4 Commits

Author SHA1 Message Date
Eugene b6daf5c55b fix: use PRI64 instead of %i for some log (#269) 2024-06-01 14:01:58 +08:00
leejet be6cd1a4bf sync: update ggml 2024-06-01 13:44:09 +08:00
Justine Tunney e1384defca perf: make crc32 100x faster on x86-64 (#278) 2024-06-01 12:58:30 +08:00
    This change makes checkpoints load significantly faster by optimizing
    pkzip's cyclic redundancy check. This code was developed by Intel,
    Google, and Mozilla. See Chromium's zlib codebase for further details.
Phu Tran 814280343c chore: update artifact actions (#267) 2024-06-01 12:33:13 +08:00
13 changed files with 317 additions and 57 deletions

View File

@@ -4,17 +4,36 @@ on:
workflow_dispatch: # allows manual triggering
inputs:
create_release:
description: 'Create new release'
description: "Create new release"
required: true
type: boolean
push:
branches:
- master
- ci
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
paths:
[
".github/workflows/**",
"**/CMakeLists.txt",
"**/Makefile",
"**/*.h",
"**/*.hpp",
"**/*.c",
"**/*.cpp",
"**/*.cu",
]
pull_request:
types: [opened, synchronize, reopened]
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
paths:
[
"**/CMakeLists.txt",
"**/Makefile",
"**/*.h",
"**/*.hpp",
"**/*.c",
"**/*.cpp",
"**/*.cu",
]
env:
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
@@ -67,12 +86,12 @@ jobs:
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
macOS-latest-cmake:
runs-on: macos-latest
@@ -120,29 +139,29 @@ jobs:
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
windows-latest-cmake:
runs-on: windows-latest
strategy:
matrix:
include:
- build: 'noavx'
defines: '-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DSD_BUILD_SHARED_LIBS=ON'
- build: 'avx2'
defines: '-DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON'
- build: 'avx'
defines: '-DGGML_AVX2=OFF -DSD_BUILD_SHARED_LIBS=ON'
- build: 'avx512'
defines: '-DGGML_AVX512=ON -DSD_BUILD_SHARED_LIBS=ON'
- build: 'cuda12'
defines: '-DSD_CUBLAS=ON -DSD_BUILD_SHARED_LIBS=ON'
- build: 'rocm5.5'
- build: "noavx"
defines: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DSD_BUILD_SHARED_LIBS=ON"
- build: "avx2"
defines: "-DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
- build: "avx"
defines: "-DGGML_AVX2=OFF -DSD_BUILD_SHARED_LIBS=ON"
- build: "avx512"
defines: "-DGGML_AVX512=ON -DSD_BUILD_SHARED_LIBS=ON"
- build: "cuda12"
defines: "-DSD_CUBLAS=ON -DSD_BUILD_SHARED_LIBS=ON"
- build: "rocm5.5"
defines: '-G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx1100;gfx1102;gfx1030" -DSD_BUILD_SHARED_LIBS=ON'
steps:
- name: Clone
@@ -156,8 +175,8 @@ jobs:
if: ${{ matrix.build == 'cuda12' }}
uses: Jimver/cuda-toolkit@v0.2.11
with:
cuda: '12.2.0'
method: 'network'
cuda: "12.2.0"
method: "network"
sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
- name: Install rocm-toolkit
@@ -165,7 +184,7 @@ jobs:
if: ${{ matrix.build == 'rocm5.5' }}
uses: Cyberhan123/rocm-toolkit@v0.1.0
with:
rocm: '5.5.0'
rocm: "5.5.0"
- name: Install Ninja
id: install-ninja
@@ -231,15 +250,17 @@ jobs:
- name: Upload Cuda runtime
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' && matrix.build == 'cuda12' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: sd-cudart-sd-bin-win-cu12-x64.zip
path: |
cudart-sd-bin-win-cu12-x64.zip
- name: Upload artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
path: |
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
@@ -256,7 +277,11 @@ jobs:
steps:
- name: Download artifacts
id: download-artifact
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
with:
path: ./artifact
pattern: sd-*
merge-multiple: true
- name: Get commit hash
id: commit

View File

@@ -34,7 +34,7 @@ option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
if(SD_CUBLAS)
message("Use CUBLAS as backend stable-diffusion")
set(GGML_CUBLAS ON)
set(GGML_CUDA ON)
add_definitions(-DSD_USE_CUBLAS)
endif()

View File

@@ -679,8 +679,8 @@ public:
class_embedding = ggml_repeat(ctx, class_embed_weight, class_embedding); // [N, embed_dim]
class_embedding = ggml_reshape_4d(ctx, class_embedding, 1, embed_dim, 1, N); // [N, 1, embed_dim, 1]
struct ggml_tensor* x = ggml_concat(ctx, class_embedding, patch_embedding); // [N, num_positions, embed_dim, 1]
x = ggml_reshape_3d(ctx, x, embed_dim, num_positions, N); // [N, num_positions, embed_dim]
struct ggml_tensor* x = ggml_concat(ctx, class_embedding, patch_embedding, 2); // [N, num_positions, embed_dim, 1]
x = ggml_reshape_3d(ctx, x, embed_dim, num_positions, N); // [N, num_positions, embed_dim]
x = ggml_add(ctx, x, position_embed_weight);
return x; // [N, num_positions, embed_dim]
}
@@ -1036,7 +1036,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
hidden_states2->ne[3]);
hidden_states2 = ggml_cont(ctx, ggml_permute(ctx, hidden_states2, 2, 0, 1, 3));
hidden_states = ggml_concat(ctx, hidden_states, hidden_states2); // [N, n_token, hidden_size + hidden_size2]
hidden_states = ggml_concat(ctx, hidden_states, hidden_states2, 2); // [N, n_token, hidden_size + hidden_size2]
hidden_states = ggml_cont(ctx, ggml_permute(ctx, hidden_states, 1, 2, 0, 3));
}
@@ -1069,7 +1069,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
auto token_embed_weight = text_model.get_token_embed_weight();
token_embed_weight = ggml_reshape_3d(compute_ctx, token_embed_weight, token_embed_weight->ne[0], 1, token_embed_weight->ne[1]);
// concatenate custom embeddings
embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings);
embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 2);
embeddings = ggml_reshape_2d(compute_ctx, embeddings, embeddings->ne[0], embeddings->ne[2]);
}
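The clip.hpp hunks above (and the matching ones in esrgan.hpp, pmid.hpp, and unet.hpp below) track an upstream ggml API change: ggml_concat now takes the concatenation dimension as an explicit argument instead of always joining along dim 2. A minimal sketch of the new call against the updated ggml headers; tensor sizes are illustrative:

#include "ggml.h"

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16 * 1024 * 1024,  // small scratch pool, plenty for this sketch
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context* ctx = ggml_init(params);

    // two [8, 4, 3] tensors; every dimension except the concat dim must match
    struct ggml_tensor* a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 8, 4, 3);
    struct ggml_tensor* b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 8, 4, 3);

    // old API: ggml_concat(ctx, a, b) always concatenated along dim 2
    // new API: the dim is explicit; the result here is [8, 4, 6]
    struct ggml_tensor* c = ggml_concat(ctx, a, b, 2);

    struct ggml_cgraph* gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, c);
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/1);

    ggml_free(ctx);
    return 0;
}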

View File

@@ -42,13 +42,13 @@ public:
auto conv5 = std::dynamic_pointer_cast<Conv2d>(blocks["conv5"]);
auto x1 = lrelu(ctx, conv1->forward(ctx, x));
auto x_cat = ggml_concat(ctx, x, x1);
auto x_cat = ggml_concat(ctx, x, x1, 2);
auto x2 = lrelu(ctx, conv2->forward(ctx, x_cat));
x_cat = ggml_concat(ctx, x_cat, x2);
x_cat = ggml_concat(ctx, x_cat, x2, 2);
auto x3 = lrelu(ctx, conv3->forward(ctx, x_cat));
x_cat = ggml_concat(ctx, x_cat, x3);
x_cat = ggml_concat(ctx, x_cat, x3, 2);
auto x4 = lrelu(ctx, conv4->forward(ctx, x_cat));
x_cat = ggml_concat(ctx, x_cat, x4);
x_cat = ggml_concat(ctx, x_cat, x4, 2);
auto x5 = conv5->forward(ctx, x_cat);
x5 = ggml_add(ctx, ggml_scale(ctx, x5, 0.2f), x);

2
ggml

@@ -1 +1 @@
Subproject commit 57869ad3b7b1f49ae18e3238b0d69a9467a8f068
Subproject commit 2aae01fd9b8f9399f343cf18f46f38996ef52e2c

View File

@@ -917,7 +917,7 @@ public:
return NULL;
}
// it's performing a compute, check if backend isn't cpu
if (!ggml_backend_is_cpu(backend) && tensor->backend == GGML_BACKEND_TYPE_CPU) {
if (!ggml_backend_is_cpu(backend) && (tensor->buffer == NULL || ggml_backend_buffer_is_host(tensor->buffer))) {
// pass input tensors to gpu memory
auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor);

View File

@@ -571,10 +571,9 @@ void convert_tensor(void* src,
if (dst_type == GGML_TYPE_F16) {
ggml_fp32_to_fp16_row((float*)src, (ggml_fp16_t*)dst, n);
} else {
int64_t hist[16];
std::vector<float> imatrix(n_per_row, 1.0f); // dummy importance matrix
const float* im = imatrix.data();
ggml_quantize_chunk(dst_type, (float*)src, dst, 0, nrows, n_per_row, hist, im);
ggml_quantize_chunk(dst_type, (float*)src, dst, 0, nrows, n_per_row, im);
}
} else if (dst_type == GGML_TYPE_F32) {
if (src_type == GGML_TYPE_F16) {
@@ -602,10 +601,9 @@ void convert_tensor(void* src,
if (dst_type == GGML_TYPE_F16) {
ggml_fp32_to_fp16_row((float*)src_data_f32, (ggml_fp16_t*)dst, n);
} else {
int64_t hist[16];
std::vector<float> imatrix(n_per_row, 1.0f); // dummy importance matrix
const float* im = imatrix.data();
ggml_quantize_chunk(dst_type, (float*)src_data_f32, dst, 0, nrows, n_per_row, hist, im);
ggml_quantize_chunk(dst_type, (float*)src_data_f32, dst, 0, nrows, n_per_row, im);
}
}
}
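Likewise, ggml_quantize_chunk no longer takes the int64 histogram array; it now accepts an optional importance matrix, which convert_tensor fills with a dummy all-ones vector. A hedged sketch of the new call; the helper name and the Q8_0 choice are illustrative, not project code:

#include "ggml.h"
#include <vector>

// Quantize nrows rows of n_per_row floats into dst with the post-change API.
// n_per_row should be a multiple of the type's block size (32 for Q8_0) and
// dst must be large enough for the quantized rows; returns the bytes written.
size_t quantize_rows_q8_0(const float* src, void* dst, int64_t nrows, int64_t n_per_row) {
    std::vector<float> imatrix(n_per_row, 1.0f);  // dummy importance matrix, as in convert_tensor
    return ggml_quantize_chunk(GGML_TYPE_Q8_0, src, dst,
                               /*start=*/0, nrows, n_per_row,
                               imatrix.data());   // the old hist argument is gone
}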

View File

@@ -64,7 +64,7 @@ public:
auto prompt_embeds0 = ggml_cont(ctx, ggml_permute(ctx, prompt_embeds, 2, 0, 1, 3));
auto id_embeds0 = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 2, 0, 1, 3));
// concat is along dim 2
auto stacked_id_embeds = ggml_concat(ctx, prompt_embeds0, id_embeds0);
auto stacked_id_embeds = ggml_concat(ctx, prompt_embeds0, id_embeds0, 2);
stacked_id_embeds = ggml_cont(ctx, ggml_permute(ctx, stacked_id_embeds, 1, 2, 0, 3));
// stacked_id_embeds = mlp1.forward(ctx, stacked_id_embeds);
@@ -102,12 +102,12 @@ public:
stacked_id_embeds = ggml_cont(ctx, ggml_permute(ctx, stacked_id_embeds, 0, 2, 1, 3));
if (left && right) {
stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds);
stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right);
stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds, 2);
stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right, 2);
} else if (left) {
stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds);
stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds, 2);
} else if (right) {
stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right);
stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right, 2);
}
stacked_id_embeds = ggml_cont(ctx, ggml_permute(ctx, stacked_id_embeds, 0, 2, 1, 3));
class_tokens_mask = ggml_cont(ctx, ggml_transpose(ctx, class_tokens_mask));
@@ -146,7 +146,7 @@ struct PhotoMakerIDEncoderBlock : public CLIPVisionModelProjection {
id_embeds = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 2, 0, 1, 3));
id_embeds_2 = ggml_cont(ctx, ggml_permute(ctx, id_embeds_2, 2, 0, 1, 3));
id_embeds = ggml_concat(ctx, id_embeds, id_embeds_2); // [batch_size, seq_length, 1, 2048] check whether concat at dim 2 is right
id_embeds = ggml_concat(ctx, id_embeds, id_embeds_2, 2); // [batch_size, seq_length, 1, 2048] check whether concat at dim 2 is right
id_embeds = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 1, 2, 0, 3));
struct ggml_tensor* updated_prompt_embeds = fuse_module->forward(ctx,

View File

@@ -1717,7 +1717,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
for (int b = 0; b < batch_count; b++) {
int64_t sampling_start = ggml_time_ms();
int64_t cur_seed = seed + b;
LOG_INFO("generating image: %i/%i - seed %i", b + 1, batch_count, cur_seed);
LOG_INFO("generating image: %i/%i - seed %" PRId64, b + 1, batch_count, cur_seed);
sd_ctx->sd->rng->manual_seed(cur_seed);
struct ggml_tensor* x_t = NULL;
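For context on this fix: %i expects an int, so handing it the 64-bit cur_seed is undefined behavior and typically prints a truncated seed. The <inttypes.h> / <cinttypes> macros expand to the correct conversion specifier for the platform. A minimal sketch, not project code:

#include <cinttypes>  // PRId64 (also pulls in int64_t)
#include <cstdio>

int main() {
    int64_t cur_seed = 4294967296LL;                 // 2^32, does not fit in a 32-bit int
    // wrong: std::printf("seed %i\n", cur_seed);    // undefined behavior: %i reads only 32 bits
    std::printf("seed %" PRId64 "\n", cur_seed);     // portable 64-bit format, as in the LOG_INFO above
    return 0;
}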

View File

@@ -60,12 +60,11 @@ enum sd_type_t {
SD_TYPE_Q4_0 = 2,
SD_TYPE_Q4_1 = 3,
// SD_TYPE_Q4_2 = 4, support has been removed
// SD_TYPE_Q4_3 (5) support has been removed
SD_TYPE_Q5_0 = 6,
SD_TYPE_Q5_1 = 7,
SD_TYPE_Q8_0 = 8,
SD_TYPE_Q8_1 = 9,
// k-quantizations
// SD_TYPE_Q4_3 = 5, support has been removed
SD_TYPE_Q5_0 = 6,
SD_TYPE_Q5_1 = 7,
SD_TYPE_Q8_0 = 8,
SD_TYPE_Q8_1 = 9,
SD_TYPE_Q2_K = 10,
SD_TYPE_Q3_K = 11,
SD_TYPE_Q4_K = 12,
@@ -80,9 +79,13 @@ enum sd_type_t {
SD_TYPE_IQ3_S = 21,
SD_TYPE_IQ2_S = 22,
SD_TYPE_IQ4_XS = 23,
SD_TYPE_I8,
SD_TYPE_I16,
SD_TYPE_I32,
SD_TYPE_I8 = 24,
SD_TYPE_I16 = 25,
SD_TYPE_I32 = 26,
SD_TYPE_I64 = 27,
SD_TYPE_F64 = 28,
SD_TYPE_IQ1_M = 29,
SD_TYPE_BF16 = 30,
SD_TYPE_COUNT,
};
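The explicit enumerator values matter because sd_type_t is meant to stay numerically identical to ggml's ggml_type; without them, the removed Q4_2/Q4_3 slots would shift every later value, and the new I64/F64/IQ1_M/BF16 entries would not line up. A hedged sketch of a compile-time guard for that invariant (illustrative only, not part of the project):

#include "ggml.h"
#include "stable-diffusion.h"

// If these ever fire, the two enums have drifted apart and weight-type
// round-tripping between the public API and ggml would silently break.
static_assert((int)SD_TYPE_Q8_0 == (int)GGML_TYPE_Q8_0, "sd_type_t must mirror ggml_type");
static_assert((int)SD_TYPE_I64  == (int)GGML_TYPE_I64,  "sd_type_t must mirror ggml_type");
static_assert((int)SD_TYPE_BF16 == (int)GGML_TYPE_BF16, "sd_type_t must mirror ggml_type");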

2
thirdparty/.clang-format vendored Normal file
View File

@@ -0,0 +1,2 @@
DisableFormat: true
SortIncludes: Never

232
thirdparty/zip.c vendored
View File

@@ -36,6 +36,7 @@
#include <unistd.h>
#endif
#define USE_EXTERNAL_MZCRC
#include "miniz.h"
#include "zip.h"
@@ -1834,3 +1835,234 @@ int zip_extract(const char *zipname, const char *dir,
return zip_archive_extract(&zip_archive, dir, on_extract, arg);
}
#if defined(__SSE4_2__) || defined(__AVX512F__)
#include <immintrin.h>
#endif
// Phil Katz 32-Bit Cyclic Redundancy Check Uber Alles
// Goes 73 GiB/s on an AMD Ryzen Threadripper PRO 7995WX
// "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
// V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
mz_ulong mz_crc32(mz_ulong init, const uint8_t *buf, size_t len) {
uint32_t crc = ~init;
#if defined(__AVX512F__) && defined(__VPCLMULQDQ__) && defined(__PCLMUL__)
if (len >= 256) {
_Alignas(__m512) static const uint64_t k1k2[] = {
0x011542778a, 0x01322d1430, 0x011542778a, 0x01322d1430,
0x011542778a, 0x01322d1430, 0x011542778a, 0x01322d1430,
};
_Alignas(__m512) static const uint64_t k3k4[] = {
0x0154442bd4, 0x01c6e41596, 0x0154442bd4, 0x01c6e41596,
0x0154442bd4, 0x01c6e41596, 0x0154442bd4, 0x01c6e41596,
};
_Alignas(__m512) static const uint64_t k5k6[] = {
0x01751997d0,
0x00ccaa009e,
};
_Alignas(__m512) static const uint64_t k7k8[] = {
0x0163cd6124,
0x0000000000,
};
_Alignas(__m512) static const uint64_t poly[] = {
0x01db710641,
0x01f7011641,
};
__m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
__m128i a0, a1, a2, a3;
x1 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
x2 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
x3 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
x4 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
x0 = _mm512_load_si512((__m512i *)k1k2);
buf += 256;
len -= 256;
while (len >= 256) {
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11);
y5 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
y6 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
y7 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
y8 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
x1 = _mm512_xor_si512(x1, x5);
x2 = _mm512_xor_si512(x2, x6);
x3 = _mm512_xor_si512(x3, x7);
x4 = _mm512_xor_si512(x4, x8);
x1 = _mm512_xor_si512(x1, y5);
x2 = _mm512_xor_si512(x2, y6);
x3 = _mm512_xor_si512(x3, y7);
x4 = _mm512_xor_si512(x4, y8);
buf += 256;
len -= 256;
}
x0 = _mm512_load_si512((__m512i *)k3k4);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x2);
x1 = _mm512_xor_si512(x1, x5);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x3);
x1 = _mm512_xor_si512(x1, x5);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x4);
x1 = _mm512_xor_si512(x1, x5);
while (len >= 64) {
x2 = _mm512_loadu_si512((__m512i *)buf);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x2);
x1 = _mm512_xor_si512(x1, x5);
buf += 64;
len -= 64;
}
a0 = _mm_load_si128((__m128i *)k5k6);
a1 = _mm512_extracti32x4_epi32(x1, 0);
a2 = _mm512_extracti32x4_epi32(x1, 1);
a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
a1 = _mm_xor_si128(a1, a3);
a1 = _mm_xor_si128(a1, a2);
a2 = _mm512_extracti32x4_epi32(x1, 2);
a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
a1 = _mm_xor_si128(a1, a3);
a1 = _mm_xor_si128(a1, a2);
a2 = _mm512_extracti32x4_epi32(x1, 3);
a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
a1 = _mm_xor_si128(a1, a3);
a1 = _mm_xor_si128(a1, a2);
a2 = _mm_clmulepi64_si128(a1, a0, 0x10);
a3 = _mm_setr_epi32(~0, 0, ~0, 0);
a1 = _mm_srli_si128(a1, 8);
a1 = _mm_xor_si128(a1, a2);
a0 = _mm_loadl_epi64((__m128i *)k7k8);
a2 = _mm_srli_si128(a1, 4);
a1 = _mm_and_si128(a1, a3);
a1 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_xor_si128(a1, a2);
a0 = _mm_load_si128((__m128i *)poly);
a2 = _mm_and_si128(a1, a3);
a2 = _mm_clmulepi64_si128(a2, a0, 0x10);
a2 = _mm_and_si128(a2, a3);
a2 = _mm_clmulepi64_si128(a2, a0, 0x00);
a1 = _mm_xor_si128(a1, a2);
crc = _mm_extract_epi32(a1, 1);
}
#endif
#if defined(__SSE4_2__) && defined(__PCLMUL__)
if (len >= 64) {
_Alignas(__m128) static const uint64_t k1k2[] = {
0x0154442bd4,
0x01c6e41596,
};
_Alignas(__m128) static const uint64_t k3k4[] = {
0x01751997d0,
0x00ccaa009e,
};
_Alignas(__m128) static const uint64_t k5k0[] = {
0x0163cd6124,
0x0000000000,
};
_Alignas(__m128) static const uint64_t poly[] = {
0x01db710641,
0x01f7011641,
};
__m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));
x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));
x0 = _mm_load_si128((__m128i *)k1k2);
buf += 64;
len -= 64;
while (len >= 64) {
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
x8 = _mm_clmulepi64_si128(x4, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
x4 = _mm_clmulepi64_si128(x4, x0, 0x11);
y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));
x1 = _mm_xor_si128(x1, x5);
x2 = _mm_xor_si128(x2, x6);
x3 = _mm_xor_si128(x3, x7);
x4 = _mm_xor_si128(x4, x8);
x1 = _mm_xor_si128(x1, y5);
x2 = _mm_xor_si128(x2, y6);
x3 = _mm_xor_si128(x3, y7);
x4 = _mm_xor_si128(x4, y8);
buf += 64;
len -= 64;
}
x0 = _mm_load_si128((__m128i *)k3k4);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x2);
x1 = _mm_xor_si128(x1, x5);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x3);
x1 = _mm_xor_si128(x1, x5);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x4);
x1 = _mm_xor_si128(x1, x5);
while (len >= 16) {
x2 = _mm_loadu_si128((__m128i *)buf);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x2);
x1 = _mm_xor_si128(x1, x5);
buf += 16;
len -= 16;
}
x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
x3 = _mm_setr_epi32(~0, 0, ~0, 0);
x1 = _mm_srli_si128(x1, 8);
x1 = _mm_xor_si128(x1, x2);
x0 = _mm_loadl_epi64((__m128i *)k5k0);
x2 = _mm_srli_si128(x1, 4);
x1 = _mm_and_si128(x1, x3);
x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_xor_si128(x1, x2);
x0 = _mm_load_si128((__m128i *)poly);
x2 = _mm_and_si128(x1, x3);
x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
x2 = _mm_and_si128(x2, x3);
x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
x1 = _mm_xor_si128(x1, x2);
crc = _mm_extract_epi32(x1, 1);
}
#endif
static uint32_t tab[256];
if (!tab[255]) {
// generates table for byte-wise crc calculation on the polynomial
// x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1
uint32_t polynomial = 0xedb88320; // bits are reversed
for (int d = 0; d < 256; ++d) {
uint32_t r = d;
for (int i = 0; i < 8; ++i)
r = r >> 1 ^ (r & 1 ? polynomial : 0);
tab[d] = r;
}
}
for (size_t i = 0; i < len; ++i)
crc = crc >> 8 ^ tab[(crc & 255) ^ buf[i]];
return ~crc & 0xffffffff;
}
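As the commit message says, this reimplements pkzip's CRC-32 with carry-less multiplication folding (256 bytes per AVX-512 iteration, 64 bytes per SSE4.2+PCLMUL iteration), falling back to the table-driven loop for short inputs and tails. A quick hedged self-check of the exported routine: the mz_ulong typedef and prototype below mirror what the real build gets from miniz.h, 0xCBF43926 is the standard CRC-32 check value for the ASCII string "123456789", and the zero seed matches the ~init / ~crc convention visible above:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef unsigned long mz_ulong;  // as in miniz.h
mz_ulong mz_crc32(mz_ulong init, const uint8_t *buf, size_t len);

int main(void) {
    const char *msg = "123456789";
    mz_ulong crc = mz_crc32(0, (const uint8_t *)msg, 9);
    assert(crc == 0xCBF43926);   // standard CRC-32 check value

    // the ~init / ~crc convention also allows streaming:
    // feed the data in chunks and pass the running value back in
    mz_ulong part = mz_crc32(0, (const uint8_t *)msg, 4);
    part = mz_crc32(part, (const uint8_t *)msg + 4, 5);
    assert(part == crc);

    printf("crc32 ok: 0x%08lX\n", (unsigned long)crc);
    return 0;
}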

View File

@@ -396,7 +396,7 @@ public:
if (c_concat->ne[3] != x->ne[3]) {
c_concat = ggml_repeat(ctx, c_concat, x);
}
x = ggml_concat(ctx, x, c_concat);
x = ggml_concat(ctx, x, c_concat, 2);
}
if (y != NULL) {
@@ -491,7 +491,7 @@ public:
control_offset--;
}
h = ggml_concat(ctx, h, h_skip);
h = ggml_concat(ctx, h, h_skip, 2);
std::string name = "output_blocks." + std::to_string(output_block_idx) + ".0";