mirror of
https://github.com/leejet/stable-diffusion.cpp.git
synced 2025-12-13 05:48:56 +00:00
Compare commits
4 Commits
1d2af5ca3f
...
b6daf5c55b
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b6daf5c55b | ||
|
|
be6cd1a4bf | ||
|
|
e1384defca | ||
|
|
814280343c |
73
.github/workflows/build.yml
vendored
73
.github/workflows/build.yml
vendored
@ -4,17 +4,36 @@ on:
|
||||
workflow_dispatch: # allows manual triggering
|
||||
inputs:
|
||||
create_release:
|
||||
description: 'Create new release'
|
||||
description: "Create new release"
|
||||
required: true
|
||||
type: boolean
|
||||
push:
|
||||
branches:
|
||||
- master
|
||||
- ci
|
||||
paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
|
||||
paths:
|
||||
[
|
||||
".github/workflows/**",
|
||||
"**/CMakeLists.txt",
|
||||
"**/Makefile",
|
||||
"**/*.h",
|
||||
"**/*.hpp",
|
||||
"**/*.c",
|
||||
"**/*.cpp",
|
||||
"**/*.cu",
|
||||
]
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
|
||||
paths:
|
||||
[
|
||||
"**/CMakeLists.txt",
|
||||
"**/Makefile",
|
||||
"**/*.h",
|
||||
"**/*.hpp",
|
||||
"**/*.c",
|
||||
"**/*.cpp",
|
||||
"**/*.cu",
|
||||
]
|
||||
|
||||
env:
|
||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
@ -67,12 +86,12 @@ jobs:
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v3
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
|
||||
path: |
|
||||
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
|
||||
|
||||
|
||||
macOS-latest-cmake:
|
||||
runs-on: macos-latest
|
||||
|
||||
@ -120,29 +139,29 @@ jobs:
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v3
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
|
||||
path: |
|
||||
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
|
||||
|
||||
|
||||
windows-latest-cmake:
|
||||
runs-on: windows-latest
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- build: 'noavx'
|
||||
defines: '-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DSD_BUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx2'
|
||||
defines: '-DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx'
|
||||
defines: '-DGGML_AVX2=OFF -DSD_BUILD_SHARED_LIBS=ON'
|
||||
- build: 'avx512'
|
||||
defines: '-DGGML_AVX512=ON -DSD_BUILD_SHARED_LIBS=ON'
|
||||
- build: 'cuda12'
|
||||
defines: '-DSD_CUBLAS=ON -DSD_BUILD_SHARED_LIBS=ON'
|
||||
- build: 'rocm5.5'
|
||||
- build: "noavx"
|
||||
defines: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DSD_BUILD_SHARED_LIBS=ON"
|
||||
- build: "avx2"
|
||||
defines: "-DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
|
||||
- build: "avx"
|
||||
defines: "-DGGML_AVX2=OFF -DSD_BUILD_SHARED_LIBS=ON"
|
||||
- build: "avx512"
|
||||
defines: "-DGGML_AVX512=ON -DSD_BUILD_SHARED_LIBS=ON"
|
||||
- build: "cuda12"
|
||||
defines: "-DSD_CUBLAS=ON -DSD_BUILD_SHARED_LIBS=ON"
|
||||
- build: "rocm5.5"
|
||||
defines: '-G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx1100;gfx1102;gfx1030" -DSD_BUILD_SHARED_LIBS=ON'
|
||||
steps:
|
||||
- name: Clone
|
||||
@ -156,8 +175,8 @@ jobs:
|
||||
if: ${{ matrix.build == 'cuda12' }}
|
||||
uses: Jimver/cuda-toolkit@v0.2.11
|
||||
with:
|
||||
cuda: '12.2.0'
|
||||
method: 'network'
|
||||
cuda: "12.2.0"
|
||||
method: "network"
|
||||
sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
|
||||
|
||||
- name: Install rocm-toolkit
|
||||
@ -165,7 +184,7 @@ jobs:
|
||||
if: ${{ matrix.build == 'rocm5.5' }}
|
||||
uses: Cyberhan123/rocm-toolkit@v0.1.0
|
||||
with:
|
||||
rocm: '5.5.0'
|
||||
rocm: "5.5.0"
|
||||
|
||||
- name: Install Ninja
|
||||
id: install-ninja
|
||||
@ -231,15 +250,17 @@ jobs:
|
||||
|
||||
- name: Upload Cuda runtime
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' && matrix.build == 'cuda12' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v3
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: sd-cudart-sd-bin-win-cu12-x64.zip
|
||||
path: |
|
||||
cudart-sd-bin-win-cu12-x64.zip
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
|
||||
uses: actions/upload-artifact@v3
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
|
||||
path: |
|
||||
sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
|
||||
|
||||
@ -256,7 +277,11 @@ jobs:
|
||||
steps:
|
||||
- name: Download artifacts
|
||||
id: download-artifact
|
||||
uses: actions/download-artifact@v3
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
path: ./artifact
|
||||
pattern: sd-*
|
||||
merge-multiple: true
|
||||
|
||||
- name: Get commit hash
|
||||
id: commit
|
||||
|
||||
@ -34,7 +34,7 @@ option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
|
||||
|
||||
if(SD_CUBLAS)
|
||||
message("Use CUBLAS as backend stable-diffusion")
|
||||
set(GGML_CUBLAS ON)
|
||||
set(GGML_CUDA ON)
|
||||
add_definitions(-DSD_USE_CUBLAS)
|
||||
endif()
|
||||
|
||||
|
||||
8
clip.hpp
8
clip.hpp
@ -679,8 +679,8 @@ public:
|
||||
class_embedding = ggml_repeat(ctx, class_embed_weight, class_embedding); // [N, embed_dim]
|
||||
class_embedding = ggml_reshape_4d(ctx, class_embedding, 1, embed_dim, 1, N); // [N, 1, embed_dim, 1]
|
||||
|
||||
struct ggml_tensor* x = ggml_concat(ctx, class_embedding, patch_embedding); // [N, num_positions, embed_dim, 1]
|
||||
x = ggml_reshape_3d(ctx, x, embed_dim, num_positions, N); // [N, num_positions, embed_dim]
|
||||
struct ggml_tensor* x = ggml_concat(ctx, class_embedding, patch_embedding, 2); // [N, num_positions, embed_dim, 1]
|
||||
x = ggml_reshape_3d(ctx, x, embed_dim, num_positions, N); // [N, num_positions, embed_dim]
|
||||
x = ggml_add(ctx, x, position_embed_weight);
|
||||
return x; // [N, num_positions, embed_dim]
|
||||
}
|
||||
@ -1036,7 +1036,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
|
||||
hidden_states2->ne[3]);
|
||||
hidden_states2 = ggml_cont(ctx, ggml_permute(ctx, hidden_states2, 2, 0, 1, 3));
|
||||
|
||||
hidden_states = ggml_concat(ctx, hidden_states, hidden_states2); // [N, n_token, hidden_size + hidden_size2]
|
||||
hidden_states = ggml_concat(ctx, hidden_states, hidden_states2, 2); // [N, n_token, hidden_size + hidden_size2]
|
||||
|
||||
hidden_states = ggml_cont(ctx, ggml_permute(ctx, hidden_states, 1, 2, 0, 3));
|
||||
}
|
||||
@ -1069,7 +1069,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
|
||||
auto token_embed_weight = text_model.get_token_embed_weight();
|
||||
token_embed_weight = ggml_reshape_3d(compute_ctx, token_embed_weight, token_embed_weight->ne[0], 1, token_embed_weight->ne[1]);
|
||||
// concatenate custom embeddings
|
||||
embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings);
|
||||
embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 2);
|
||||
embeddings = ggml_reshape_2d(compute_ctx, embeddings, embeddings->ne[0], embeddings->ne[2]);
|
||||
}
|
||||
|
||||
|
||||
@ -42,13 +42,13 @@ public:
|
||||
auto conv5 = std::dynamic_pointer_cast<Conv2d>(blocks["conv5"]);
|
||||
|
||||
auto x1 = lrelu(ctx, conv1->forward(ctx, x));
|
||||
auto x_cat = ggml_concat(ctx, x, x1);
|
||||
auto x_cat = ggml_concat(ctx, x, x1, 2);
|
||||
auto x2 = lrelu(ctx, conv2->forward(ctx, x_cat));
|
||||
x_cat = ggml_concat(ctx, x_cat, x2);
|
||||
x_cat = ggml_concat(ctx, x_cat, x2, 2);
|
||||
auto x3 = lrelu(ctx, conv3->forward(ctx, x_cat));
|
||||
x_cat = ggml_concat(ctx, x_cat, x3);
|
||||
x_cat = ggml_concat(ctx, x_cat, x3, 2);
|
||||
auto x4 = lrelu(ctx, conv4->forward(ctx, x_cat));
|
||||
x_cat = ggml_concat(ctx, x_cat, x4);
|
||||
x_cat = ggml_concat(ctx, x_cat, x4, 2);
|
||||
auto x5 = conv5->forward(ctx, x_cat);
|
||||
|
||||
x5 = ggml_add(ctx, ggml_scale(ctx, x5, 0.2f), x);
|
||||
|
||||
2
ggml
2
ggml
@ -1 +1 @@
|
||||
Subproject commit 57869ad3b7b1f49ae18e3238b0d69a9467a8f068
|
||||
Subproject commit 2aae01fd9b8f9399f343cf18f46f38996ef52e2c
|
||||
@ -917,7 +917,7 @@ public:
|
||||
return NULL;
|
||||
}
|
||||
// it's performing a compute, check if backend isn't cpu
|
||||
if (!ggml_backend_is_cpu(backend) && tensor->backend == GGML_BACKEND_TYPE_CPU) {
|
||||
if (!ggml_backend_is_cpu(backend) && (tensor->buffer == NULL || ggml_backend_buffer_is_host(tensor->buffer))) {
|
||||
// pass input tensors to gpu memory
|
||||
auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor);
|
||||
|
||||
|
||||
@ -571,10 +571,9 @@ void convert_tensor(void* src,
|
||||
if (dst_type == GGML_TYPE_F16) {
|
||||
ggml_fp32_to_fp16_row((float*)src, (ggml_fp16_t*)dst, n);
|
||||
} else {
|
||||
int64_t hist[16];
|
||||
std::vector<float> imatrix(n_per_row, 1.0f); // dummy importance matrix
|
||||
const float* im = imatrix.data();
|
||||
ggml_quantize_chunk(dst_type, (float*)src, dst, 0, nrows, n_per_row, hist, im);
|
||||
ggml_quantize_chunk(dst_type, (float*)src, dst, 0, nrows, n_per_row, im);
|
||||
}
|
||||
} else if (dst_type == GGML_TYPE_F32) {
|
||||
if (src_type == GGML_TYPE_F16) {
|
||||
@ -602,10 +601,9 @@ void convert_tensor(void* src,
|
||||
if (dst_type == GGML_TYPE_F16) {
|
||||
ggml_fp32_to_fp16_row((float*)src_data_f32, (ggml_fp16_t*)dst, n);
|
||||
} else {
|
||||
int64_t hist[16];
|
||||
std::vector<float> imatrix(n_per_row, 1.0f); // dummy importance matrix
|
||||
const float* im = imatrix.data();
|
||||
ggml_quantize_chunk(dst_type, (float*)src_data_f32, dst, 0, nrows, n_per_row, hist, im);
|
||||
ggml_quantize_chunk(dst_type, (float*)src_data_f32, dst, 0, nrows, n_per_row, im);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
12
pmid.hpp
12
pmid.hpp
@ -64,7 +64,7 @@ public:
|
||||
auto prompt_embeds0 = ggml_cont(ctx, ggml_permute(ctx, prompt_embeds, 2, 0, 1, 3));
|
||||
auto id_embeds0 = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 2, 0, 1, 3));
|
||||
// concat is along dim 2
|
||||
auto stacked_id_embeds = ggml_concat(ctx, prompt_embeds0, id_embeds0);
|
||||
auto stacked_id_embeds = ggml_concat(ctx, prompt_embeds0, id_embeds0, 2);
|
||||
stacked_id_embeds = ggml_cont(ctx, ggml_permute(ctx, stacked_id_embeds, 1, 2, 0, 3));
|
||||
|
||||
// stacked_id_embeds = mlp1.forward(ctx, stacked_id_embeds);
|
||||
@ -102,12 +102,12 @@ public:
|
||||
|
||||
stacked_id_embeds = ggml_cont(ctx, ggml_permute(ctx, stacked_id_embeds, 0, 2, 1, 3));
|
||||
if (left && right) {
|
||||
stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds);
|
||||
stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right);
|
||||
stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds, 2);
|
||||
stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right, 2);
|
||||
} else if (left) {
|
||||
stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds);
|
||||
stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds, 2);
|
||||
} else if (right) {
|
||||
stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right);
|
||||
stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right, 2);
|
||||
}
|
||||
stacked_id_embeds = ggml_cont(ctx, ggml_permute(ctx, stacked_id_embeds, 0, 2, 1, 3));
|
||||
class_tokens_mask = ggml_cont(ctx, ggml_transpose(ctx, class_tokens_mask));
|
||||
@ -146,7 +146,7 @@ struct PhotoMakerIDEncoderBlock : public CLIPVisionModelProjection {
|
||||
id_embeds = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 2, 0, 1, 3));
|
||||
id_embeds_2 = ggml_cont(ctx, ggml_permute(ctx, id_embeds_2, 2, 0, 1, 3));
|
||||
|
||||
id_embeds = ggml_concat(ctx, id_embeds, id_embeds_2); // [batch_size, seq_length, 1, 2048] check whether concat at dim 2 is right
|
||||
id_embeds = ggml_concat(ctx, id_embeds, id_embeds_2, 2); // [batch_size, seq_length, 1, 2048] check whether concat at dim 2 is right
|
||||
id_embeds = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 1, 2, 0, 3));
|
||||
|
||||
struct ggml_tensor* updated_prompt_embeds = fuse_module->forward(ctx,
|
||||
|
||||
@ -1717,7 +1717,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
|
||||
for (int b = 0; b < batch_count; b++) {
|
||||
int64_t sampling_start = ggml_time_ms();
|
||||
int64_t cur_seed = seed + b;
|
||||
LOG_INFO("generating image: %i/%i - seed %i", b + 1, batch_count, cur_seed);
|
||||
LOG_INFO("generating image: %i/%i - seed %" PRId64, b + 1, batch_count, cur_seed);
|
||||
|
||||
sd_ctx->sd->rng->manual_seed(cur_seed);
|
||||
struct ggml_tensor* x_t = NULL;
|
||||
|
||||
@ -60,12 +60,11 @@ enum sd_type_t {
|
||||
SD_TYPE_Q4_0 = 2,
|
||||
SD_TYPE_Q4_1 = 3,
|
||||
// SD_TYPE_Q4_2 = 4, support has been removed
|
||||
// SD_TYPE_Q4_3 (5) support has been removed
|
||||
SD_TYPE_Q5_0 = 6,
|
||||
SD_TYPE_Q5_1 = 7,
|
||||
SD_TYPE_Q8_0 = 8,
|
||||
SD_TYPE_Q8_1 = 9,
|
||||
// k-quantizations
|
||||
// SD_TYPE_Q4_3 = 5, support has been removed
|
||||
SD_TYPE_Q5_0 = 6,
|
||||
SD_TYPE_Q5_1 = 7,
|
||||
SD_TYPE_Q8_0 = 8,
|
||||
SD_TYPE_Q8_1 = 9,
|
||||
SD_TYPE_Q2_K = 10,
|
||||
SD_TYPE_Q3_K = 11,
|
||||
SD_TYPE_Q4_K = 12,
|
||||
@ -80,9 +79,13 @@ enum sd_type_t {
|
||||
SD_TYPE_IQ3_S = 21,
|
||||
SD_TYPE_IQ2_S = 22,
|
||||
SD_TYPE_IQ4_XS = 23,
|
||||
SD_TYPE_I8,
|
||||
SD_TYPE_I16,
|
||||
SD_TYPE_I32,
|
||||
SD_TYPE_I8 = 24,
|
||||
SD_TYPE_I16 = 25,
|
||||
SD_TYPE_I32 = 26,
|
||||
SD_TYPE_I64 = 27,
|
||||
SD_TYPE_F64 = 28,
|
||||
SD_TYPE_IQ1_M = 29,
|
||||
SD_TYPE_BF16 = 30,
|
||||
SD_TYPE_COUNT,
|
||||
};
|
||||
|
||||
|
||||
2
thirdparty/.clang-format
vendored
Normal file
2
thirdparty/.clang-format
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
DisableFormat: true
|
||||
SortIncludes: Never
|
||||
232
thirdparty/zip.c
vendored
232
thirdparty/zip.c
vendored
@ -36,6 +36,7 @@
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#define USE_EXTERNAL_MZCRC
|
||||
#include "miniz.h"
|
||||
#include "zip.h"
|
||||
|
||||
@ -1834,3 +1835,234 @@ int zip_extract(const char *zipname, const char *dir,
|
||||
|
||||
return zip_archive_extract(&zip_archive, dir, on_extract, arg);
|
||||
}
|
||||
|
||||
#if defined(__SSE4_2__) || defined(__AVX512F__)
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
|
||||
// Phil Katz 32-Bit Cyclic Redundancy Check Uber Alles
|
||||
// Goes 73 GiB/s on an AMD Ryzen Threadripper PRO 7995WX
|
||||
// "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
|
||||
// V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
|
||||
mz_ulong mz_crc32(mz_ulong init, const uint8_t *buf, size_t len) {
|
||||
uint32_t crc = ~init;
|
||||
#if defined(__AVX512F__) && defined(__VPCLMULQDQ__) && defined(__PCLMUL__)
|
||||
if (len >= 256) {
|
||||
_Alignas(__m512) static const uint64_t k1k2[] = {
|
||||
0x011542778a, 0x01322d1430, 0x011542778a, 0x01322d1430,
|
||||
0x011542778a, 0x01322d1430, 0x011542778a, 0x01322d1430,
|
||||
};
|
||||
_Alignas(__m512) static const uint64_t k3k4[] = {
|
||||
0x0154442bd4, 0x01c6e41596, 0x0154442bd4, 0x01c6e41596,
|
||||
0x0154442bd4, 0x01c6e41596, 0x0154442bd4, 0x01c6e41596,
|
||||
};
|
||||
_Alignas(__m512) static const uint64_t k5k6[] = {
|
||||
0x01751997d0,
|
||||
0x00ccaa009e,
|
||||
};
|
||||
_Alignas(__m512) static const uint64_t k7k8[] = {
|
||||
0x0163cd6124,
|
||||
0x0000000000,
|
||||
};
|
||||
_Alignas(__m512) static const uint64_t poly[] = {
|
||||
0x01db710641,
|
||||
0x01f7011641,
|
||||
};
|
||||
__m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
|
||||
__m128i a0, a1, a2, a3;
|
||||
x1 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
|
||||
x2 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
|
||||
x3 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
|
||||
x4 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
|
||||
x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
|
||||
x0 = _mm512_load_si512((__m512i *)k1k2);
|
||||
buf += 256;
|
||||
len -= 256;
|
||||
while (len >= 256) {
|
||||
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
|
||||
x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
|
||||
x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
|
||||
x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00);
|
||||
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
|
||||
x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
|
||||
x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
|
||||
x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11);
|
||||
y5 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
|
||||
y6 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
|
||||
y7 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
|
||||
y8 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
|
||||
x1 = _mm512_xor_si512(x1, x5);
|
||||
x2 = _mm512_xor_si512(x2, x6);
|
||||
x3 = _mm512_xor_si512(x3, x7);
|
||||
x4 = _mm512_xor_si512(x4, x8);
|
||||
x1 = _mm512_xor_si512(x1, y5);
|
||||
x2 = _mm512_xor_si512(x2, y6);
|
||||
x3 = _mm512_xor_si512(x3, y7);
|
||||
x4 = _mm512_xor_si512(x4, y8);
|
||||
buf += 256;
|
||||
len -= 256;
|
||||
}
|
||||
x0 = _mm512_load_si512((__m512i *)k3k4);
|
||||
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
|
||||
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
|
||||
x1 = _mm512_xor_si512(x1, x2);
|
||||
x1 = _mm512_xor_si512(x1, x5);
|
||||
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
|
||||
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
|
||||
x1 = _mm512_xor_si512(x1, x3);
|
||||
x1 = _mm512_xor_si512(x1, x5);
|
||||
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
|
||||
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
|
||||
x1 = _mm512_xor_si512(x1, x4);
|
||||
x1 = _mm512_xor_si512(x1, x5);
|
||||
while (len >= 64) {
|
||||
x2 = _mm512_loadu_si512((__m512i *)buf);
|
||||
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
|
||||
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
|
||||
x1 = _mm512_xor_si512(x1, x2);
|
||||
x1 = _mm512_xor_si512(x1, x5);
|
||||
buf += 64;
|
||||
len -= 64;
|
||||
}
|
||||
a0 = _mm_load_si128((__m128i *)k5k6);
|
||||
a1 = _mm512_extracti32x4_epi32(x1, 0);
|
||||
a2 = _mm512_extracti32x4_epi32(x1, 1);
|
||||
a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
|
||||
a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
|
||||
a1 = _mm_xor_si128(a1, a3);
|
||||
a1 = _mm_xor_si128(a1, a2);
|
||||
a2 = _mm512_extracti32x4_epi32(x1, 2);
|
||||
a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
|
||||
a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
|
||||
a1 = _mm_xor_si128(a1, a3);
|
||||
a1 = _mm_xor_si128(a1, a2);
|
||||
a2 = _mm512_extracti32x4_epi32(x1, 3);
|
||||
a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
|
||||
a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
|
||||
a1 = _mm_xor_si128(a1, a3);
|
||||
a1 = _mm_xor_si128(a1, a2);
|
||||
a2 = _mm_clmulepi64_si128(a1, a0, 0x10);
|
||||
a3 = _mm_setr_epi32(~0, 0, ~0, 0);
|
||||
a1 = _mm_srli_si128(a1, 8);
|
||||
a1 = _mm_xor_si128(a1, a2);
|
||||
a0 = _mm_loadl_epi64((__m128i *)k7k8);
|
||||
a2 = _mm_srli_si128(a1, 4);
|
||||
a1 = _mm_and_si128(a1, a3);
|
||||
a1 = _mm_clmulepi64_si128(a1, a0, 0x00);
|
||||
a1 = _mm_xor_si128(a1, a2);
|
||||
a0 = _mm_load_si128((__m128i *)poly);
|
||||
a2 = _mm_and_si128(a1, a3);
|
||||
a2 = _mm_clmulepi64_si128(a2, a0, 0x10);
|
||||
a2 = _mm_and_si128(a2, a3);
|
||||
a2 = _mm_clmulepi64_si128(a2, a0, 0x00);
|
||||
a1 = _mm_xor_si128(a1, a2);
|
||||
crc = _mm_extract_epi32(a1, 1);
|
||||
}
|
||||
#endif
|
||||
#if defined(__SSE4_2__) && defined(__PCLMUL__)
|
||||
if (len >= 64) {
|
||||
_Alignas(__m128) static const uint64_t k1k2[] = {
|
||||
0x0154442bd4,
|
||||
0x01c6e41596,
|
||||
};
|
||||
_Alignas(__m128) static const uint64_t k3k4[] = {
|
||||
0x01751997d0,
|
||||
0x00ccaa009e,
|
||||
};
|
||||
_Alignas(__m128) static const uint64_t k5k0[] = {
|
||||
0x0163cd6124,
|
||||
0x0000000000,
|
||||
};
|
||||
_Alignas(__m128) static const uint64_t poly[] = {
|
||||
0x01db710641,
|
||||
0x01f7011641,
|
||||
};
|
||||
__m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
|
||||
x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
|
||||
x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
|
||||
x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
|
||||
x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));
|
||||
x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));
|
||||
x0 = _mm_load_si128((__m128i *)k1k2);
|
||||
buf += 64;
|
||||
len -= 64;
|
||||
while (len >= 64) {
|
||||
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
|
||||
x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
|
||||
x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
|
||||
x8 = _mm_clmulepi64_si128(x4, x0, 0x00);
|
||||
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
|
||||
x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
|
||||
x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
|
||||
x4 = _mm_clmulepi64_si128(x4, x0, 0x11);
|
||||
y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
|
||||
y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
|
||||
y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
|
||||
y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));
|
||||
x1 = _mm_xor_si128(x1, x5);
|
||||
x2 = _mm_xor_si128(x2, x6);
|
||||
x3 = _mm_xor_si128(x3, x7);
|
||||
x4 = _mm_xor_si128(x4, x8);
|
||||
x1 = _mm_xor_si128(x1, y5);
|
||||
x2 = _mm_xor_si128(x2, y6);
|
||||
x3 = _mm_xor_si128(x3, y7);
|
||||
x4 = _mm_xor_si128(x4, y8);
|
||||
buf += 64;
|
||||
len -= 64;
|
||||
}
|
||||
x0 = _mm_load_si128((__m128i *)k3k4);
|
||||
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
|
||||
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x1 = _mm_xor_si128(x1, x5);
|
||||
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
|
||||
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
|
||||
x1 = _mm_xor_si128(x1, x3);
|
||||
x1 = _mm_xor_si128(x1, x5);
|
||||
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
|
||||
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
|
||||
x1 = _mm_xor_si128(x1, x4);
|
||||
x1 = _mm_xor_si128(x1, x5);
|
||||
while (len >= 16) {
|
||||
x2 = _mm_loadu_si128((__m128i *)buf);
|
||||
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
|
||||
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x1 = _mm_xor_si128(x1, x5);
|
||||
buf += 16;
|
||||
len -= 16;
|
||||
}
|
||||
x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
|
||||
x3 = _mm_setr_epi32(~0, 0, ~0, 0);
|
||||
x1 = _mm_srli_si128(x1, 8);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x0 = _mm_loadl_epi64((__m128i *)k5k0);
|
||||
x2 = _mm_srli_si128(x1, 4);
|
||||
x1 = _mm_and_si128(x1, x3);
|
||||
x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
x0 = _mm_load_si128((__m128i *)poly);
|
||||
x2 = _mm_and_si128(x1, x3);
|
||||
x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
|
||||
x2 = _mm_and_si128(x2, x3);
|
||||
x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
|
||||
x1 = _mm_xor_si128(x1, x2);
|
||||
crc = _mm_extract_epi32(x1, 1);
|
||||
}
|
||||
#endif
|
||||
static uint32_t tab[256];
|
||||
if (!tab[255]) {
|
||||
// generates table for byte-wise crc calculation on the polynomial
|
||||
// x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1
|
||||
uint32_t polynomial = 0xedb88320; // bits are reversed
|
||||
for (int d = 0; d < 256; ++d) {
|
||||
uint32_t r = d;
|
||||
for (int i = 0; i < 8; ++i)
|
||||
r = r >> 1 ^ (r & 1 ? polynomial : 0);
|
||||
tab[d] = r;
|
||||
}
|
||||
}
|
||||
for (size_t i = 0; i < len; ++i)
|
||||
crc = crc >> 8 ^ tab[(crc & 255) ^ buf[i]];
|
||||
return ~crc & 0xffffffff;
|
||||
}
|
||||
|
||||
4
unet.hpp
4
unet.hpp
@ -396,7 +396,7 @@ public:
|
||||
if (c_concat->ne[3] != x->ne[3]) {
|
||||
c_concat = ggml_repeat(ctx, c_concat, x);
|
||||
}
|
||||
x = ggml_concat(ctx, x, c_concat);
|
||||
x = ggml_concat(ctx, x, c_concat, 2);
|
||||
}
|
||||
|
||||
if (y != NULL) {
|
||||
@ -491,7 +491,7 @@ public:
|
||||
control_offset--;
|
||||
}
|
||||
|
||||
h = ggml_concat(ctx, h, h_skip);
|
||||
h = ggml_concat(ctx, h, h_skip, 2);
|
||||
|
||||
std::string name = "output_blocks." + std::to_string(output_block_idx) + ".0";
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user