Compare commits

...

4 Commits

Author SHA1 Message Date
Eugene
b6daf5c55b
fix: use PRI64 instead of %i for some log (#269) 2024-06-01 14:01:58 +08:00
leejet
be6cd1a4bf sync: update ggml 2024-06-01 13:44:09 +08:00
Justine Tunney
e1384defca
perf: make crc32 100x faster on x86-64 (#278)
This change makes checkpoints load significantly faster by optimizing
pkzip's cyclic redundancy check. This code was developed by Intel,
Google, and Mozilla. See Chromium's zlib codebase for further details.
2024-06-01 12:58:30 +08:00
Phu Tran
814280343c
chore: update artifact actions (#267) 2024-06-01 12:33:13 +08:00
13 changed files with 317 additions and 57 deletions

View File

@@ -4,17 +4,36 @@ on:
   workflow_dispatch: # allows manual triggering
     inputs:
       create_release:
-        description: 'Create new release'
+        description: "Create new release"
         required: true
         type: boolean
   push:
     branches:
       - master
       - ci
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
+    paths:
+      [
+        ".github/workflows/**",
+        "**/CMakeLists.txt",
+        "**/Makefile",
+        "**/*.h",
+        "**/*.hpp",
+        "**/*.c",
+        "**/*.cpp",
+        "**/*.cu",
+      ]
   pull_request:
     types: [opened, synchronize, reopened]
-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
+    paths:
+      [
+        "**/CMakeLists.txt",
+        "**/Makefile",
+        "**/*.h",
+        "**/*.hpp",
+        "**/*.c",
+        "**/*.cpp",
+        "**/*.cu",
+      ]

 env:
   BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
@@ -67,12 +86,12 @@ jobs:
       - name: Upload artifacts
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
+          name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
           path: |
             sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip

   macOS-latest-cmake:
     runs-on: macos-latest
@@ -120,29 +139,29 @@ jobs:
       - name: Upload artifacts
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
+          name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip
           path: |
             sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-${{ steps.system-info.outputs.OS_TYPE }}-${{ steps.system-info.outputs.OS_NAME }}-${{ steps.system-info.outputs.OS_VERSION }}-${{ steps.system-info.outputs.CPU_ARCH }}.zip

   windows-latest-cmake:
     runs-on: windows-latest

     strategy:
       matrix:
         include:
-          - build: 'noavx'
-            defines: '-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DSD_BUILD_SHARED_LIBS=ON'
-          - build: 'avx2'
-            defines: '-DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON'
-          - build: 'avx'
-            defines: '-DGGML_AVX2=OFF -DSD_BUILD_SHARED_LIBS=ON'
-          - build: 'avx512'
-            defines: '-DGGML_AVX512=ON -DSD_BUILD_SHARED_LIBS=ON'
-          - build: 'cuda12'
-            defines: '-DSD_CUBLAS=ON -DSD_BUILD_SHARED_LIBS=ON'
-          - build: 'rocm5.5'
+          - build: "noavx"
+            defines: "-DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF -DSD_BUILD_SHARED_LIBS=ON"
+          - build: "avx2"
+            defines: "-DGGML_AVX2=ON -DSD_BUILD_SHARED_LIBS=ON"
+          - build: "avx"
+            defines: "-DGGML_AVX2=OFF -DSD_BUILD_SHARED_LIBS=ON"
+          - build: "avx512"
+            defines: "-DGGML_AVX512=ON -DSD_BUILD_SHARED_LIBS=ON"
+          - build: "cuda12"
+            defines: "-DSD_CUBLAS=ON -DSD_BUILD_SHARED_LIBS=ON"
+          - build: "rocm5.5"
             defines: '-G Ninja -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DSD_HIPBLAS=ON -DCMAKE_BUILD_TYPE=Release -DAMDGPU_TARGETS="gfx1100;gfx1102;gfx1030" -DSD_BUILD_SHARED_LIBS=ON'

     steps:
       - name: Clone
@@ -156,8 +175,8 @@ jobs:
         if: ${{ matrix.build == 'cuda12' }}
         uses: Jimver/cuda-toolkit@v0.2.11
         with:
-          cuda: '12.2.0'
-          method: 'network'
+          cuda: "12.2.0"
+          method: "network"
           sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'

       - name: Install rocm-toolkit
@@ -165,7 +184,7 @@ jobs:
         if: ${{ matrix.build == 'rocm5.5' }}
         uses: Cyberhan123/rocm-toolkit@v0.1.0
         with:
-          rocm: '5.5.0'
+          rocm: "5.5.0"

       - name: Install Ninja
         id: install-ninja
@@ -231,15 +250,17 @@ jobs:
       - name: Upload Cuda runtime
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' && matrix.build == 'cuda12' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
+          name: sd-cudart-sd-bin-win-cu12-x64.zip
           path: |
             cudart-sd-bin-win-cu12-x64.zip

       - name: Upload artifacts
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
         with:
+          name: sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
           path: |
             sd-${{ env.BRANCH_NAME }}-${{ steps.commit.outputs.short }}-bin-win-${{ matrix.build }}-x64.zip
@@ -256,7 +277,11 @@ jobs:
     steps:
      - name: Download artifacts
        id: download-artifact
-       uses: actions/download-artifact@v3
+       uses: actions/download-artifact@v4
+       with:
+         path: ./artifact
+         pattern: sd-*
+         merge-multiple: true

      - name: Get commit hash
        id: commit

View File

@ -34,7 +34,7 @@ option(SD_BUILD_SHARED_LIBS "sd: build shared libs" OFF)
if(SD_CUBLAS) if(SD_CUBLAS)
message("Use CUBLAS as backend stable-diffusion") message("Use CUBLAS as backend stable-diffusion")
set(GGML_CUBLAS ON) set(GGML_CUDA ON)
add_definitions(-DSD_USE_CUBLAS) add_definitions(-DSD_USE_CUBLAS)
endif() endif()

View File

@@ -679,8 +679,8 @@ public:
         class_embedding = ggml_repeat(ctx, class_embed_weight, class_embedding);      // [N, embed_dim]
         class_embedding = ggml_reshape_4d(ctx, class_embedding, 1, embed_dim, 1, N);  // [N, 1, embed_dim, 1]
-        struct ggml_tensor* x = ggml_concat(ctx, class_embedding, patch_embedding);     // [N, num_positions, embed_dim, 1]
+        struct ggml_tensor* x = ggml_concat(ctx, class_embedding, patch_embedding, 2);  // [N, num_positions, embed_dim, 1]
         x = ggml_reshape_3d(ctx, x, embed_dim, num_positions, N);  // [N, num_positions, embed_dim]
         x = ggml_add(ctx, x, position_embed_weight);
         return x;  // [N, num_positions, embed_dim]
     }
@@ -1036,7 +1036,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
                                              hidden_states2->ne[3]);
             hidden_states2 = ggml_cont(ctx, ggml_permute(ctx, hidden_states2, 2, 0, 1, 3));
-            hidden_states = ggml_concat(ctx, hidden_states, hidden_states2);     // [N, n_token, hidden_size + hidden_size2]
+            hidden_states = ggml_concat(ctx, hidden_states, hidden_states2, 2);  // [N, n_token, hidden_size + hidden_size2]
             hidden_states = ggml_cont(ctx, ggml_permute(ctx, hidden_states, 1, 2, 0, 3));
         }
@@ -1069,7 +1069,7 @@ struct FrozenCLIPEmbedderWithCustomWords : public GGMLModule {
             auto token_embed_weight = text_model.get_token_embed_weight();
             token_embed_weight = ggml_reshape_3d(compute_ctx, token_embed_weight, token_embed_weight->ne[0], 1, token_embed_weight->ne[1]);
             // concatenate custom embeddings
-            embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings);
+            embeddings = ggml_concat(compute_ctx, token_embed_weight, custom_embeddings, 2);
             embeddings = ggml_reshape_2d(compute_ctx, embeddings, embeddings->ne[0], embeddings->ne[2]);
         }
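
The recurring `ggml_concat(..., 2)` edits in this and the following files track the ggml sync: ggml_concat now takes the concatenation dimension as an explicit argument instead of always concatenating along dim 2. A minimal sketch of the new call shape, assuming the post-sync ggml headers; the tensor sizes and the main() wrapper are made up for illustration:

#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {/*mem_size=*/16 * 1024 * 1024, /*mem_buffer=*/NULL, /*no_alloc=*/false};
    struct ggml_context* ctx = ggml_init(params);

    // two tensors that agree on every dimension except dim 2
    struct ggml_tensor* a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 8, 4, 1);
    struct ggml_tensor* b = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 8, 4, 3);

    // the trailing 2 spells out what the old API hard-coded
    struct ggml_tensor* c = ggml_concat(ctx, a, b, 2);  // ne = [8, 4, 4]
    (void)c;

    ggml_free(ctx);
    return 0;
}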

View File

@ -42,13 +42,13 @@ public:
auto conv5 = std::dynamic_pointer_cast<Conv2d>(blocks["conv5"]); auto conv5 = std::dynamic_pointer_cast<Conv2d>(blocks["conv5"]);
auto x1 = lrelu(ctx, conv1->forward(ctx, x)); auto x1 = lrelu(ctx, conv1->forward(ctx, x));
auto x_cat = ggml_concat(ctx, x, x1); auto x_cat = ggml_concat(ctx, x, x1, 2);
auto x2 = lrelu(ctx, conv2->forward(ctx, x_cat)); auto x2 = lrelu(ctx, conv2->forward(ctx, x_cat));
x_cat = ggml_concat(ctx, x_cat, x2); x_cat = ggml_concat(ctx, x_cat, x2, 2);
auto x3 = lrelu(ctx, conv3->forward(ctx, x_cat)); auto x3 = lrelu(ctx, conv3->forward(ctx, x_cat));
x_cat = ggml_concat(ctx, x_cat, x3); x_cat = ggml_concat(ctx, x_cat, x3, 2);
auto x4 = lrelu(ctx, conv4->forward(ctx, x_cat)); auto x4 = lrelu(ctx, conv4->forward(ctx, x_cat));
x_cat = ggml_concat(ctx, x_cat, x4); x_cat = ggml_concat(ctx, x_cat, x4, 2);
auto x5 = conv5->forward(ctx, x_cat); auto x5 = conv5->forward(ctx, x_cat);
x5 = ggml_add(ctx, ggml_scale(ctx, x5, 0.2f), x); x5 = ggml_add(ctx, ggml_scale(ctx, x5, 0.2f), x);

ggml

@@ -1 +1 @@
-Subproject commit 57869ad3b7b1f49ae18e3238b0d69a9467a8f068
+Subproject commit 2aae01fd9b8f9399f343cf18f46f38996ef52e2c

View File

@@ -917,7 +917,7 @@ public:
                 return NULL;
             }
             // it's performing a compute, check if backend isn't cpu
-            if (!ggml_backend_is_cpu(backend) && tensor->backend == GGML_BACKEND_TYPE_CPU) {
+            if (!ggml_backend_is_cpu(backend) && (tensor->buffer == NULL || ggml_backend_buffer_is_host(tensor->buffer))) {
                 // pass input tensors to gpu memory
                 auto backend_tensor = ggml_dup_tensor(compute_ctx, tensor);

View File

@@ -571,10 +571,9 @@ void convert_tensor(void* src,
         if (dst_type == GGML_TYPE_F16) {
             ggml_fp32_to_fp16_row((float*)src, (ggml_fp16_t*)dst, n);
         } else {
-            int64_t hist[16];
             std::vector<float> imatrix(n_per_row, 1.0f);  // dummy importance matrix
             const float* im = imatrix.data();
-            ggml_quantize_chunk(dst_type, (float*)src, dst, 0, nrows, n_per_row, hist, im);
+            ggml_quantize_chunk(dst_type, (float*)src, dst, 0, nrows, n_per_row, im);
         }
     } else if (dst_type == GGML_TYPE_F32) {
         if (src_type == GGML_TYPE_F16) {
@@ -602,10 +601,9 @@ void convert_tensor(void* src,
         if (dst_type == GGML_TYPE_F16) {
             ggml_fp32_to_fp16_row((float*)src_data_f32, (ggml_fp16_t*)dst, n);
         } else {
-            int64_t hist[16];
             std::vector<float> imatrix(n_per_row, 1.0f);  // dummy importance matrix
             const float* im = imatrix.data();
-            ggml_quantize_chunk(dst_type, (float*)src_data_f32, dst, 0, nrows, n_per_row, hist, im);
+            ggml_quantize_chunk(dst_type, (float*)src_data_f32, dst, 0, nrows, n_per_row, im);
         }
     }
 }
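
The dropped `int64_t hist[16]` mirrors the updated ggml API: ggml_quantize_chunk no longer collects a quantization histogram and only takes the optional importance matrix. A hedged sketch of a call under that signature; the helper name, the Q8_0 choice, and passing NULL instead of the dummy all-ones matrix used above are illustrative assumptions, not from the patch:

#include <stdint.h>
#include "ggml.h"

// Quantize nrows rows of f32 data into dst; for Q8_0 the importance matrix is optional.
static size_t quantize_rows_q8_0(const float* src, void* dst, int64_t nrows, int64_t n_per_row) {
    return ggml_quantize_chunk(GGML_TYPE_Q8_0, src, dst, /*start=*/0, nrows, n_per_row, /*imatrix=*/NULL);
}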

View File

@@ -64,7 +64,7 @@ public:
         auto prompt_embeds0 = ggml_cont(ctx, ggml_permute(ctx, prompt_embeds, 2, 0, 1, 3));
         auto id_embeds0 = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 2, 0, 1, 3));
         // concat is along dim 2
-        auto stacked_id_embeds = ggml_concat(ctx, prompt_embeds0, id_embeds0);
+        auto stacked_id_embeds = ggml_concat(ctx, prompt_embeds0, id_embeds0, 2);
         stacked_id_embeds = ggml_cont(ctx, ggml_permute(ctx, stacked_id_embeds, 1, 2, 0, 3));
         // stacked_id_embeds = mlp1.forward(ctx, stacked_id_embeds);
@@ -102,12 +102,12 @@ public:
         stacked_id_embeds = ggml_cont(ctx, ggml_permute(ctx, stacked_id_embeds, 0, 2, 1, 3));
         if (left && right) {
-            stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds);
-            stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right);
+            stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds, 2);
+            stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right, 2);
         } else if (left) {
-            stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds);
+            stacked_id_embeds = ggml_concat(ctx, left, stacked_id_embeds, 2);
         } else if (right) {
-            stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right);
+            stacked_id_embeds = ggml_concat(ctx, stacked_id_embeds, right, 2);
         }
         stacked_id_embeds = ggml_cont(ctx, ggml_permute(ctx, stacked_id_embeds, 0, 2, 1, 3));
         class_tokens_mask = ggml_cont(ctx, ggml_transpose(ctx, class_tokens_mask));
@@ -146,7 +146,7 @@ struct PhotoMakerIDEncoderBlock : public CLIPVisionModelProjection {
         id_embeds = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 2, 0, 1, 3));
         id_embeds_2 = ggml_cont(ctx, ggml_permute(ctx, id_embeds_2, 2, 0, 1, 3));
-        id_embeds = ggml_concat(ctx, id_embeds, id_embeds_2);     // [batch_size, seq_length, 1, 2048] check whether concat at dim 2 is right
+        id_embeds = ggml_concat(ctx, id_embeds, id_embeds_2, 2);  // [batch_size, seq_length, 1, 2048] check whether concat at dim 2 is right
         id_embeds = ggml_cont(ctx, ggml_permute(ctx, id_embeds, 1, 2, 0, 3));
         struct ggml_tensor* updated_prompt_embeds = fuse_module->forward(ctx,

View File

@@ -1717,7 +1717,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx,
     for (int b = 0; b < batch_count; b++) {
         int64_t sampling_start = ggml_time_ms();
         int64_t cur_seed = seed + b;
-        LOG_INFO("generating image: %i/%i - seed %i", b + 1, batch_count, cur_seed);
+        LOG_INFO("generating image: %i/%i - seed %" PRId64, b + 1, batch_count, cur_seed);
         sd_ctx->sd->rng->manual_seed(cur_seed);
         struct ggml_tensor* x_t = NULL;
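
This is the fix from commit b6daf5c55b: cur_seed is an int64_t, so printing it with %i passes a mismatched type to the printf-style logger; PRId64 from <inttypes.h> expands to the correct conversion specifier for the platform. A standalone sketch of the pattern, with an illustrative seed value:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    int64_t seed = 4294967296LL;  // larger than any 32-bit int, so %i would misreport it
    printf("generating image: %i/%i - seed %" PRId64 "\n", 1, 4, seed);
    return 0;
}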

View File

@ -60,12 +60,11 @@ enum sd_type_t {
SD_TYPE_Q4_0 = 2, SD_TYPE_Q4_0 = 2,
SD_TYPE_Q4_1 = 3, SD_TYPE_Q4_1 = 3,
// SD_TYPE_Q4_2 = 4, support has been removed // SD_TYPE_Q4_2 = 4, support has been removed
// SD_TYPE_Q4_3 (5) support has been removed // SD_TYPE_Q4_3 = 5, support has been removed
SD_TYPE_Q5_0 = 6, SD_TYPE_Q5_0 = 6,
SD_TYPE_Q5_1 = 7, SD_TYPE_Q5_1 = 7,
SD_TYPE_Q8_0 = 8, SD_TYPE_Q8_0 = 8,
SD_TYPE_Q8_1 = 9, SD_TYPE_Q8_1 = 9,
// k-quantizations
SD_TYPE_Q2_K = 10, SD_TYPE_Q2_K = 10,
SD_TYPE_Q3_K = 11, SD_TYPE_Q3_K = 11,
SD_TYPE_Q4_K = 12, SD_TYPE_Q4_K = 12,
@@ -80,9 +79,13 @@ enum sd_type_t {
     SD_TYPE_IQ3_S = 21,
     SD_TYPE_IQ2_S = 22,
     SD_TYPE_IQ4_XS = 23,
-    SD_TYPE_I8,
-    SD_TYPE_I16,
-    SD_TYPE_I32,
+    SD_TYPE_I8 = 24,
+    SD_TYPE_I16 = 25,
+    SD_TYPE_I32 = 26,
+    SD_TYPE_I64 = 27,
+    SD_TYPE_F64 = 28,
+    SD_TYPE_IQ1_M = 29,
+    SD_TYPE_BF16 = 30,
     SD_TYPE_COUNT,
 };

thirdparty/.clang-format vendored Normal file
View File

@@ -0,0 +1,2 @@
+DisableFormat: true
+SortIncludes: Never

thirdparty/zip.c vendored
View File

@@ -36,6 +36,7 @@
 #include <unistd.h>
 #endif

+#define USE_EXTERNAL_MZCRC
 #include "miniz.h"
 #include "zip.h"

@@ -1834,3 +1835,234 @@ int zip_extract(const char *zipname, const char *dir,
   return zip_archive_extract(&zip_archive, dir, on_extract, arg);
 }

The remainder of this hunk is the newly added code:
#if defined(__SSE4_2__) || defined(__AVX512F__)
#include <immintrin.h>
#endif
// Phil Katz 32-Bit Cyclic Redundancy Check Uber Alles
// Goes 73 GiB/s on an AMD Ryzen Threadripper PRO 7995WX
// "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
// V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
mz_ulong mz_crc32(mz_ulong init, const uint8_t *buf, size_t len) {
uint32_t crc = ~init;
#if defined(__AVX512F__) && defined(__VPCLMULQDQ__) && defined(__PCLMUL__)
if (len >= 256) {
_Alignas(__m512) static const uint64_t k1k2[] = {
0x011542778a, 0x01322d1430, 0x011542778a, 0x01322d1430,
0x011542778a, 0x01322d1430, 0x011542778a, 0x01322d1430,
};
_Alignas(__m512) static const uint64_t k3k4[] = {
0x0154442bd4, 0x01c6e41596, 0x0154442bd4, 0x01c6e41596,
0x0154442bd4, 0x01c6e41596, 0x0154442bd4, 0x01c6e41596,
};
_Alignas(__m512) static const uint64_t k5k6[] = {
0x01751997d0,
0x00ccaa009e,
};
_Alignas(__m512) static const uint64_t k7k8[] = {
0x0163cd6124,
0x0000000000,
};
_Alignas(__m512) static const uint64_t poly[] = {
0x01db710641,
0x01f7011641,
};
__m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
__m128i a0, a1, a2, a3;
x1 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
x2 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
x3 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
x4 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
x0 = _mm512_load_si512((__m512i *)k1k2);
buf += 256;
len -= 256;
while (len >= 256) {
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00);
x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11);
y5 = _mm512_loadu_si512((__m512i *)(buf + 0x00));
y6 = _mm512_loadu_si512((__m512i *)(buf + 0x40));
y7 = _mm512_loadu_si512((__m512i *)(buf + 0x80));
y8 = _mm512_loadu_si512((__m512i *)(buf + 0xC0));
x1 = _mm512_xor_si512(x1, x5);
x2 = _mm512_xor_si512(x2, x6);
x3 = _mm512_xor_si512(x3, x7);
x4 = _mm512_xor_si512(x4, x8);
x1 = _mm512_xor_si512(x1, y5);
x2 = _mm512_xor_si512(x2, y6);
x3 = _mm512_xor_si512(x3, y7);
x4 = _mm512_xor_si512(x4, y8);
buf += 256;
len -= 256;
}
x0 = _mm512_load_si512((__m512i *)k3k4);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x2);
x1 = _mm512_xor_si512(x1, x5);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x3);
x1 = _mm512_xor_si512(x1, x5);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x4);
x1 = _mm512_xor_si512(x1, x5);
while (len >= 64) {
x2 = _mm512_loadu_si512((__m512i *)buf);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_xor_si512(x1, x2);
x1 = _mm512_xor_si512(x1, x5);
buf += 64;
len -= 64;
}
a0 = _mm_load_si128((__m128i *)k5k6);
a1 = _mm512_extracti32x4_epi32(x1, 0);
a2 = _mm512_extracti32x4_epi32(x1, 1);
a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
a1 = _mm_xor_si128(a1, a3);
a1 = _mm_xor_si128(a1, a2);
a2 = _mm512_extracti32x4_epi32(x1, 2);
a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
a1 = _mm_xor_si128(a1, a3);
a1 = _mm_xor_si128(a1, a2);
a2 = _mm512_extracti32x4_epi32(x1, 3);
a3 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_clmulepi64_si128(a1, a0, 0x11);
a1 = _mm_xor_si128(a1, a3);
a1 = _mm_xor_si128(a1, a2);
a2 = _mm_clmulepi64_si128(a1, a0, 0x10);
a3 = _mm_setr_epi32(~0, 0, ~0, 0);
a1 = _mm_srli_si128(a1, 8);
a1 = _mm_xor_si128(a1, a2);
a0 = _mm_loadl_epi64((__m128i *)k7k8);
a2 = _mm_srli_si128(a1, 4);
a1 = _mm_and_si128(a1, a3);
a1 = _mm_clmulepi64_si128(a1, a0, 0x00);
a1 = _mm_xor_si128(a1, a2);
a0 = _mm_load_si128((__m128i *)poly);
a2 = _mm_and_si128(a1, a3);
a2 = _mm_clmulepi64_si128(a2, a0, 0x10);
a2 = _mm_and_si128(a2, a3);
a2 = _mm_clmulepi64_si128(a2, a0, 0x00);
a1 = _mm_xor_si128(a1, a2);
crc = _mm_extract_epi32(a1, 1);
}
#endif
#if defined(__SSE4_2__) && defined(__PCLMUL__)
if (len >= 64) {
_Alignas(__m128) static const uint64_t k1k2[] = {
0x0154442bd4,
0x01c6e41596,
};
_Alignas(__m128) static const uint64_t k3k4[] = {
0x01751997d0,
0x00ccaa009e,
};
_Alignas(__m128) static const uint64_t k5k0[] = {
0x0163cd6124,
0x0000000000,
};
_Alignas(__m128) static const uint64_t poly[] = {
0x01db710641,
0x01f7011641,
};
__m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));
x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));
x0 = _mm_load_si128((__m128i *)k1k2);
buf += 64;
len -= 64;
while (len >= 64) {
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
x8 = _mm_clmulepi64_si128(x4, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
x4 = _mm_clmulepi64_si128(x4, x0, 0x11);
y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));
x1 = _mm_xor_si128(x1, x5);
x2 = _mm_xor_si128(x2, x6);
x3 = _mm_xor_si128(x3, x7);
x4 = _mm_xor_si128(x4, x8);
x1 = _mm_xor_si128(x1, y5);
x2 = _mm_xor_si128(x2, y6);
x3 = _mm_xor_si128(x3, y7);
x4 = _mm_xor_si128(x4, y8);
buf += 64;
len -= 64;
}
x0 = _mm_load_si128((__m128i *)k3k4);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x2);
x1 = _mm_xor_si128(x1, x5);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x3);
x1 = _mm_xor_si128(x1, x5);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x4);
x1 = _mm_xor_si128(x1, x5);
while (len >= 16) {
x2 = _mm_loadu_si128((__m128i *)buf);
x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
x1 = _mm_xor_si128(x1, x2);
x1 = _mm_xor_si128(x1, x5);
buf += 16;
len -= 16;
}
x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
x3 = _mm_setr_epi32(~0, 0, ~0, 0);
x1 = _mm_srli_si128(x1, 8);
x1 = _mm_xor_si128(x1, x2);
x0 = _mm_loadl_epi64((__m128i *)k5k0);
x2 = _mm_srli_si128(x1, 4);
x1 = _mm_and_si128(x1, x3);
x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
x1 = _mm_xor_si128(x1, x2);
x0 = _mm_load_si128((__m128i *)poly);
x2 = _mm_and_si128(x1, x3);
x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
x2 = _mm_and_si128(x2, x3);
x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
x1 = _mm_xor_si128(x1, x2);
crc = _mm_extract_epi32(x1, 1);
}
#endif
static uint32_t tab[256];
if (!tab[255]) {
// generates table for byte-wise crc calculation on the polynomial
// x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1
uint32_t polynomial = 0xedb88320; // bits are reversed
for (int d = 0; d < 256; ++d) {
uint32_t r = d;
for (int i = 0; i < 8; ++i)
r = r >> 1 ^ (r & 1 ? polynomial : 0);
tab[d] = r;
}
}
for (size_t i = 0; i < len; ++i)
crc = crc >> 8 ^ tab[(crc & 255) ^ buf[i]];
return ~crc & 0xffffffff;
}
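
For orientation, a small usage sketch that is not part of the patch: the USE_EXTERNAL_MZCRC define above presumably tells the bundled miniz to use this mz_crc32 definition in place of its table-only fallback, while callers keep the usual miniz convention of seeding with 0 and optionally feeding the data in chunks. The payload here is hypothetical; the prototype mirrors the definition above (mz_ulong is unsigned long in miniz).

#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Prototype of the routine defined above.
unsigned long mz_crc32(unsigned long init, const uint8_t *buf, size_t len);

int main(void) {
    const uint8_t payload[] = "hello, pkzip";  // hypothetical data
    // 0 is the conventional initial value; pass the previous result back in to checksum a stream in pieces
    unsigned long crc = mz_crc32(0, payload, sizeof(payload) - 1);
    printf("crc32 = %08lx\n", crc);
    return 0;
}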

View File

@ -396,7 +396,7 @@ public:
if (c_concat->ne[3] != x->ne[3]) { if (c_concat->ne[3] != x->ne[3]) {
c_concat = ggml_repeat(ctx, c_concat, x); c_concat = ggml_repeat(ctx, c_concat, x);
} }
x = ggml_concat(ctx, x, c_concat); x = ggml_concat(ctx, x, c_concat, 2);
} }
if (y != NULL) { if (y != NULL) {
@ -491,7 +491,7 @@ public:
control_offset--; control_offset--;
} }
h = ggml_concat(ctx, h, h_skip); h = ggml_concat(ctx, h, h_skip, 2);
std::string name = "output_blocks." + std::to_string(output_block_idx) + ".0"; std::string name = "output_blocks." + std::to_string(output_block_idx) + ".0";