diff --git a/tig-algorithms/src/vector_search/cluster_improved/README.md b/tig-algorithms/src/vector_search/cluster_improved/README.md new file mode 100644 index 0000000..b828c8a --- /dev/null +++ b/tig-algorithms/src/vector_search/cluster_improved/README.md @@ -0,0 +1,23 @@ +# TIG Code Submission + + ## Submission Details + + * **Challenge Name:** vector_search + * **Submission Name:** cluster_improved + * **Copyright:** 2025 Rootz + * **Identity of Submitter:** Rootz + * **Identity of Creator of Algorithmic Method:** null + * **Unique Algorithm Identifier (UAI):** null + + ## License + + The files in this folder are under the following licenses: + * TIG Benchmarker Outbound License + * TIG Commercial License + * TIG Inbound Game License + * TIG Innovator Outbound Game License + * TIG Open Data License + * TIG THV Game License + + Copies of the licenses can be obtained at: + https://github.com/tig-foundation/tig-monorepo/tree/main/docs/licenses \ No newline at end of file diff --git a/tig-algorithms/src/vector_search/cluster_improved/kernels.cu b/tig-algorithms/src/vector_search/cluster_improved/kernels.cu new file mode 100644 index 0000000..08494e4 --- /dev/null +++ b/tig-algorithms/src/vector_search/cluster_improved/kernels.cu @@ -0,0 +1,821 @@ +/*!Copyright 2025 Rootz + +Identity of Submitter Rootz + +UAI null + +Licensed under the TIG Inbound Game License v2.0 or (at your option) any later +version (the "License"); you may not use this file except in compliance with the +License. You may obtain a copy of the License at + +https://github.com/tig-foundation/tig-monorepo/tree/main/docs/licenses + +Unless required by applicable law or agreed to in writing, software distributed +under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +CONDITIONS OF ANY KIND, either express or implied. See the License for the specific +language governing permissions and limitations under the License. +*/ +#include +#include + +#define MAX_FLOAT 3.402823466e+38F + +__device__ __forceinline__ float euclidean_distance(const float* __restrict__ a, const float* __restrict__ b, int dims) { + float sum = 0.0f; + float c = 0.0f; + int i; + + for (i = 0; i < dims - 15; i += 16) { + float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3]; + float d4=a[i+4]-b[i+4], d5=a[i+5]-b[i+5], d6=a[i+6]-b[i+6], d7=a[i+7]-b[i+7]; + float d8=a[i+8]-b[i+8], d9=a[i+9]-b[i+9], d10=a[i+10]-b[i+10], d11=a[i+11]-b[i+11]; + float d12=a[i+12]-b[i+12], d13=a[i+13]-b[i+13], d14=a[i+14]-b[i+14], d15=a[i+15]-b[i+15]; + + float s0 = d0*d0 + d1*d1 + d2*d2 + d3*d3; + float s1 = d4*d4 + d5*d5 + d6*d6 + d7*d7; + float s2 = d8*d8 + d9*d9 + d10*d10 + d11*d11; + float s3 = d12*d12 + d13*d13 + d14*d14 + d15*d15; + + float partial = s0 + s1 + s2 + s3; + float y = partial - c; + float t = sum + y; + c = (t - sum) - y; + sum = t; + } + + for (; i < dims - 7; i += 8) { + float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3]; + float d4=a[i+4]-b[i+4], d5=a[i+5]-b[i+5], d6=a[i+6]-b[i+6], d7=a[i+7]-b[i+7]; + float v,y,t; + v=d0*d0; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d1*d1; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d2*d2; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d3*d3; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d4*d4; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d5*d5; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d6*d6; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d7*d7; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + } + + for (; i < dims - 3; i += 4) { + float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3]; + float v,y,t; + v=d0*d0; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d1*d1; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d2*d2; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d3*d3; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + } + + for (; i < dims; i++) { + float diff = a[i] - b[i]; + float squared = diff * diff; + float y = squared - c; + float t = sum + y; + c = (t - sum) - y; + sum = t; + } + return sum; +} + +__device__ __forceinline__ float euclidean_distance_high(const float* __restrict__ a, const float* __restrict__ b, int dims) { + float sum = 0.0f; + float c = 0.0f; + int i; + + for (i = 0; i < dims - 31; i += 32) { + float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3]; + float d4=a[i+4]-b[i+4], d5=a[i+5]-b[i+5], d6=a[i+6]-b[i+6], d7=a[i+7]-b[i+7]; + float d8=a[i+8]-b[i+8], d9=a[i+9]-b[i+9], d10=a[i+10]-b[i+10], d11=a[i+11]-b[i+11]; + float d12=a[i+12]-b[i+12], d13=a[i+13]-b[i+13], d14=a[i+14]-b[i+14], d15=a[i+15]-b[i+15]; + float d16=a[i+16]-b[i+16], d17=a[i+17]-b[i+17], d18=a[i+18]-b[i+18], d19=a[i+19]-b[i+19]; + float d20=a[i+20]-b[i+20], d21=a[i+21]-b[i+21], d22=a[i+22]-b[i+22], d23=a[i+23]-b[i+23]; + float d24=a[i+24]-b[i+24], d25=a[i+25]-b[i+25], d26=a[i+26]-b[i+26], d27=a[i+27]-b[i+27]; + float d28=a[i+28]-b[i+28], d29=a[i+29]-b[i+29], d30=a[i+30]-b[i+30], d31=a[i+31]-b[i+31]; + float v,y,t; + v=d0*d0; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d1*d1; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d2*d2; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d3*d3; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d4*d4; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d5*d5; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d6*d6; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d7*d7; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d8*d8; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d9*d9; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d10*d10; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d11*d11; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d12*d12; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d13*d13; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d14*d14; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d15*d15; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d16*d16; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d17*d17; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d18*d18; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d19*d19; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d20*d20; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d21*d21; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d22*d22; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d23*d23; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d24*d24; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d25*d25; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d26*d26; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d27*d27; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d28*d28; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d29*d29; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d30*d30; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d31*d31; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + } + + for (; i < dims - 15; i += 16) { + float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3]; + float d4=a[i+4]-b[i+4], d5=a[i+5]-b[i+5], d6=a[i+6]-b[i+6], d7=a[i+7]-b[i+7]; + float d8=a[i+8]-b[i+8], d9=a[i+9]-b[i+9], d10=a[i+10]-b[i+10], d11=a[i+11]-b[i+11]; + float d12=a[i+12]-b[i+12], d13=a[i+13]-b[i+13], d14=a[i+14]-b[i+14], d15=a[i+15]-b[i+15]; + + float v,y,t; + v=d0*d0; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d1*d1; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d2*d2; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d3*d3; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d4*d4; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d5*d5; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d6*d6; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d7*d7; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d8*d8; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d9*d9; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d10*d10; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d11*d11; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d12*d12; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d13*d13; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d14*d14; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d15*d15; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + } + + for (; i < dims - 7; i += 8) { + float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3]; + float d4=a[i+4]-b[i+4], d5=a[i+5]-b[i+5], d6=a[i+6]-b[i+6], d7=a[i+7]-b[i+7]; + float v,y,t; + v=d0*d0; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d1*d1; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d2*d2; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d3*d3; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d4*d4; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d5*d5; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d6*d6; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d7*d7; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + } + + for (; i < dims - 3; i += 4) { + float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3]; + float v,y,t; + v=d0*d0; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d1*d1; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d2*d2; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d3*d3; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + } + + for (; i < dims; i++) { + float diff = a[i] - b[i]; + float squared = diff * diff; + float y = squared - c; + float t = sum + y; + c = (t - sum) - y; + sum = t; + } + return sum; +} + +__device__ __forceinline__ float euclidean_distance_bounded(const float* __restrict__ a, const float* __restrict__ b, int dims, float limit) { + float sum = 0.0f; + float c = 0.0f; + float margin = fmaxf(1e-6f, 1.0e-4f * (1.0f + limit)); + int i; + for (i = 0; i < dims - 15; i += 16) { + float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3]; + float d4=a[i+4]-b[i+4], d5=a[i+5]-b[i+5], d6=a[i+6]-b[i+6], d7=a[i+7]-b[i+7]; + float d8=a[i+8]-b[i+8], d9=a[i+9]-b[i+9], d10=a[i+10]-b[i+10], d11=a[i+11]-b[i+11]; + float d12=a[i+12]-b[i+12], d13=a[i+13]-b[i+13], d14=a[i+14]-b[i+14], d15=a[i+15]-b[i+15]; + float s0=d0*d0+d1*d1+d2*d2+d3*d3; + float s1=d4*d4+d5*d5+d6*d6+d7*d7; + float s2=d8*d8+d9*d9+d10*d10+d11*d11; + float s3=d12*d12+d13*d13+d14*d14+d15*d15; + float partial=s0+s1+s2+s3; + float y=partial-c; + float t=sum+y; + c=(t-sum)-y; + sum=t; + if (sum > limit + margin) return sum; + } + for (; i < dims - 7; i += 8) { + float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3]; + float d4=a[i+4]-b[i+4], d5=a[i+5]-b[i+5], d6=a[i+6]-b[i+6], d7=a[i+7]-b[i+7]; + float v,y,t; + v=d0*d0; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d1*d1; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d2*d2; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d3*d3; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d4*d4; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d5*d5; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d6*d6; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d7*d7; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + if (sum > limit + margin) return sum; + } + for (; i < dims - 3; i += 4) { + float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3]; + float v,y,t; + v=d0*d0; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d1*d1; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d2*d2; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d3*d3; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + if (sum > limit + margin) return sum; + } + for (; i < dims; i++) { + float diff=a[i]-b[i]; + float squared=diff*diff; + float y=squared-c; + float t=sum+y; + c=(t-sum)-y; + sum=t; + if (sum > limit + margin) return sum; + } + return sum; +} + +__device__ __forceinline__ float euclidean_distance_high_bounded(const float* __restrict__ a, const float* __restrict__ b, int dims, float limit) { + float sum=0.0f; + float c=0.0f; + float margin = fmaxf(1e-6f, 1.0e-4f * (1.0f + limit)); + int i; + for (i=0;i limit + margin) return sum; + } + for (; i < dims - 15; i += 16) { + float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3]; + float d4=a[i+4]-b[i+4], d5=a[i+5]-b[i+5], d6=a[i+6]-b[i+6], d7=a[i+7]-b[i+7]; + float d8=a[i+8]-b[i+8], d9=a[i+9]-b[i+9], d10=a[i+10]-b[i+10], d11=a[i+11]-b[i+11]; + float d12=a[i+12]-b[i+12], d13=a[i+13]-b[i+13], d14=a[i+14]-b[i+14], d15=a[i+15]-b[i+15]; + + float v,y,t; + v=d0*d0; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d1*d1; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d2*d2; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d3*d3; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d4*d4; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d5*d5; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d6*d6; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d7*d7; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d8*d8; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d9*d9; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d10*d10; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d11*d11; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d12*d12; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d13*d13; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d14*d14; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d15*d15; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + if (sum > limit + margin) return sum; + } + for (; i < dims - 7; i += 8) { + float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3]; + float d4=a[i+4]-b[i+4], d5=a[i+5]-b[i+5], d6=a[i+6]-b[i+6], d7=a[i+7]-b[i+7]; + float v,y,t; + v=d0*d0; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d1*d1; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d2*d2; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d3*d3; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d4*d4; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d5*d5; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d6*d6; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d7*d7; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + if (sum > limit + margin) return sum; + } + for (; i < dims - 3; i += 4) { + float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3]; + float v,y,t; + v=d0*d0; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d1*d1; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d2*d2; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + v=d3*d3; y=v-c; t=sum+y; c=(t-sum)-y; sum=t; + if (sum > limit + margin) return sum; + } + for (; i < dims; i++) { + float diff=a[i]-b[i]; + float squared=diff*diff; + float y=squared-c; + float t=sum+y; + c=(t-sum)-y; + sum=t; + if (sum > limit + margin) return sum; + } + return sum; +} + +__device__ __forceinline__ float euclidean_distance_precise_bounded(const float* __restrict__ a, const float* __restrict__ b, int dims, float limit) { + double acc = 0.0; + double lim = (double)limit; + for (int i = 0; i < dims; i++) { + double d = (double)a[i] - (double)b[i]; + acc += d * d; + if (acc > lim) return (float)acc; + } + return (float)acc; +} + +extern "C" __global__ void deterministic_clustering( + const float* __restrict__ database_vectors, + float* __restrict__ cluster_centers, + int* __restrict__ cluster_assignments, + int* __restrict__ cluster_sizes, + int database_size, + int vector_dims, + int num_clusters, + int num_queries +) { + int cluster_idx = blockIdx.x; + int tid = threadIdx.x; + + if (cluster_idx >= num_clusters) return; + + long long seed_idx = ((long long)cluster_idx * 982451653LL + 1566083941LL) % (long long)database_size; + int stride = max(1, database_size / (num_clusters * 37)); + long long start_idx = seed_idx; + + for (int d = tid; d < vector_dims; d += blockDim.x) { + float acc = 0.0f; + long long idx = start_idx; + #pragma unroll + for (int k = 0; k < 4; ++k) { + int pos = (int)(idx % (long long)database_size); + acc += database_vectors[pos * vector_dims + d]; + idx += stride; + } + cluster_centers[cluster_idx * vector_dims + d] = acc * 0.25f; + } + + if (tid == 0) { + cluster_sizes[cluster_idx] = 0; + } +} + +extern "C" __global__ void assign_clusters( + const float* __restrict__ database_vectors, + const float* __restrict__ cluster_centers, + int* __restrict__ cluster_assignments, + int* __restrict__ cluster_sizes, + int database_size, + int vector_dims, + int num_clusters, + int num_queries +) { + int thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const bool use_high = (num_queries > 3000) || (vector_dims >= 700); + if (thread_id < database_size) { + int vec_idx = thread_id; + const float* vector = database_vectors + vec_idx * vector_dims; + float min_dist = MAX_FLOAT; + int best_cluster = 0; + for (int c = 0; c < num_clusters; c++) { + const float* c_center = cluster_centers + c * vector_dims; + float dist = use_high ? euclidean_distance_high(vector, c_center, vector_dims) + : euclidean_distance(vector, c_center, vector_dims); + if (dist < min_dist) { + min_dist = dist; + best_cluster = c; + } + } + cluster_assignments[vec_idx] = best_cluster; + } +} + +extern "C" __global__ void exclusive_scan_sizes( + const int* cluster_sizes, + int* cluster_offsets, + int* write_offsets, + int num_clusters +) { + if (blockIdx.x == 0 && threadIdx.x == 0) { + int acc = 0; + for (int i = 0; i < num_clusters; i++) { + cluster_offsets[i] = acc; + write_offsets[i] = acc; + acc += cluster_sizes[i]; + } + } +} + +extern "C" __global__ void build_cluster_index( + const int* cluster_assignments, + int* write_offsets, + int* cluster_indices, + int database_size +) { + if (blockIdx.x == 0 && threadIdx.x == 0) { + for (int vec_idx = 0; vec_idx < database_size; vec_idx++) { + int cluster = cluster_assignments[vec_idx]; + int pos = write_offsets[cluster]; + cluster_indices[pos] = vec_idx; + write_offsets[cluster]++; + } + } +} + +extern "C" __global__ void count_block_cluster_sizes( + const int* cluster_assignments, + int* block_counts, + int database_size, + int num_clusters +) { + extern __shared__ int sdata[]; + int tid = threadIdx.x; + int block = blockIdx.x; + int base = block * blockDim.x; + int vec_idx = base + tid; + __shared__ int s_len; + if (tid == 0) { + int rem = database_size - base; + s_len = rem > blockDim.x ? blockDim.x : (rem > 0 ? rem : 0); + } + __syncthreads(); + if (s_len == 0) { + if (tid == 0) { + for (int c = 0; c < num_clusters; c++) { + block_counts[block * num_clusters + c] = 0; + } + } + return; + } + int cid = -1; + if (tid < s_len) cid = cluster_assignments[vec_idx]; + + for (int c = 0; c < num_clusters; c++) { + int* buf = sdata + c * blockDim.x; + if (tid < s_len) { + buf[tid] = (cid == c) ? 1 : 0; + } else if (tid < blockDim.x) { + buf[tid] = 0; + } + } + __syncthreads(); + + for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) { + if (tid < stride) { + int limit = (tid + stride < s_len) ? 1 : 0; + if (limit) { + for (int c = 0; c < num_clusters; c++) { + int* buf = sdata + c * blockDim.x; + buf[tid] += buf[tid + stride]; + } + } + } + __syncthreads(); + } + + if (tid == 0) { + for (int c = 0; c < num_clusters; c++) { + int* buf = sdata + c * blockDim.x; + block_counts[block * num_clusters + c] = buf[0]; + } + } +} + +extern "C" __global__ void exclusive_scan_block_counts( + const int* cluster_offsets, + const int* block_counts, + int* block_offsets, + int num_blocks, + int num_clusters +) { + if (blockIdx.x == 0 && threadIdx.x == 0) { + for (int c = 0; c < num_clusters; c++) { + int acc = cluster_offsets[c]; + for (int b = 0; b < num_blocks; b++) { + block_offsets[b * num_clusters + c] = acc; + acc += block_counts[b * num_clusters + c]; + } + } + } +} + +extern "C" __global__ void reduce_block_counts( + const int* block_counts, + int* cluster_sizes, + int num_blocks, + int num_clusters +) { + if (blockIdx.x == 0 && threadIdx.x == 0) { + for (int c = 0; c < num_clusters; c++) { + int acc = 0; + for (int b = 0; b < num_blocks; b++) { + acc += block_counts[b * num_clusters + c]; + } + cluster_sizes[c] = acc; + } + } +} + +extern "C" __global__ void parallel_build_cluster_index( + const int* cluster_assignments, + const int* block_offsets, + int* cluster_indices, + int database_size, + int num_clusters +) { + extern __shared__ int sdata[]; + int tid = threadIdx.x; + int block = blockIdx.x; + int base = block * blockDim.x; + int vec_idx = base + tid; + __shared__ int s_len; + if (tid == 0) { + int rem = database_size - base; + s_len = rem > blockDim.x ? blockDim.x : (rem > 0 ? rem : 0); + } + __syncthreads(); + if (s_len == 0) return; + int cid = -1; + if (tid < s_len) cid = cluster_assignments[vec_idx]; + for (int c = 0; c < num_clusters; c++) { + int* flags = sdata + c * blockDim.x; + if (tid < s_len) flags[tid] = (cid == c) ? 1 : 0; + else if (tid < blockDim.x) flags[tid] = 0; + } + __syncthreads(); + for (int c = 0; c < num_clusters; c++) { + int* flags = sdata + c * blockDim.x; + for (int offset = 1; offset < s_len; offset <<= 1) { + int v = 0; + if (tid >= offset && tid < s_len) v = flags[tid - offset]; + __syncthreads(); + if (tid < s_len) flags[tid] += v; + __syncthreads(); + } + if (tid < s_len && cid == c) { + int local_rank = flags[tid] - 1; + int base_off = block_offsets[block * num_clusters + c]; + cluster_indices[base_off + local_rank] = vec_idx; + } + __syncthreads(); + } +} + +extern "C" __global__ void cluster_search( + const float* __restrict__ query_vectors, + const float* __restrict__ database_vectors, + const float* __restrict__ cluster_centers, + const int* __restrict__ cluster_assignments, + const int* __restrict__ cluster_sizes, + const int* __restrict__ cluster_indices, + const int* __restrict__ cluster_offsets, + int* __restrict__ results, + int num_queries, + int database_size, + int vector_dims, + int num_clusters +) { + if (num_queries <= 3000) { + int query_idx = blockIdx.x; + if (query_idx >= num_queries) return; + + const float* query = query_vectors + query_idx * vector_dims; + + float cluster_dists[16]; + int cluster_order[16]; + + for (int cluster = 0; cluster < num_clusters; cluster++) { + const float* center = cluster_centers + cluster * vector_dims; + cluster_dists[cluster] = euclidean_distance(query, center, vector_dims); + cluster_order[cluster] = cluster; + } + + int clusters_to_search = (num_queries <= 1000) ? num_clusters : + (num_queries <= 2000) ? min(num_clusters, (num_clusters * 3) / 4) : + (num_queries <= 2800) ? min(num_clusters, (num_clusters * 2) / 3) : + min(num_clusters, max(2, num_clusters / 2)); + if (vector_dims >= 700) { + int target = max(3, clusters_to_search); + clusters_to_search = min(num_clusters, target); + } + for (int i = 0; i < clusters_to_search; i++) { + int best = i; + for (int j = i + 1; j < num_clusters; j++) { + if (cluster_dists[cluster_order[j]] < cluster_dists[cluster_order[best]]) { + best = j; + } + } + int temp = cluster_order[i]; + cluster_order[i] = cluster_order[best]; + cluster_order[best] = temp; + } + + float min_dist = MAX_FLOAT; + int best_idx = -1; + + for (int c_idx = 0; c_idx < clusters_to_search; c_idx++) { + int target_cluster = cluster_order[c_idx]; + if (cluster_sizes[target_cluster] <= 0) continue; + + int start = cluster_offsets[target_cluster]; + int end = start + cluster_sizes[target_cluster]; + for (int p = start; p < end; p++) { + int vec_idx = cluster_indices[p]; + const float* db_vector = database_vectors + vec_idx * vector_dims; + float dist = euclidean_distance_bounded(query, db_vector, vector_dims, min_dist); + if (dist < min_dist) { + min_dist = dist; + best_idx = vec_idx; + } else if (vector_dims >= 720 && num_queries <= 5000 && best_idx != -1 && dist <= min_dist + 0.0015f) { + float d2 = euclidean_distance_precise_bounded(query, db_vector, vector_dims, min_dist); + if (d2 < min_dist) { + min_dist = d2; + best_idx = vec_idx; + } + } + } + } + + if (min_dist == MAX_FLOAT) { + int base_stride = max(5, database_size / 2000); + int max_checks = min(database_size / base_stride, 2000); + + for (int phase = 0; phase < 2; phase++) { + int offset = phase * (base_stride / 2); + for (int i = 0; i < max_checks / 2; i++) { + int db_idx = (offset + i * base_stride) % database_size; + + const float* db_vector = database_vectors + db_idx * vector_dims; + float dist = euclidean_distance_bounded(query, db_vector, vector_dims, min_dist); + if (dist < min_dist) { + min_dist = dist; + best_idx = db_idx; + } + } + } + + if (best_idx != -1) { + int radius = min(25, base_stride); + int start_local = max(0, best_idx - radius); + int end_local = min(database_size, best_idx + radius + 1); + + for (int i = start_local; i < end_local; i++) { + if (i == best_idx) continue; + const float* db_vector = database_vectors + i * vector_dims; + float dist = euclidean_distance_bounded(query, db_vector, vector_dims, min_dist); + if (dist < min_dist) { + min_dist = dist; + best_idx = i; + } + } + } + } + + if (min_dist == MAX_FLOAT) { + best_idx = 0; + } + + results[query_idx] = best_idx; + } else { + int query_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (query_idx >= num_queries) return; + + const float* query = query_vectors + query_idx * vector_dims; + + float cluster_dists[16]; + int cluster_order[16]; + + for (int cluster = 0; cluster < num_clusters; cluster++) { + const float* center = cluster_centers + cluster * vector_dims; + cluster_dists[cluster] = euclidean_distance_high(query, center, vector_dims); + cluster_order[cluster] = cluster; + } + + int clusters_to_search = (num_queries <= 3500) ? min(num_clusters, 5) : + (num_queries <= 6000) ? min(num_clusters, 4) : + (num_queries <= 8000) ? min(num_clusters, 3) : + 2; + if (num_queries <= 5000 && vector_dims >= 720) { + clusters_to_search = num_clusters; + } else if (vector_dims >= 720) { + clusters_to_search = num_clusters; + } else if (vector_dims >= 700) { + clusters_to_search = max(clusters_to_search, min(num_clusters, (num_clusters * 3) / 4 + 1)); + } + for (int i = 0; i < clusters_to_search; i++) { + int best = i; + for (int j = i + 1; j < num_clusters; j++) { + if (cluster_dists[cluster_order[j]] < cluster_dists[cluster_order[best]]) { + best = j; + } + } + int temp = cluster_order[i]; + cluster_order[i] = cluster_order[best]; + cluster_order[best] = temp; + } + + float min_dist = MAX_FLOAT; + int best_idx = -1; + + for (int c_idx = 0; c_idx < clusters_to_search; c_idx++) { + int target_cluster = cluster_order[c_idx]; + if (cluster_sizes[target_cluster] <= 0) continue; + + int start = cluster_offsets[target_cluster]; + int end = start + cluster_sizes[target_cluster]; + for (int p = start; p < end; p++) { + int vec_idx = cluster_indices[p]; + const float* db_vector = database_vectors + vec_idx * vector_dims; + float dist = euclidean_distance_high_bounded(query, db_vector, vector_dims, min_dist); + if (vector_dims >= 720 && num_queries <= 5000 && dist <= min_dist + 0.0015f) { + float d2 = euclidean_distance_precise_bounded(query, db_vector, vector_dims, min_dist); + if (d2 < dist) dist = d2; + } + if (dist < min_dist) { + min_dist = dist; + best_idx = vec_idx; + } + } + } + + if (min_dist == MAX_FLOAT) { + int base_stride = (vector_dims >= 720) ? max(7, database_size / 900) : max(9, database_size / 1200); + int max_checks = min(database_size / base_stride, (vector_dims >= 720) ? 1600 : 1200); + + for (int phase = 0; phase < 2; phase++) { + int offset = phase * (base_stride / 3); + int phase_checks = max_checks / 2; + + for (int i = 0; i < phase_checks; i++) { + int db_idx = (offset + i * base_stride) % database_size; + + const float* db_vector = database_vectors + db_idx * vector_dims; + float dist = euclidean_distance_high_bounded(query, db_vector, vector_dims, min_dist); + if (dist < min_dist) { + min_dist = dist; + best_idx = db_idx; + } + } + } + + if (best_idx != -1) { + int radius = (vector_dims >= 720) ? min(32, (base_stride * 2) / 3) : min(18, base_stride / 2); + int start_local = max(0, best_idx - radius); + int end_local = min(database_size, best_idx + radius + 1); + + for (int i = start_local; i < end_local; i++) { + if (i == best_idx) continue; + const float* db_vector = database_vectors + i * vector_dims; + float dist = euclidean_distance_high_bounded(query, db_vector, vector_dims, min_dist); + if (dist < min_dist) { + min_dist = dist; + best_idx = i; + } + } + } + } + + if (min_dist == MAX_FLOAT) { + best_idx = 0; + } + + results[query_idx] = best_idx; + } +} diff --git a/tig-algorithms/src/vector_search/cluster_improved/mod.rs b/tig-algorithms/src/vector_search/cluster_improved/mod.rs new file mode 100644 index 0000000..1fa0257 --- /dev/null +++ b/tig-algorithms/src/vector_search/cluster_improved/mod.rs @@ -0,0 +1,230 @@ +use cudarc::{ + driver::{safe::LaunchConfig, CudaModule, CudaStream, PushKernelArg}, + runtime::sys::cudaDeviceProp, +}; +use std::sync::Arc; +use serde_json::{Map, Value}; +use tig_challenges::vector_search::*; + +pub fn solve_challenge( + challenge: &Challenge, + save_solution: &dyn Fn(&Solution) -> anyhow::Result<()>, + hyperparameters: &Option>, + module: Arc, + stream: Arc, + _prop: &cudaDeviceProp, +) -> anyhow::Result<()> { + let vector_dims = challenge.vector_dims as i32; + let database_size = challenge.database_size as i32; + let num_queries = challenge.difficulty.num_queries as i32; + + let block_size: u32 = 128; + fn calculate_optimal_clusters(num_queries: i32, database_size: i32, vector_dims: i32) -> i32 { + let base_clusters = if num_queries <= 3000 { + 3 + } else if num_queries <= 5000 { + if vector_dims >= 720 { 6 } else { 4 } + } else if num_queries <= 7000 { + 5 + } else if num_queries <= 9000 { + 6 + } else if num_queries <= 11000 { + 7 + } else if num_queries <= 15000 { + 8 + } else if num_queries <= 20000 { + 9 + } else { + ((database_size as f32).sqrt() / 1000.0).max(8.0).min(12.0) as i32 + }; + + let memory_factor = if vector_dims > 1000 { 0.8 } else { 1.0 }; + ((base_clusters as f32 * memory_factor) as i32).max(3).min(12) + } + + let num_clusters = calculate_optimal_clusters(num_queries, database_size, vector_dims); + + let deterministic_clustering = module.load_function("deterministic_clustering")?; + let assign_clusters = module.load_function("assign_clusters")?; + + let cluster_search = module.load_function("cluster_search")?; + let exclusive_scan_sizes = module.load_function("exclusive_scan_sizes")?; + let count_block_cluster_sizes = module.load_function("count_block_cluster_sizes")?; + let reduce_block_counts = module.load_function("reduce_block_counts")?; + let exclusive_scan_block_counts = module.load_function("exclusive_scan_block_counts")?; + let parallel_build_cluster_index = module.load_function("parallel_build_cluster_index")?; + + let mut d_cluster_centers = stream.alloc_zeros::((num_clusters * vector_dims) as usize)?; + let mut d_cluster_assignments = stream.alloc_zeros::(database_size as usize)?; + let mut d_cluster_sizes = stream.alloc_zeros::(num_clusters as usize)?; + + let cluster_config = LaunchConfig { + grid_dim: (num_clusters as u32, 1, 1), + block_dim: (block_size, 1, 1), + shared_mem_bytes: 0, + }; + + unsafe { + stream + .launch_builder(&deterministic_clustering) + .arg(&challenge.d_database_vectors) + .arg(&mut d_cluster_centers) + .arg(&mut d_cluster_assignments) + .arg(&mut d_cluster_sizes) + .arg(&database_size) + .arg(&vector_dims) + .arg(&num_clusters) + .arg(&num_queries) + .launch(cluster_config)?; + } + + let assign_threads: u32 = 256; + let assign_blocks: u32 = ((database_size as u32) + assign_threads - 1) / assign_threads; + let assign_config = LaunchConfig { + grid_dim: (assign_blocks, 1, 1), + block_dim: (assign_threads, 1, 1), + shared_mem_bytes: 0, + }; + unsafe { + stream + .launch_builder(&assign_clusters) + .arg(&challenge.d_database_vectors) + .arg(&d_cluster_centers) + .arg(&mut d_cluster_assignments) + .arg(&mut d_cluster_sizes) + .arg(&database_size) + .arg(&vector_dims) + .arg(&num_clusters) + .arg(&num_queries) + .launch(assign_config)?; + } + + let mut d_cluster_offsets = stream.alloc_zeros::(num_clusters as usize)?; + let mut d_write_offsets = stream.alloc_zeros::(num_clusters as usize)?; + let scan_config = LaunchConfig { + grid_dim: (1, 1, 1), + block_dim: (1, 1, 1), + shared_mem_bytes: 0, + }; + {} + + let mut d_cluster_indices = stream.alloc_zeros::(database_size as usize)?; + let db_u32 = database_size as u32; + let fill_blocks: u32 = (db_u32 + block_size - 1) / block_size; + let fill_blocks_i32: i32 = fill_blocks as i32; + + let mut d_block_counts = stream.alloc_zeros::((fill_blocks as usize) * (num_clusters as usize))?; + let mut d_block_offsets = stream.alloc_zeros::((fill_blocks as usize) * (num_clusters as usize))?; + + let count_config = LaunchConfig { + grid_dim: (fill_blocks, 1, 1), + block_dim: (block_size, 1, 1), + shared_mem_bytes: ((num_clusters as u32) * block_size * 4) as u32, + }; + unsafe { + stream + .launch_builder(&count_block_cluster_sizes) + .arg(&d_cluster_assignments) + .arg(&mut d_block_counts) + .arg(&database_size) + .arg(&num_clusters) + .launch(count_config)?; + } + + let scan_blocks_config = LaunchConfig { + grid_dim: (1, 1, 1), + block_dim: (1, 1, 1), + shared_mem_bytes: 0, + }; + unsafe { + stream + .launch_builder(&reduce_block_counts) + .arg(&d_block_counts) + .arg(&mut d_cluster_sizes) + .arg(&fill_blocks_i32) + .arg(&num_clusters) + .launch(scan_blocks_config)?; + } + unsafe { + stream + .launch_builder(&exclusive_scan_sizes) + .arg(&d_cluster_sizes) + .arg(&mut d_cluster_offsets) + .arg(&mut d_write_offsets) + .arg(&num_clusters) + .launch(scan_config)?; + } + unsafe { + stream + .launch_builder(&exclusive_scan_block_counts) + .arg(&d_cluster_offsets) + .arg(&d_block_counts) + .arg(&mut d_block_offsets) + .arg(&fill_blocks_i32) + .arg(&num_clusters) + .launch(scan_blocks_config)?; + } + + let build_config = LaunchConfig { + grid_dim: (fill_blocks, 1, 1), + block_dim: (block_size, 1, 1), + shared_mem_bytes: ((num_clusters as u32) * block_size * 4) as u32, + }; + unsafe { + stream + .launch_builder(¶llel_build_cluster_index) + .arg(&d_cluster_assignments) + .arg(&d_block_offsets) + .arg(&mut d_cluster_indices) + .arg(&database_size) + .arg(&num_clusters) + .launch(build_config)?; + } + + let mut d_results = stream.alloc_zeros::(num_queries as usize)?; + + let search_config = if num_queries <= 3000 { + LaunchConfig { + grid_dim: (num_queries as u32, 1, 1), + block_dim: (1, 1, 1), + shared_mem_bytes: 0, + } + } else { + let threads_per_block = if vector_dims >= 720 { + 256 + } else { + 128 + }; + let blocks = ((num_queries as u32) + threads_per_block - 1) / threads_per_block; + LaunchConfig { + grid_dim: (blocks.min(2048), 1, 1), + block_dim: (threads_per_block, 1, 1), + shared_mem_bytes: 0, + } + }; + + unsafe { + stream + .launch_builder(&cluster_search) + .arg(&challenge.d_query_vectors) + .arg(&challenge.d_database_vectors) + .arg(&d_cluster_centers) + .arg(&d_cluster_assignments) + .arg(&d_cluster_sizes) + .arg(&d_cluster_indices) + .arg(&d_cluster_offsets) + .arg(&mut d_results) + .arg(&num_queries) + .arg(&database_size) + .arg(&vector_dims) + .arg(&num_clusters) + .launch(search_config)?; + } + stream.synchronize()?; + + let indices: Vec = stream.memcpy_dtov(&d_results)?; + let indexes = indices.iter().map(|&idx| idx as usize).collect(); + + let _ = save_solution(&Solution { indexes }); + return Ok(()); +} diff --git a/tig-algorithms/src/vector_search/mod.rs b/tig-algorithms/src/vector_search/mod.rs index ae3472a..3890349 100644 --- a/tig-algorithms/src/vector_search/mod.rs +++ b/tig-algorithms/src/vector_search/mod.rs @@ -142,7 +142,8 @@ // c004_a072 -// c004_a073 +pub mod cluster_improved; +pub use cluster_improved as c004_a073; // c004_a074