Submitted vector_search/cluster_improved

2026-02-21 15:37:44 +08:00 · 2025-10-16 12:17:46 +01:00 · 2025-10-16 12:17:46 +01:00 · d51eb6daac
commit d51eb6daac
parent a2fbd8c435
4 changed files with 1076 additions and 1 deletions
--- a/tig-algorithms/src/vector_search/cluster_improved/README.md
+++ b/tig-algorithms/src/vector_search/cluster_improved/README.md
@ -0,0 +1,23 @@
+# TIG Code Submission
+
+    ## Submission Details
+
+    * **Challenge Name:** vector_search
+    * **Submission Name:** cluster_improved
+    * **Copyright:** 2025 Rootz
+    * **Identity of Submitter:** Rootz
+    * **Identity of Creator of Algorithmic Method:** null
+    * **Unique Algorithm Identifier (UAI):** null
+
+    ## License
+
+    The files in this folder are under the following licenses:
+    * TIG Benchmarker Outbound License
+    * TIG Commercial License
+    * TIG Inbound Game License
+    * TIG Innovator Outbound Game License
+    * TIG Open Data License
+    * TIG THV Game License
+
+    Copies of the licenses can be obtained at:  
+    https://github.com/tig-foundation/tig-monorepo/tree/main/docs/licenses
--- a/tig-algorithms/src/vector_search/cluster_improved/kernels.cu
+++ b/tig-algorithms/src/vector_search/cluster_improved/kernels.cu
@ -0,0 +1,821 @@
+/*!Copyright 2025 Rootz
+
+Identity of Submitter Rootz
+
+UAI null
+
+Licensed under the TIG Inbound Game License v2.0 or (at your option) any later
+version (the "License"); you may not use this file except in compliance with the
+License. You may obtain a copy of the License at
+
+https://github.com/tig-foundation/tig-monorepo/tree/main/docs/licenses
+
+Unless required by applicable law or agreed to in writing, software distributed
+under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+CONDITIONS OF ANY KIND, either express or implied. See the License for the specific
+language governing permissions and limitations under the License.
+*/
+#include <cuda_runtime.h>
+#include <float.h>
+
+#define MAX_FLOAT 3.402823466e+38F
+
+__device__ __forceinline__ float euclidean_distance(const float* __restrict__ a, const float* __restrict__ b, int dims) {
+    float sum = 0.0f;
+    float c = 0.0f;
+    int i;
+    
+    for (i = 0; i < dims - 15; i += 16) {
+        float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3];
+        float d4=a[i+4]-b[i+4], d5=a[i+5]-b[i+5], d6=a[i+6]-b[i+6], d7=a[i+7]-b[i+7];
+        float d8=a[i+8]-b[i+8], d9=a[i+9]-b[i+9], d10=a[i+10]-b[i+10], d11=a[i+11]-b[i+11];
+        float d12=a[i+12]-b[i+12], d13=a[i+13]-b[i+13], d14=a[i+14]-b[i+14], d15=a[i+15]-b[i+15];
+
+        float s0 = d0*d0 + d1*d1 + d2*d2 + d3*d3;
+        float s1 = d4*d4 + d5*d5 + d6*d6 + d7*d7;
+        float s2 = d8*d8 + d9*d9 + d10*d10 + d11*d11;
+        float s3 = d12*d12 + d13*d13 + d14*d14 + d15*d15;
+
+        float partial = s0 + s1 + s2 + s3;
+        float y = partial - c;
+        float t = sum + y;
+        c = (t - sum) - y;
+        sum = t;
+    }
+
+    for (; i < dims - 7; i += 8) {
+        float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3];
+        float d4=a[i+4]-b[i+4], d5=a[i+5]-b[i+5], d6=a[i+6]-b[i+6], d7=a[i+7]-b[i+7];
+        float v,y,t;
+        v=d0*d0; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d1*d1; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d2*d2; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d3*d3; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d4*d4; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d5*d5; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d6*d6; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d7*d7; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+    }
+
+    for (; i < dims - 3; i += 4) {
+        float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3];
+        float v,y,t;
+        v=d0*d0; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d1*d1; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d2*d2; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d3*d3; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+    }
+
+    for (; i < dims; i++) {
+        float diff = a[i] - b[i];
+        float squared = diff * diff;
+        float y = squared - c;
+        float t = sum + y;
+        c = (t - sum) - y;
+        sum = t;
+    }
+    return sum;
+}
+
+__device__ __forceinline__ float euclidean_distance_high(const float* __restrict__ a, const float* __restrict__ b, int dims) {
+    float sum = 0.0f;
+    float c = 0.0f;
+    int i;
+
+    for (i = 0; i < dims - 31; i += 32) {
+        float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3];
+        float d4=a[i+4]-b[i+4], d5=a[i+5]-b[i+5], d6=a[i+6]-b[i+6], d7=a[i+7]-b[i+7];
+        float d8=a[i+8]-b[i+8], d9=a[i+9]-b[i+9], d10=a[i+10]-b[i+10], d11=a[i+11]-b[i+11];
+        float d12=a[i+12]-b[i+12], d13=a[i+13]-b[i+13], d14=a[i+14]-b[i+14], d15=a[i+15]-b[i+15];
+        float d16=a[i+16]-b[i+16], d17=a[i+17]-b[i+17], d18=a[i+18]-b[i+18], d19=a[i+19]-b[i+19];
+        float d20=a[i+20]-b[i+20], d21=a[i+21]-b[i+21], d22=a[i+22]-b[i+22], d23=a[i+23]-b[i+23];
+        float d24=a[i+24]-b[i+24], d25=a[i+25]-b[i+25], d26=a[i+26]-b[i+26], d27=a[i+27]-b[i+27];
+        float d28=a[i+28]-b[i+28], d29=a[i+29]-b[i+29], d30=a[i+30]-b[i+30], d31=a[i+31]-b[i+31];
+        float v,y,t;
+        v=d0*d0; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d1*d1; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d2*d2; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d3*d3; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d4*d4; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d5*d5; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d6*d6; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d7*d7; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d8*d8; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d9*d9; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d10*d10; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d11*d11; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d12*d12; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d13*d13; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d14*d14; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d15*d15; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d16*d16; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d17*d17; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d18*d18; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d19*d19; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d20*d20; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d21*d21; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d22*d22; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d23*d23; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d24*d24; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d25*d25; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d26*d26; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d27*d27; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d28*d28; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d29*d29; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d30*d30; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d31*d31; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+    }
+
+    for (; i < dims - 15; i += 16) {
+        float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3];
+        float d4=a[i+4]-b[i+4], d5=a[i+5]-b[i+5], d6=a[i+6]-b[i+6], d7=a[i+7]-b[i+7];
+        float d8=a[i+8]-b[i+8], d9=a[i+9]-b[i+9], d10=a[i+10]-b[i+10], d11=a[i+11]-b[i+11];
+        float d12=a[i+12]-b[i+12], d13=a[i+13]-b[i+13], d14=a[i+14]-b[i+14], d15=a[i+15]-b[i+15];
+
+        float v,y,t;
+        v=d0*d0; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d1*d1; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d2*d2; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d3*d3; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d4*d4; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d5*d5; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d6*d6; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d7*d7; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d8*d8; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d9*d9; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d10*d10; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d11*d11; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d12*d12; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d13*d13; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d14*d14; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d15*d15; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+    }
+
+    for (; i < dims - 7; i += 8) {
+        float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3];
+        float d4=a[i+4]-b[i+4], d5=a[i+5]-b[i+5], d6=a[i+6]-b[i+6], d7=a[i+7]-b[i+7];
+        float v,y,t;
+        v=d0*d0; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d1*d1; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d2*d2; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d3*d3; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d4*d4; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d5*d5; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d6*d6; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d7*d7; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+    }
+
+    for (; i < dims - 3; i += 4) {
+        float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3];
+        float v,y,t;
+        v=d0*d0; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d1*d1; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d2*d2; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d3*d3; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+    }
+
+    for (; i < dims; i++) {
+        float diff = a[i] - b[i];
+        float squared = diff * diff;
+        float y = squared - c;
+        float t = sum + y;
+        c = (t - sum) - y;
+        sum = t;
+    }
+    return sum;
+}
+
+__device__ __forceinline__ float euclidean_distance_bounded(const float* __restrict__ a, const float* __restrict__ b, int dims, float limit) {
+    float sum = 0.0f;
+    float c = 0.0f;
+    float margin = fmaxf(1e-6f, 1.0e-4f * (1.0f + limit));
+    int i;
+    for (i = 0; i < dims - 15; i += 16) {
+        float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3];
+        float d4=a[i+4]-b[i+4], d5=a[i+5]-b[i+5], d6=a[i+6]-b[i+6], d7=a[i+7]-b[i+7];
+        float d8=a[i+8]-b[i+8], d9=a[i+9]-b[i+9], d10=a[i+10]-b[i+10], d11=a[i+11]-b[i+11];
+        float d12=a[i+12]-b[i+12], d13=a[i+13]-b[i+13], d14=a[i+14]-b[i+14], d15=a[i+15]-b[i+15];
+        float s0=d0*d0+d1*d1+d2*d2+d3*d3;
+        float s1=d4*d4+d5*d5+d6*d6+d7*d7;
+        float s2=d8*d8+d9*d9+d10*d10+d11*d11;
+        float s3=d12*d12+d13*d13+d14*d14+d15*d15;
+        float partial=s0+s1+s2+s3;
+        float y=partial-c;
+        float t=sum+y;
+        c=(t-sum)-y;
+        sum=t;
+        if (sum > limit + margin) return sum;
+    }
+    for (; i < dims - 7; i += 8) {
+        float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3];
+        float d4=a[i+4]-b[i+4], d5=a[i+5]-b[i+5], d6=a[i+6]-b[i+6], d7=a[i+7]-b[i+7];
+        float v,y,t;
+        v=d0*d0; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d1*d1; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d2*d2; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d3*d3; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d4*d4; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d5*d5; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d6*d6; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d7*d7; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        if (sum > limit + margin) return sum;
+    }
+    for (; i < dims - 3; i += 4) {
+        float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3];
+        float v,y,t;
+        v=d0*d0; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d1*d1; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d2*d2; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d3*d3; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        if (sum > limit + margin) return sum;
+    }
+    for (; i < dims; i++) {
+        float diff=a[i]-b[i];
+        float squared=diff*diff;
+        float y=squared-c;
+        float t=sum+y;
+        c=(t-sum)-y;
+        sum=t;
+        if (sum > limit + margin) return sum;
+    }
+    return sum;
+}
+
+__device__ __forceinline__ float euclidean_distance_high_bounded(const float* __restrict__ a, const float* __restrict__ b, int dims, float limit) {
+    float sum=0.0f;
+    float c=0.0f;
+    float margin = fmaxf(1e-6f, 1.0e-4f * (1.0f + limit));
+    int i;
+    for (i=0;i<dims-31;i+=32){
+        float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3];
+        float d4=a[i+4]-b[i+4], d5=a[i+5]-b[i+5], d6=a[i+6]-b[i+6], d7=a[i+7]-b[i+7];
+        float d8=a[i+8]-b[i+8], d9=a[i+9]-b[i+9], d10=a[i+10]-b[i+10], d11=a[i+11]-b[i+11];
+        float d12=a[i+12]-b[i+12], d13=a[i+13]-b[i+13], d14=a[i+14]-b[i+14], d15=a[i+15]-b[i+15];
+        float d16=a[i+16]-b[i+16], d17=a[i+17]-b[i+17], d18=a[i+18]-b[i+18], d19=a[i+19]-b[i+19];
+        float d20=a[i+20]-b[i+20], d21=a[i+21]-b[i+21], d22=a[i+22]-b[i+22], d23=a[i+23]-b[i+23];
+        float d24=a[i+24]-b[i+24], d25=a[i+25]-b[i+25], d26=a[i+26]-b[i+26], d27=a[i+27]-b[i+27];
+        float d28=a[i+28]-b[i+28], d29=a[i+29]-b[i+29], d30=a[i+30]-b[i+30], d31=a[i+31]-b[i+31];
+        float v,y,t;
+        v=d0*d0; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d1*d1; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d2*d2; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d3*d3; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d4*d4; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d5*d5; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d6*d6; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d7*d7; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d8*d8; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d9*d9; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d10*d10; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d11*d11; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d12*d12; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d13*d13; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d14*d14; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d15*d15; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d16*d16; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d17*d17; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d18*d18; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d19*d19; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d20*d20; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d21*d21; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d22*d22; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d23*d23; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d24*d24; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d25*d25; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d26*d26; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d27*d27; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d28*d28; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d29*d29; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d30*d30; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d31*d31; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        if (sum > limit + margin) return sum;
+    }
+    for (; i < dims - 15; i += 16) {
+        float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3];
+        float d4=a[i+4]-b[i+4], d5=a[i+5]-b[i+5], d6=a[i+6]-b[i+6], d7=a[i+7]-b[i+7];
+        float d8=a[i+8]-b[i+8], d9=a[i+9]-b[i+9], d10=a[i+10]-b[i+10], d11=a[i+11]-b[i+11];
+        float d12=a[i+12]-b[i+12], d13=a[i+13]-b[i+13], d14=a[i+14]-b[i+14], d15=a[i+15]-b[i+15];
+
+        float v,y,t;
+        v=d0*d0; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d1*d1; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d2*d2; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d3*d3; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d4*d4; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d5*d5; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d6*d6; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d7*d7; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d8*d8; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d9*d9; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d10*d10; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d11*d11; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d12*d12; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d13*d13; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d14*d14; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d15*d15; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        if (sum > limit + margin) return sum;
+    }
+    for (; i < dims - 7; i += 8) {
+        float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3];
+        float d4=a[i+4]-b[i+4], d5=a[i+5]-b[i+5], d6=a[i+6]-b[i+6], d7=a[i+7]-b[i+7];
+        float v,y,t;
+        v=d0*d0; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d1*d1; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d2*d2; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d3*d3; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d4*d4; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d5*d5; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d6*d6; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d7*d7; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        if (sum > limit + margin) return sum;
+    }
+    for (; i < dims - 3; i += 4) {
+        float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3];
+        float v,y,t;
+        v=d0*d0; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d1*d1; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d2*d2; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        v=d3*d3; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
+        if (sum > limit + margin) return sum;
+    }
+    for (; i < dims; i++) {
+        float diff=a[i]-b[i];
+        float squared=diff*diff;
+        float y=squared-c;
+        float t=sum+y;
+        c=(t-sum)-y;
+        sum=t;
+        if (sum > limit + margin) return sum;
+    }
+    return sum;
+}
+
+__device__ __forceinline__ float euclidean_distance_precise_bounded(const float* __restrict__ a, const float* __restrict__ b, int dims, float limit) {
+    double acc = 0.0;
+    double lim = (double)limit;
+    for (int i = 0; i < dims; i++) {
+        double d = (double)a[i] - (double)b[i];
+        acc += d * d;
+        if (acc > lim) return (float)acc;
+    }
+    return (float)acc;
+}
+
+extern "C" __global__ void deterministic_clustering(
+    const float* __restrict__ database_vectors,
+    float* __restrict__ cluster_centers,
+    int* __restrict__ cluster_assignments,
+    int* __restrict__ cluster_sizes,
+    int database_size,
+    int vector_dims,
+    int num_clusters,
+    int num_queries
+) {
+    int cluster_idx = blockIdx.x;
+    int tid = threadIdx.x;
+
+    if (cluster_idx >= num_clusters) return;
+
+    long long seed_idx = ((long long)cluster_idx * 982451653LL + 1566083941LL) % (long long)database_size;
+    int stride = max(1, database_size / (num_clusters * 37));
+    long long start_idx = seed_idx;
+
+    for (int d = tid; d < vector_dims; d += blockDim.x) {
+        float acc = 0.0f;
+        long long idx = start_idx;
+        #pragma unroll
+        for (int k = 0; k < 4; ++k) {
+            int pos = (int)(idx % (long long)database_size);
+            acc += database_vectors[pos * vector_dims + d];
+            idx += stride;
+        }
+        cluster_centers[cluster_idx * vector_dims + d] = acc * 0.25f;
+    }
+
+    if (tid == 0) {
+        cluster_sizes[cluster_idx] = 0;
+    }
+}
+
+extern "C" __global__ void assign_clusters(
+    const float* __restrict__ database_vectors,
+    const float* __restrict__ cluster_centers,
+    int* __restrict__ cluster_assignments,
+    int* __restrict__ cluster_sizes,
+    int database_size,
+    int vector_dims,
+    int num_clusters,
+    int num_queries
+) {
+    int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+    const bool use_high = (num_queries > 3000) || (vector_dims >= 700);
+    if (thread_id < database_size) {
+        int vec_idx = thread_id;
+        const float* vector = database_vectors + vec_idx * vector_dims;
+        float min_dist = MAX_FLOAT;
+        int best_cluster = 0;
+        for (int c = 0; c < num_clusters; c++) {
+            const float* c_center = cluster_centers + c * vector_dims;
+            float dist = use_high ? euclidean_distance_high(vector, c_center, vector_dims)
+                                  : euclidean_distance(vector, c_center, vector_dims);
+            if (dist < min_dist) {
+                min_dist = dist;
+                best_cluster = c;
+            }
+        }
+        cluster_assignments[vec_idx] = best_cluster;
+    }
+}
+
+extern "C" __global__ void exclusive_scan_sizes(
+    const int* cluster_sizes,
+    int* cluster_offsets,
+    int* write_offsets,
+    int num_clusters
+) {
+    if (blockIdx.x == 0 && threadIdx.x == 0) {
+        int acc = 0;
+        for (int i = 0; i < num_clusters; i++) {
+            cluster_offsets[i] = acc;
+            write_offsets[i] = acc;
+            acc += cluster_sizes[i];
+        }
+    }
+}
+
+extern "C" __global__ void build_cluster_index(
+    const int* cluster_assignments,
+    int* write_offsets,
+    int* cluster_indices,
+    int database_size
+) {
+    if (blockIdx.x == 0 && threadIdx.x == 0) {
+        for (int vec_idx = 0; vec_idx < database_size; vec_idx++) {
+            int cluster = cluster_assignments[vec_idx];
+            int pos = write_offsets[cluster];
+            cluster_indices[pos] = vec_idx;
+            write_offsets[cluster]++;
+        }
+    }
+}
+
+extern "C" __global__ void count_block_cluster_sizes(
+    const int* cluster_assignments,
+    int* block_counts,
+    int database_size,
+    int num_clusters
+) {
+    extern __shared__ int sdata[];
+    int tid = threadIdx.x;
+    int block = blockIdx.x;
+    int base = block * blockDim.x;
+    int vec_idx = base + tid;
+    __shared__ int s_len;
+    if (tid == 0) {
+        int rem = database_size - base;
+        s_len = rem > blockDim.x ? blockDim.x : (rem > 0 ? rem : 0);
+    }
+    __syncthreads();
+    if (s_len == 0) {
+        if (tid == 0) {
+            for (int c = 0; c < num_clusters; c++) {
+                block_counts[block * num_clusters + c] = 0;
+            }
+        }
+        return;
+    }
+    int cid = -1;
+    if (tid < s_len) cid = cluster_assignments[vec_idx];
+
+    for (int c = 0; c < num_clusters; c++) {
+        int* buf = sdata + c * blockDim.x;
+        if (tid < s_len) {
+            buf[tid] = (cid == c) ? 1 : 0;
+        } else if (tid < blockDim.x) {
+            buf[tid] = 0;
+        }
+    }
+    __syncthreads();
+
+    for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
+        if (tid < stride) {
+            int limit = (tid + stride < s_len) ? 1 : 0;
+            if (limit) {
+                for (int c = 0; c < num_clusters; c++) {
+                    int* buf = sdata + c * blockDim.x;
+                    buf[tid] += buf[tid + stride];
+                }
+            }
+        }
+        __syncthreads();
+    }
+
+    if (tid == 0) {
+        for (int c = 0; c < num_clusters; c++) {
+            int* buf = sdata + c * blockDim.x;
+            block_counts[block * num_clusters + c] = buf[0];
+        }
+    }
+}
+
+extern "C" __global__ void exclusive_scan_block_counts(
+    const int* cluster_offsets,
+    const int* block_counts,
+    int* block_offsets,
+    int num_blocks,
+    int num_clusters
+) {
+    if (blockIdx.x == 0 && threadIdx.x == 0) {
+        for (int c = 0; c < num_clusters; c++) {
+            int acc = cluster_offsets[c];
+            for (int b = 0; b < num_blocks; b++) {
+                block_offsets[b * num_clusters + c] = acc;
+                acc += block_counts[b * num_clusters + c];
+            }
+        }
+    }
+}
+
+extern "C" __global__ void reduce_block_counts(
+    const int* block_counts,
+    int* cluster_sizes,
+    int num_blocks,
+    int num_clusters
+) {
+    if (blockIdx.x == 0 && threadIdx.x == 0) {
+        for (int c = 0; c < num_clusters; c++) {
+            int acc = 0;
+            for (int b = 0; b < num_blocks; b++) {
+                acc += block_counts[b * num_clusters + c];
+            }
+            cluster_sizes[c] = acc;
+        }
+    }
+}
+
+extern "C" __global__ void parallel_build_cluster_index(
+    const int* cluster_assignments,
+    const int* block_offsets,
+    int* cluster_indices,
+    int database_size,
+    int num_clusters
+) {
+    extern __shared__ int sdata[];
+    int tid = threadIdx.x;
+    int block = blockIdx.x;
+    int base = block * blockDim.x;
+    int vec_idx = base + tid;
+    __shared__ int s_len;
+    if (tid == 0) {
+        int rem = database_size - base;
+        s_len = rem > blockDim.x ? blockDim.x : (rem > 0 ? rem : 0);
+    }
+    __syncthreads();
+    if (s_len == 0) return;
+    int cid = -1;
+    if (tid < s_len) cid = cluster_assignments[vec_idx];
+    for (int c = 0; c < num_clusters; c++) {
+        int* flags = sdata + c * blockDim.x;
+        if (tid < s_len) flags[tid] = (cid == c) ? 1 : 0;
+        else if (tid < blockDim.x) flags[tid] = 0;
+    }
+    __syncthreads();
+    for (int c = 0; c < num_clusters; c++) {
+        int* flags = sdata + c * blockDim.x;
+        for (int offset = 1; offset < s_len; offset <<= 1) {
+            int v = 0;
+            if (tid >= offset && tid < s_len) v = flags[tid - offset];
+            __syncthreads();
+            if (tid < s_len) flags[tid] += v;
+            __syncthreads();
+        }
+        if (tid < s_len && cid == c) {
+            int local_rank = flags[tid] - 1;
+            int base_off = block_offsets[block * num_clusters + c];
+            cluster_indices[base_off + local_rank] = vec_idx;
+        }
+        __syncthreads();
+    }
+}
+
+extern "C" __global__ void cluster_search(
+    const float* __restrict__ query_vectors,
+    const float* __restrict__ database_vectors,
+    const float* __restrict__ cluster_centers,
+    const int* __restrict__ cluster_assignments,
+    const int* __restrict__ cluster_sizes,
+    const int* __restrict__ cluster_indices,
+    const int* __restrict__ cluster_offsets,
+    int* __restrict__ results,
+    int num_queries,
+    int database_size,
+    int vector_dims,
+    int num_clusters
+) {
+    if (num_queries <= 3000) {
+        int query_idx = blockIdx.x;
+        if (query_idx >= num_queries) return;
+
+        const float* query = query_vectors + query_idx * vector_dims;
+
+        float cluster_dists[16];
+        int cluster_order[16];
+
+        for (int cluster = 0; cluster < num_clusters; cluster++) {
+            const float* center = cluster_centers + cluster * vector_dims;
+            cluster_dists[cluster] = euclidean_distance(query, center, vector_dims);
+            cluster_order[cluster] = cluster;
+        }
+
+        int clusters_to_search = (num_queries <= 1000) ? num_clusters :
+                                (num_queries <= 2000) ? min(num_clusters, (num_clusters * 3) / 4) :
+                                (num_queries <= 2800) ? min(num_clusters, (num_clusters * 2) / 3) :
+                                min(num_clusters, max(2, num_clusters / 2));
+        if (vector_dims >= 700) {
+            int target = max(3, clusters_to_search);
+            clusters_to_search = min(num_clusters, target);
+        }
+        for (int i = 0; i < clusters_to_search; i++) {
+            int best = i;
+            for (int j = i + 1; j < num_clusters; j++) {
+                if (cluster_dists[cluster_order[j]] < cluster_dists[cluster_order[best]]) {
+                    best = j;
+                }
+            }
+            int temp = cluster_order[i];
+            cluster_order[i] = cluster_order[best];
+            cluster_order[best] = temp;
+        }
+
+        float min_dist = MAX_FLOAT;
+        int best_idx = -1;
+
+        for (int c_idx = 0; c_idx < clusters_to_search; c_idx++) {
+            int target_cluster = cluster_order[c_idx];
+            if (cluster_sizes[target_cluster] <= 0) continue;
+
+            int start = cluster_offsets[target_cluster];
+            int end = start + cluster_sizes[target_cluster];
+            for (int p = start; p < end; p++) {
+                int vec_idx = cluster_indices[p];
+                const float* db_vector = database_vectors + vec_idx * vector_dims;
+                float dist = euclidean_distance_bounded(query, db_vector, vector_dims, min_dist);
+                if (dist < min_dist) {
+                    min_dist = dist;
+                    best_idx = vec_idx;
+                } else if (vector_dims >= 720 && num_queries <= 5000 && best_idx != -1 && dist <= min_dist + 0.0015f) {
+                    float d2 = euclidean_distance_precise_bounded(query, db_vector, vector_dims, min_dist);
+                    if (d2 < min_dist) {
+                        min_dist = d2;
+                        best_idx = vec_idx;
+                    }
+                }
+            }
+        }
+
+        if (min_dist == MAX_FLOAT) {
+            int base_stride = max(5, database_size / 2000);
+            int max_checks = min(database_size / base_stride, 2000);
+
+            for (int phase = 0; phase < 2; phase++) {
+                int offset = phase * (base_stride / 2);
+                for (int i = 0; i < max_checks / 2; i++) {
+                    int db_idx = (offset + i * base_stride) % database_size;
+
+                    const float* db_vector = database_vectors + db_idx * vector_dims;
+                    float dist = euclidean_distance_bounded(query, db_vector, vector_dims, min_dist);
+                    if (dist < min_dist) {
+                        min_dist = dist;
+                        best_idx = db_idx;
+                    }
+                }
+            }
+
+            if (best_idx != -1) {
+                int radius = min(25, base_stride);
+                int start_local = max(0, best_idx - radius);
+                int end_local = min(database_size, best_idx + radius + 1);
+
+                for (int i = start_local; i < end_local; i++) {
+                    if (i == best_idx) continue;
+                    const float* db_vector = database_vectors + i * vector_dims;
+                    float dist = euclidean_distance_bounded(query, db_vector, vector_dims, min_dist);
+                    if (dist < min_dist) {
+                        min_dist = dist;
+                        best_idx = i;
+                    }
+                }
+            }
+        }
+
+        if (min_dist == MAX_FLOAT) {
+            best_idx = 0;
+        }
+
+        results[query_idx] = best_idx;
+    } else {
+        int query_idx = blockIdx.x * blockDim.x + threadIdx.x;
+        if (query_idx >= num_queries) return;
+
+        const float* query = query_vectors + query_idx * vector_dims;
+
+        float cluster_dists[16];
+        int cluster_order[16];
+
+        for (int cluster = 0; cluster < num_clusters; cluster++) {
+            const float* center = cluster_centers + cluster * vector_dims;
+            cluster_dists[cluster] = euclidean_distance_high(query, center, vector_dims);
+            cluster_order[cluster] = cluster;
+        }
+
+        int clusters_to_search = (num_queries <= 3500) ? min(num_clusters, 5) :
+                                (num_queries <= 6000) ? min(num_clusters, 4) :
+                                (num_queries <= 8000) ? min(num_clusters, 3) :
+                                2;
+        if (num_queries <= 5000 && vector_dims >= 720) {
+            clusters_to_search = num_clusters;
+        } else if (vector_dims >= 720) {
+            clusters_to_search = num_clusters;
+        } else if (vector_dims >= 700) {
+            clusters_to_search = max(clusters_to_search, min(num_clusters, (num_clusters * 3) / 4 + 1));
+        }
+        for (int i = 0; i < clusters_to_search; i++) {
+            int best = i;
+            for (int j = i + 1; j < num_clusters; j++) {
+                if (cluster_dists[cluster_order[j]] < cluster_dists[cluster_order[best]]) {
+                    best = j;
+                }
+            }
+            int temp = cluster_order[i];
+            cluster_order[i] = cluster_order[best];
+            cluster_order[best] = temp;
+        }
+
+        float min_dist = MAX_FLOAT;
+        int best_idx = -1;
+
+        for (int c_idx = 0; c_idx < clusters_to_search; c_idx++) {
+            int target_cluster = cluster_order[c_idx];
+            if (cluster_sizes[target_cluster] <= 0) continue;
+
+            int start = cluster_offsets[target_cluster];
+            int end = start + cluster_sizes[target_cluster];
+            for (int p = start; p < end; p++) {
+                int vec_idx = cluster_indices[p];
+                const float* db_vector = database_vectors + vec_idx * vector_dims;
+                float dist = euclidean_distance_high_bounded(query, db_vector, vector_dims, min_dist);
+                if (vector_dims >= 720 && num_queries <= 5000 && dist <= min_dist + 0.0015f) {
+                    float d2 = euclidean_distance_precise_bounded(query, db_vector, vector_dims, min_dist);
+                    if (d2 < dist) dist = d2;
+                }
+                if (dist < min_dist) {
+                    min_dist = dist;
+                    best_idx = vec_idx;
+                }
+            }
+        }
+
+        if (min_dist == MAX_FLOAT) {
+            int base_stride = (vector_dims >= 720) ? max(7, database_size / 900) : max(9, database_size / 1200);
+            int max_checks = min(database_size / base_stride, (vector_dims >= 720) ? 1600 : 1200);
+
+            for (int phase = 0; phase < 2; phase++) {
+                int offset = phase * (base_stride / 3);
+                int phase_checks = max_checks / 2;
+
+                for (int i = 0; i < phase_checks; i++) {
+                    int db_idx = (offset + i * base_stride) % database_size;
+
+                    const float* db_vector = database_vectors + db_idx * vector_dims;
+                    float dist = euclidean_distance_high_bounded(query, db_vector, vector_dims, min_dist);
+                    if (dist < min_dist) {
+                        min_dist = dist;
+                        best_idx = db_idx;
+                    }
+                }
+            }
+
+            if (best_idx != -1) {
+                int radius = (vector_dims >= 720) ? min(32, (base_stride * 2) / 3) : min(18, base_stride / 2);
+                int start_local = max(0, best_idx - radius);
+                int end_local = min(database_size, best_idx + radius + 1);
+
+                for (int i = start_local; i < end_local; i++) {
+                    if (i == best_idx) continue;
+                    const float* db_vector = database_vectors + i * vector_dims;
+                    float dist = euclidean_distance_high_bounded(query, db_vector, vector_dims, min_dist);
+                    if (dist < min_dist) {
+                        min_dist = dist;
+                        best_idx = i;
+                    }
+                }
+            }
+        }
+
+        if (min_dist == MAX_FLOAT) {
+            best_idx = 0;
+        }
+
+        results[query_idx] = best_idx;
+    }
+}
--- a/tig-algorithms/src/vector_search/cluster_improved/mod.rs
+++ b/tig-algorithms/src/vector_search/cluster_improved/mod.rs
@ -0,0 +1,230 @@
+use cudarc::{
+    driver::{safe::LaunchConfig, CudaModule, CudaStream, PushKernelArg},
+    runtime::sys::cudaDeviceProp,
+};
+use std::sync::Arc;
+use serde_json::{Map, Value};
+use tig_challenges::vector_search::*;
+
+pub fn solve_challenge(
+    challenge: &Challenge,
+    save_solution: &dyn Fn(&Solution) -> anyhow::Result<()>,
+    hyperparameters: &Option<Map<String, Value>>,
+    module: Arc<CudaModule>,
+    stream: Arc<CudaStream>,
+    _prop: &cudaDeviceProp,
+) -> anyhow::Result<()> {
+    let vector_dims = challenge.vector_dims as i32;
+    let database_size = challenge.database_size as i32;
+    let num_queries = challenge.difficulty.num_queries as i32;
+
+    let block_size: u32 = 128;
+    fn calculate_optimal_clusters(num_queries: i32, database_size: i32, vector_dims: i32) -> i32 {
+        let base_clusters = if num_queries <= 3000 {
+            3
+        } else if num_queries <= 5000 {
+            if vector_dims >= 720 { 6 } else { 4 }
+        } else if num_queries <= 7000 {
+            5
+        } else if num_queries <= 9000 {
+            6
+        } else if num_queries <= 11000 {
+            7
+        } else if num_queries <= 15000 {
+            8
+        } else if num_queries <= 20000 {
+            9
+        } else {            
+            ((database_size as f32).sqrt() / 1000.0).max(8.0).min(12.0) as i32
+        };
+        
+        let memory_factor = if vector_dims > 1000 { 0.8 } else { 1.0 };
+        ((base_clusters as f32 * memory_factor) as i32).max(3).min(12)
+    }
+    
+    let num_clusters = calculate_optimal_clusters(num_queries, database_size, vector_dims);
+
+    let deterministic_clustering = module.load_function("deterministic_clustering")?;
+    let assign_clusters = module.load_function("assign_clusters")?;
+
+    let cluster_search = module.load_function("cluster_search")?;
+    let exclusive_scan_sizes = module.load_function("exclusive_scan_sizes")?;
+    let count_block_cluster_sizes = module.load_function("count_block_cluster_sizes")?;
+    let reduce_block_counts = module.load_function("reduce_block_counts")?;
+    let exclusive_scan_block_counts = module.load_function("exclusive_scan_block_counts")?;
+    let parallel_build_cluster_index = module.load_function("parallel_build_cluster_index")?;
+
+    let mut d_cluster_centers = stream.alloc_zeros::<f32>((num_clusters * vector_dims) as usize)?;
+    let mut d_cluster_assignments = stream.alloc_zeros::<i32>(database_size as usize)?;
+    let mut d_cluster_sizes = stream.alloc_zeros::<i32>(num_clusters as usize)?;
+
+    let cluster_config = LaunchConfig {
+        grid_dim: (num_clusters as u32, 1, 1),
+        block_dim: (block_size, 1, 1),
+        shared_mem_bytes: 0,
+    };
+
+    unsafe {
+        stream
+            .launch_builder(&deterministic_clustering)
+            .arg(&challenge.d_database_vectors)
+            .arg(&mut d_cluster_centers)
+            .arg(&mut d_cluster_assignments)
+            .arg(&mut d_cluster_sizes)
+            .arg(&database_size)
+            .arg(&vector_dims)
+            .arg(&num_clusters)
+            .arg(&num_queries)
+            .launch(cluster_config)?;
+    }
+
+    let assign_threads: u32 = 256;
+    let assign_blocks: u32 = ((database_size as u32) + assign_threads - 1) / assign_threads;
+    let assign_config = LaunchConfig {
+        grid_dim: (assign_blocks, 1, 1),
+        block_dim: (assign_threads, 1, 1),
+        shared_mem_bytes: 0,
+    };
+    unsafe {
+        stream
+            .launch_builder(&assign_clusters)
+            .arg(&challenge.d_database_vectors)
+            .arg(&d_cluster_centers)
+            .arg(&mut d_cluster_assignments)
+            .arg(&mut d_cluster_sizes)
+            .arg(&database_size)
+            .arg(&vector_dims)
+            .arg(&num_clusters)
+            .arg(&num_queries)
+            .launch(assign_config)?;
+    }
+
+    let mut d_cluster_offsets = stream.alloc_zeros::<i32>(num_clusters as usize)?;
+    let mut d_write_offsets = stream.alloc_zeros::<i32>(num_clusters as usize)?;
+    let scan_config = LaunchConfig {
+        grid_dim: (1, 1, 1),
+        block_dim: (1, 1, 1),
+        shared_mem_bytes: 0,
+    };
+    {}
+
+    let mut d_cluster_indices = stream.alloc_zeros::<i32>(database_size as usize)?;
+    let db_u32 = database_size as u32;
+    let fill_blocks: u32 = (db_u32 + block_size - 1) / block_size;
+    let fill_blocks_i32: i32 = fill_blocks as i32;
+
+    let mut d_block_counts = stream.alloc_zeros::<i32>((fill_blocks as usize) * (num_clusters as usize))?;
+    let mut d_block_offsets = stream.alloc_zeros::<i32>((fill_blocks as usize) * (num_clusters as usize))?;
+
+    let count_config = LaunchConfig {
+        grid_dim: (fill_blocks, 1, 1),
+        block_dim: (block_size, 1, 1),
+        shared_mem_bytes: ((num_clusters as u32) * block_size * 4) as u32,
+    };
+    unsafe {
+        stream
+            .launch_builder(&count_block_cluster_sizes)
+            .arg(&d_cluster_assignments)
+            .arg(&mut d_block_counts)
+            .arg(&database_size)
+            .arg(&num_clusters)
+            .launch(count_config)?;
+    }
+
+    let scan_blocks_config = LaunchConfig {
+        grid_dim: (1, 1, 1),
+        block_dim: (1, 1, 1),
+        shared_mem_bytes: 0,
+    };
+    unsafe {
+        stream
+            .launch_builder(&reduce_block_counts)
+            .arg(&d_block_counts)
+            .arg(&mut d_cluster_sizes)
+            .arg(&fill_blocks_i32)
+            .arg(&num_clusters)
+            .launch(scan_blocks_config)?;
+    }
+    unsafe {
+        stream
+            .launch_builder(&exclusive_scan_sizes)
+            .arg(&d_cluster_sizes)
+            .arg(&mut d_cluster_offsets)
+            .arg(&mut d_write_offsets)
+            .arg(&num_clusters)
+            .launch(scan_config)?;
+    }
+    unsafe {
+        stream
+            .launch_builder(&exclusive_scan_block_counts)
+            .arg(&d_cluster_offsets)
+            .arg(&d_block_counts)
+            .arg(&mut d_block_offsets)
+            .arg(&fill_blocks_i32)
+            .arg(&num_clusters)
+            .launch(scan_blocks_config)?;
+    }
+
+    let build_config = LaunchConfig {
+        grid_dim: (fill_blocks, 1, 1),
+        block_dim: (block_size, 1, 1),
+        shared_mem_bytes: ((num_clusters as u32) * block_size * 4) as u32,
+    };
+    unsafe {
+        stream
+            .launch_builder(&parallel_build_cluster_index)
+            .arg(&d_cluster_assignments)
+            .arg(&d_block_offsets)
+            .arg(&mut d_cluster_indices)
+            .arg(&database_size)
+            .arg(&num_clusters)
+            .launch(build_config)?;
+    }
+
+    let mut d_results = stream.alloc_zeros::<i32>(num_queries as usize)?;
+    
+    let search_config = if num_queries <= 3000 {
+        LaunchConfig {
+            grid_dim: (num_queries as u32, 1, 1),
+            block_dim: (1, 1, 1),
+            shared_mem_bytes: 0,
+        }
+    } else {
+        let threads_per_block = if vector_dims >= 720 { 
+            256 
+        } else { 
+            128 
+        };
+        let blocks = ((num_queries as u32) + threads_per_block - 1) / threads_per_block;
+        LaunchConfig {
+            grid_dim: (blocks.min(2048), 1, 1),
+            block_dim: (threads_per_block, 1, 1),
+            shared_mem_bytes: 0,
+        }
+    };
+
+    unsafe {
+        stream
+            .launch_builder(&cluster_search)
+            .arg(&challenge.d_query_vectors)
+            .arg(&challenge.d_database_vectors)
+            .arg(&d_cluster_centers)
+            .arg(&d_cluster_assignments)
+            .arg(&d_cluster_sizes)
+            .arg(&d_cluster_indices)
+            .arg(&d_cluster_offsets)
+            .arg(&mut d_results)
+            .arg(&num_queries)
+            .arg(&database_size)
+            .arg(&vector_dims)
+            .arg(&num_clusters)
+            .launch(search_config)?;
+    }
+    stream.synchronize()?;
+
+    let indices: Vec<i32> = stream.memcpy_dtov(&d_results)?;
+    let indexes = indices.iter().map(|&idx| idx as usize).collect();
+
+    let _ = save_solution(&Solution { indexes });
+    return Ok(());
+}
--- a/tig-algorithms/src/vector_search/mod.rs
+++ b/tig-algorithms/src/vector_search/mod.rs
@ -142,7 +142,8 @@

 // c004_a072

-// c004_a073
+pub mod cluster_improved;
+pub use cluster_improved as c004_a073;

 // c004_a074