Submitted vector_search/cluster_new

This commit is contained in:
FiveMovesAhead 2025-12-02 16:56:18 +00:00
parent bdc6ed6794
commit c2b39ee56f
4 changed files with 1107 additions and 1 deletions

View File

@ -0,0 +1,29 @@
# TIG Code Submission
## Submission Details
* **Challenge Name:** vector_search
* **Algorithm Name:** cluster_new
* **Copyright:** 2025 Rootz
* **Identity of Submitter:** Rootz
* **Identity of Creator of Algorithmic Method:** Rootz
* **Unique Algorithm Identifier (UAI):** null
## Additional Notes
Here I present my new vector_search algorithm.
Previous algorithms have hit difficulty cliffs at a certain number of queries. This algorithm solves much higher difficulties in almost the same runtime as it is much more fuel efficient.
## License
The files in this folder are under the following licenses:
* TIG Benchmarker Outbound License
* TIG Commercial License
* TIG Inbound Game License
* TIG Innovator Outbound Game License
* TIG Open Data License
* TIG THV Game License
Copies of the licenses can be obtained at:
https://github.com/tig-foundation/tig-monorepo/tree/main/docs/licenses

View File

@ -0,0 +1,842 @@
#include <cuda_runtime.h>
#include <float.h>
#define MAX_FLOAT 3.402823466e+38F
__device__ __forceinline__ float euclidean_distance(const float* __restrict__ a, const float* __restrict__ b, int dims) {
float sum = 0.0f;
float c = 0.0f;
int i;
for (i = 0; i < dims - 15; i += 16) {
float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3];
float d4=a[i+4]-b[i+4], d5=a[i+5]-b[i+5], d6=a[i+6]-b[i+6], d7=a[i+7]-b[i+7];
float d8=a[i+8]-b[i+8], d9=a[i+9]-b[i+9], d10=a[i+10]-b[i+10], d11=a[i+11]-b[i+11];
float d12=a[i+12]-b[i+12], d13=a[i+13]-b[i+13], d14=a[i+14]-b[i+14], d15=a[i+15]-b[i+15];
float s0 = d0*d0 + d1*d1 + d2*d2 + d3*d3;
float s1 = d4*d4 + d5*d5 + d6*d6 + d7*d7;
float s2 = d8*d8 + d9*d9 + d10*d10 + d11*d11;
float s3 = d12*d12 + d13*d13 + d14*d14 + d15*d15;
float partial = s0 + s1 + s2 + s3;
float y = partial - c;
float t = sum + y;
c = (t - sum) - y;
sum = t;
}
for (; i < dims - 7; i += 8) {
float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3];
float d4=a[i+4]-b[i+4], d5=a[i+5]-b[i+5], d6=a[i+6]-b[i+6], d7=a[i+7]-b[i+7];
float v,y,t;
v=d0*d0; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d1*d1; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d2*d2; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d3*d3; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d4*d4; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d5*d5; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d6*d6; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d7*d7; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
}
for (; i < dims - 3; i += 4) {
float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3];
float v,y,t;
v=d0*d0; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d1*d1; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d2*d2; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d3*d3; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
}
for (; i < dims; i++) {
float diff = a[i] - b[i];
float squared = diff * diff;
float y = squared - c;
float t = sum + y;
c = (t - sum) - y;
sum = t;
}
return sum;
}
__device__ __forceinline__ float euclidean_distance_high(const float* __restrict__ a, const float* __restrict__ b, int dims) {
float sum = 0.0f;
float c = 0.0f;
int i;
for (i = 0; i < dims - 31; i += 32) {
float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3];
float d4=a[i+4]-b[i+4], d5=a[i+5]-b[i+5], d6=a[i+6]-b[i+6], d7=a[i+7]-b[i+7];
float d8=a[i+8]-b[i+8], d9=a[i+9]-b[i+9], d10=a[i+10]-b[i+10], d11=a[i+11]-b[i+11];
float d12=a[i+12]-b[i+12], d13=a[i+13]-b[i+13], d14=a[i+14]-b[i+14], d15=a[i+15]-b[i+15];
float d16=a[i+16]-b[i+16], d17=a[i+17]-b[i+17], d18=a[i+18]-b[i+18], d19=a[i+19]-b[i+19];
float d20=a[i+20]-b[i+20], d21=a[i+21]-b[i+21], d22=a[i+22]-b[i+22], d23=a[i+23]-b[i+23];
float d24=a[i+24]-b[i+24], d25=a[i+25]-b[i+25], d26=a[i+26]-b[i+26], d27=a[i+27]-b[i+27];
float d28=a[i+28]-b[i+28], d29=a[i+29]-b[i+29], d30=a[i+30]-b[i+30], d31=a[i+31]-b[i+31];
float v,y,t;
v=d0*d0; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d1*d1; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d2*d2; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d3*d3; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d4*d4; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d5*d5; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d6*d6; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d7*d7; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d8*d8; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d9*d9; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d10*d10; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d11*d11; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d12*d12; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d13*d13; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d14*d14; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d15*d15; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d16*d16; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d17*d17; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d18*d18; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d19*d19; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d20*d20; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d21*d21; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d22*d22; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d23*d23; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d24*d24; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d25*d25; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d26*d26; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d27*d27; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d28*d28; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d29*d29; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d30*d30; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d31*d31; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
}
for (; i < dims - 15; i += 16) {
float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3];
float d4=a[i+4]-b[i+4], d5=a[i+5]-b[i+5], d6=a[i+6]-b[i+6], d7=a[i+7]-b[i+7];
float d8=a[i+8]-b[i+8], d9=a[i+9]-b[i+9], d10=a[i+10]-b[i+10], d11=a[i+11]-b[i+11];
float d12=a[i+12]-b[i+12], d13=a[i+13]-b[i+13], d14=a[i+14]-b[i+14], d15=a[i+15]-b[i+15];
float v,y,t;
v=d0*d0; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d1*d1; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d2*d2; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d3*d3; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d4*d4; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d5*d5; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d6*d6; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d7*d7; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d8*d8; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d9*d9; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d10*d10; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d11*d11; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d12*d12; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d13*d13; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d14*d14; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d15*d15; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
}
for (; i < dims - 7; i += 8) {
float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3];
float d4=a[i+4]-b[i+4], d5=a[i+5]-b[i+5], d6=a[i+6]-b[i+6], d7=a[i+7]-b[i+7];
float v,y,t;
v=d0*d0; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d1*d1; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d2*d2; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d3*d3; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d4*d4; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d5*d5; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d6*d6; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d7*d7; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
}
for (; i < dims - 3; i += 4) {
float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3];
float v,y,t;
v=d0*d0; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d1*d1; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d2*d2; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d3*d3; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
}
for (; i < dims; i++) {
float diff = a[i] - b[i];
float squared = diff * diff;
float y = squared - c;
float t = sum + y;
c = (t - sum) - y;
sum = t;
}
return sum;
}
__device__ __forceinline__ float euclidean_distance_bounded(const float* __restrict__ a, const float* __restrict__ b, int dims, float limit) {
float margin = fmaxf(1e-6f, 1.0e-4f * (1.0f + limit));
if (limit > 25.0f) {
float sum = 0.0f;
int i = 0;
if (dims >= 8) {
float d0=a[0]-b[0], d1=a[1]-b[1], d2=a[2]-b[2], d3=a[3]-b[3];
float d4=a[4]-b[4], d5=a[5]-b[5], d6=a[6]-b[6], d7=a[7]-b[7];
sum = d0*d0+d1*d1+d2*d2+d3*d3+d4*d4+d5*d5+d6*d6+d7*d7;
if (sum > limit + margin) return sum;
i = 8;
}
for (; i < dims - 15; i += 16) {
float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3];
float d4=a[i+4]-b[i+4], d5=a[i+5]-b[i+5], d6=a[i+6]-b[i+6], d7=a[i+7]-b[i+7];
sum += d0*d0+d1*d1+d2*d2+d3*d3+d4*d4+d5*d5+d6*d6+d7*d7;
if (sum > limit + margin) return sum;
float d8=a[i+8]-b[i+8], d9=a[i+9]-b[i+9], d10=a[i+10]-b[i+10], d11=a[i+11]-b[i+11];
float d12=a[i+12]-b[i+12], d13=a[i+13]-b[i+13], d14=a[i+14]-b[i+14], d15=a[i+15]-b[i+15];
sum += d8*d8+d9*d9+d10*d10+d11*d11+d12*d12+d13*d13+d14*d14+d15*d15;
if (sum > limit + margin) return sum;
}
for (; i < dims - 7; i += 8) {
float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3];
float d4=a[i+4]-b[i+4], d5=a[i+5]-b[i+5], d6=a[i+6]-b[i+6], d7=a[i+7]-b[i+7];
sum += d0*d0+d1*d1+d2*d2+d3*d3+d4*d4+d5*d5+d6*d6+d7*d7;
if (sum > limit + margin) return sum;
}
for (; i < dims - 3; i += 4) {
float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3];
sum += d0*d0+d1*d1+d2*d2+d3*d3;
if (sum > limit + margin) return sum;
}
for (; i < dims; i++) {
float d=a[i]-b[i];
sum += d*d;
if (sum > limit + margin) return sum;
}
return sum;
}
float sum = 0.0f;
float c = 0.0f;
int i = 0;
if (dims >= 4) {
float d0=a[0]-b[0], d1=a[1]-b[1], d2=a[2]-b[2], d3=a[3]-b[3];
float s0=d0*d0+d1*d1+d2*d2+d3*d3;
float y=s0-c;
float t=sum+y;
c=(t-sum)-y;
sum=t;
if (sum > limit + margin) return sum;
i = 4;
}
for (; i < dims - 15; i += 16) {
float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3];
float s0=d0*d0+d1*d1+d2*d2+d3*d3;
float y=s0-c;
float t=sum+y;
c=(t-sum)-y;
sum=t;
if (sum > limit + margin) return sum;
float d4=a[i+4]-b[i+4], d5=a[i+5]-b[i+5], d6=a[i+6]-b[i+6], d7=a[i+7]-b[i+7];
float s1=d4*d4+d5*d5+d6*d6+d7*d7;
y=s1-c;
t=sum+y;
c=(t-sum)-y;
sum=t;
if (sum > limit + margin) return sum;
float d8=a[i+8]-b[i+8], d9=a[i+9]-b[i+9], d10=a[i+10]-b[i+10], d11=a[i+11]-b[i+11];
float s2=d8*d8+d9*d9+d10*d10+d11*d11;
y=s2-c;
t=sum+y;
c=(t-sum)-y;
sum=t;
if (sum > limit + margin) return sum;
float d12=a[i+12]-b[i+12], d13=a[i+13]-b[i+13], d14=a[i+14]-b[i+14], d15=a[i+15]-b[i+15];
float s3=d12*d12+d13*d13+d14*d14+d15*d15;
y=s3-c;
t=sum+y;
c=(t-sum)-y;
sum=t;
if (sum > limit + margin) return sum;
}
for (; i < dims - 7; i += 8) {
float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3];
float d4=a[i+4]-b[i+4], d5=a[i+5]-b[i+5], d6=a[i+6]-b[i+6], d7=a[i+7]-b[i+7];
float v,y,t;
v=d0*d0; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d1*d1; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d2*d2; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d3*d3; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d4*d4; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d5*d5; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d6*d6; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d7*d7; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
if (sum > limit + margin) return sum;
}
for (; i < dims - 3; i += 4) {
float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3];
float v,y,t;
v=d0*d0; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d1*d1; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d2*d2; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
v=d3*d3; y=v-c; t=sum+y; c=(t-sum)-y; sum=t;
if (sum > limit + margin) return sum;
}
for (; i < dims; i++) {
float diff=a[i]-b[i];
float squared=diff*diff;
float y=squared-c;
float t=sum+y;
c=(t-sum)-y;
sum=t;
if (sum > limit + margin) return sum;
}
return sum;
}
__device__ __forceinline__ float euclidean_distance_high_bounded(const float* __restrict__ a, const float* __restrict__ b, int dims, float limit) {
float sum=0.0f;
float margin = fmaxf(1e-6f, 1.0e-4f * (1.0f + limit));
int i = 0;
if (dims >= 4) {
float d0=a[0]-b[0], d1=a[1]-b[1], d2=a[2]-b[2], d3=a[3]-b[3];
sum = d0*d0+d1*d1+d2*d2+d3*d3;
if (sum > limit + margin) return sum;
i = 4;
}
for (;i<dims-31;i+=32){
float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3];
float d4=a[i+4]-b[i+4], d5=a[i+5]-b[i+5], d6=a[i+6]-b[i+6], d7=a[i+7]-b[i+7];
float d8=a[i+8]-b[i+8], d9=a[i+9]-b[i+9], d10=a[i+10]-b[i+10], d11=a[i+11]-b[i+11];
float d12=a[i+12]-b[i+12], d13=a[i+13]-b[i+13], d14=a[i+14]-b[i+14], d15=a[i+15]-b[i+15];
sum += d0*d0+d1*d1+d2*d2+d3*d3+d4*d4+d5*d5+d6*d6+d7*d7+d8*d8+d9*d9+d10*d10+d11*d11+d12*d12+d13*d13+d14*d14+d15*d15;
if (sum > limit + margin) return sum;
float d16=a[i+16]-b[i+16], d17=a[i+17]-b[i+17], d18=a[i+18]-b[i+18], d19=a[i+19]-b[i+19];
float d20=a[i+20]-b[i+20], d21=a[i+21]-b[i+21], d22=a[i+22]-b[i+22], d23=a[i+23]-b[i+23];
float d24=a[i+24]-b[i+24], d25=a[i+25]-b[i+25], d26=a[i+26]-b[i+26], d27=a[i+27]-b[i+27];
float d28=a[i+28]-b[i+28], d29=a[i+29]-b[i+29], d30=a[i+30]-b[i+30], d31=a[i+31]-b[i+31];
sum += d16*d16+d17*d17+d18*d18+d19*d19+d20*d20+d21*d21+d22*d22+d23*d23+d24*d24+d25*d25+d26*d26+d27*d27+d28*d28+d29*d29+d30*d30+d31*d31;
if (sum > limit + margin) return sum;
}
for (; i < dims - 15; i += 16) {
float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3];
float d4=a[i+4]-b[i+4], d5=a[i+5]-b[i+5], d6=a[i+6]-b[i+6], d7=a[i+7]-b[i+7];
float d8=a[i+8]-b[i+8], d9=a[i+9]-b[i+9], d10=a[i+10]-b[i+10], d11=a[i+11]-b[i+11];
float d12=a[i+12]-b[i+12], d13=a[i+13]-b[i+13], d14=a[i+14]-b[i+14], d15=a[i+15]-b[i+15];
sum += d0*d0+d1*d1+d2*d2+d3*d3+d4*d4+d5*d5+d6*d6+d7*d7+d8*d8+d9*d9+d10*d10+d11*d11+d12*d12+d13*d13+d14*d14+d15*d15;
if (sum > limit + margin) return sum;
}
for (; i < dims - 7; i += 8) {
float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3];
float d4=a[i+4]-b[i+4], d5=a[i+5]-b[i+5], d6=a[i+6]-b[i+6], d7=a[i+7]-b[i+7];
sum += d0*d0+d1*d1+d2*d2+d3*d3+d4*d4+d5*d5+d6*d6+d7*d7;
if (sum > limit + margin) return sum;
}
for (; i < dims - 3; i += 4) {
float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3];
sum += d0*d0+d1*d1+d2*d2+d3*d3;
if (sum > limit + margin) return sum;
}
for (; i < dims; i++) {
float diff=a[i]-b[i];
sum += diff*diff;
if (sum > limit + margin) return sum;
}
return sum;
}
__device__ __forceinline__ float euclidean_distance_precise_bounded(const float* __restrict__ a, const float* __restrict__ b, int dims, float limit) {
double acc = 0.0;
double lim = (double)limit;
for (int i = 0; i < dims; i++) {
double d = (double)a[i] - (double)b[i];
acc += d * d;
if (acc > lim) return (float)acc;
}
return (float)acc;
}
extern "C" __global__ void deterministic_clustering(
const float* __restrict__ database_vectors,
float* __restrict__ cluster_centers,
int* __restrict__ cluster_assignments,
int* __restrict__ cluster_sizes,
int database_size,
int vector_dims,
int num_clusters,
int num_queries
) {
int cluster_idx = blockIdx.x;
int tid = threadIdx.x;
if (cluster_idx >= num_clusters) return;
long long seed_idx = ((long long)cluster_idx * 982451653LL + 1566083941LL) % (long long)database_size;
int stride = max(1, database_size / (num_clusters * 37));
long long start_idx = seed_idx;
for (int d = tid; d < vector_dims; d += blockDim.x) {
float acc = 0.0f;
long long idx = start_idx;
#pragma unroll
for (int k = 0; k < 4; ++k) {
int pos = (int)(idx % (long long)database_size);
acc += database_vectors[pos * vector_dims + d];
idx += stride;
}
cluster_centers[cluster_idx * vector_dims + d] = acc * 0.25f;
}
if (tid == 0) {
cluster_sizes[cluster_idx] = 0;
}
}
__device__ __forceinline__ float euclidean_distance_fast(const float* __restrict__ a, const float* __restrict__ b, int dims) {
float sum = 0.0f;
int i;
for (i = 0; i < dims - 15; i += 16) {
float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3];
float d4=a[i+4]-b[i+4], d5=a[i+5]-b[i+5], d6=a[i+6]-b[i+6], d7=a[i+7]-b[i+7];
float d8=a[i+8]-b[i+8], d9=a[i+9]-b[i+9], d10=a[i+10]-b[i+10], d11=a[i+11]-b[i+11];
float d12=a[i+12]-b[i+12], d13=a[i+13]-b[i+13], d14=a[i+14]-b[i+14], d15=a[i+15]-b[i+15];
sum += d0*d0+d1*d1+d2*d2+d3*d3+d4*d4+d5*d5+d6*d6+d7*d7+d8*d8+d9*d9+d10*d10+d11*d11+d12*d12+d13*d13+d14*d14+d15*d15;
}
for (; i < dims - 7; i += 8) {
float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3];
float d4=a[i+4]-b[i+4], d5=a[i+5]-b[i+5], d6=a[i+6]-b[i+6], d7=a[i+7]-b[i+7];
sum += d0*d0+d1*d1+d2*d2+d3*d3+d4*d4+d5*d5+d6*d6+d7*d7;
}
for (; i < dims - 3; i += 4) {
float d0=a[i]-b[i], d1=a[i+1]-b[i+1], d2=a[i+2]-b[i+2], d3=a[i+3]-b[i+3];
sum += d0*d0+d1*d1+d2*d2+d3*d3;
}
for (; i < dims; i++) {
float d=a[i]-b[i];
sum += d*d;
}
return sum;
}
extern "C" __global__ void assign_clusters(
const float* __restrict__ database_vectors,
const float* __restrict__ cluster_centers,
int* __restrict__ cluster_assignments,
int* __restrict__ cluster_sizes,
int database_size,
int vector_dims,
int num_clusters,
int num_queries
) {
int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
if (thread_id < database_size) {
int vec_idx = thread_id;
const float* vector = database_vectors + vec_idx * vector_dims;
float min_dist = MAX_FLOAT;
int best_cluster = 0;
for (int c = 0; c < num_clusters; c++) {
const float* c_center = cluster_centers + c * vector_dims;
float dist = euclidean_distance_fast(vector, c_center, vector_dims);
if (dist < min_dist) {
min_dist = dist;
best_cluster = c;
}
}
cluster_assignments[vec_idx] = best_cluster;
}
}
extern "C" __global__ void exclusive_scan_sizes(
const int* cluster_sizes,
int* cluster_offsets,
int* write_offsets,
int num_clusters
) {
if (blockIdx.x == 0 && threadIdx.x == 0) {
int acc = 0;
for (int i = 0; i < num_clusters; i++) {
cluster_offsets[i] = acc;
write_offsets[i] = acc;
acc += cluster_sizes[i];
}
}
}
extern "C" __global__ void build_cluster_index(
const int* cluster_assignments,
int* write_offsets,
int* cluster_indices,
int database_size
) {
if (blockIdx.x == 0 && threadIdx.x == 0) {
for (int vec_idx = 0; vec_idx < database_size; vec_idx++) {
int cluster = cluster_assignments[vec_idx];
int pos = write_offsets[cluster];
cluster_indices[pos] = vec_idx;
write_offsets[cluster]++;
}
}
}
extern "C" __global__ void count_block_cluster_sizes(
const int* cluster_assignments,
int* block_counts,
int database_size,
int num_clusters
) {
extern __shared__ int sdata[];
int tid = threadIdx.x;
int block = blockIdx.x;
int base = block * blockDim.x;
int vec_idx = base + tid;
__shared__ int s_len;
if (tid == 0) {
int rem = database_size - base;
s_len = rem > blockDim.x ? blockDim.x : (rem > 0 ? rem : 0);
}
__syncthreads();
if (s_len == 0) {
if (tid == 0) {
for (int c = 0; c < num_clusters; c++) {
block_counts[block * num_clusters + c] = 0;
}
}
return;
}
int cid = -1;
if (tid < s_len) cid = cluster_assignments[vec_idx];
for (int c = 0; c < num_clusters; c++) {
int* buf = sdata + c * blockDim.x;
if (tid < s_len) {
buf[tid] = (cid == c) ? 1 : 0;
} else if (tid < blockDim.x) {
buf[tid] = 0;
}
}
__syncthreads();
for (int stride = blockDim.x >> 1; stride > 0; stride >>= 1) {
if (tid < stride) {
int limit = (tid + stride < s_len) ? 1 : 0;
if (limit) {
for (int c = 0; c < num_clusters; c++) {
int* buf = sdata + c * blockDim.x;
buf[tid] += buf[tid + stride];
}
}
}
__syncthreads();
}
if (tid == 0) {
for (int c = 0; c < num_clusters; c++) {
int* buf = sdata + c * blockDim.x;
block_counts[block * num_clusters + c] = buf[0];
}
}
}
extern "C" __global__ void exclusive_scan_block_counts(
const int* cluster_offsets,
const int* block_counts,
int* block_offsets,
int num_blocks,
int num_clusters
) {
if (blockIdx.x == 0 && threadIdx.x == 0) {
for (int c = 0; c < num_clusters; c++) {
int acc = cluster_offsets[c];
for (int b = 0; b < num_blocks; b++) {
block_offsets[b * num_clusters + c] = acc;
acc += block_counts[b * num_clusters + c];
}
}
}
}
extern "C" __global__ void reduce_block_counts(
const int* block_counts,
int* cluster_sizes,
int num_blocks,
int num_clusters
) {
if (blockIdx.x == 0 && threadIdx.x == 0) {
for (int c = 0; c < num_clusters; c++) {
int acc = 0;
for (int b = 0; b < num_blocks; b++) {
acc += block_counts[b * num_clusters + c];
}
cluster_sizes[c] = acc;
}
}
}
extern "C" __global__ void parallel_build_cluster_index(
const int* cluster_assignments,
const int* block_offsets,
int* cluster_indices,
int database_size,
int num_clusters
) {
extern __shared__ int sdata[];
int tid = threadIdx.x;
int block = blockIdx.x;
int base = block * blockDim.x;
int vec_idx = base + tid;
__shared__ int s_len;
if (tid == 0) {
int rem = database_size - base;
s_len = rem > blockDim.x ? blockDim.x : (rem > 0 ? rem : 0);
}
__syncthreads();
if (s_len == 0) return;
int cid = -1;
if (tid < s_len) cid = cluster_assignments[vec_idx];
for (int c = 0; c < num_clusters; c++) {
int* flags = sdata + c * blockDim.x;
if (tid < s_len) flags[tid] = (cid == c) ? 1 : 0;
else if (tid < blockDim.x) flags[tid] = 0;
}
__syncthreads();
for (int c = 0; c < num_clusters; c++) {
int* flags = sdata + c * blockDim.x;
for (int offset = 1; offset < s_len; offset <<= 1) {
int v = 0;
if (tid >= offset && tid < s_len) v = flags[tid - offset];
__syncthreads();
if (tid < s_len) flags[tid] += v;
__syncthreads();
}
if (tid < s_len && cid == c) {
int local_rank = flags[tid] - 1;
int base_off = block_offsets[block * num_clusters + c];
cluster_indices[base_off + local_rank] = vec_idx;
}
__syncthreads();
}
}
extern "C" __global__ void cluster_search(
const float* __restrict__ query_vectors,
const float* __restrict__ database_vectors,
const float* __restrict__ cluster_centers,
const int* __restrict__ cluster_assignments,
const int* __restrict__ cluster_sizes,
const int* __restrict__ cluster_indices,
const int* __restrict__ cluster_offsets,
int* __restrict__ results,
int num_queries,
int database_size,
int vector_dims,
int num_clusters
) {
if (num_queries <= 3000) {
int query_idx = blockIdx.x;
if (query_idx >= num_queries) return;
const float* query = query_vectors + query_idx * vector_dims;
float cluster_dists[16];
int cluster_order[16];
for (int cluster = 0; cluster < num_clusters; cluster++) {
const float* center = cluster_centers + cluster * vector_dims;
cluster_dists[cluster] = euclidean_distance(query, center, vector_dims);
cluster_order[cluster] = cluster;
}
int clusters_to_search = (num_queries <= 1000) ? num_clusters :
(num_queries <= 2000) ? min(num_clusters, (num_clusters * 3) / 4) :
(num_queries <= 2800) ? min(num_clusters, (num_clusters * 2) / 3) :
min(num_clusters, max(2, num_clusters / 2));
if (vector_dims >= 700) {
int target = max(3, clusters_to_search);
clusters_to_search = min(num_clusters, target);
}
for (int i = 0; i < clusters_to_search; i++) {
int best = i;
for (int j = i + 1; j < num_clusters; j++) {
if (cluster_dists[cluster_order[j]] < cluster_dists[cluster_order[best]]) {
best = j;
}
}
int temp = cluster_order[i];
cluster_order[i] = cluster_order[best];
cluster_order[best] = temp;
}
float min_dist = MAX_FLOAT;
int best_idx = -1;
for (int c_idx = 0; c_idx < clusters_to_search; c_idx++) {
int target_cluster = cluster_order[c_idx];
if (cluster_sizes[target_cluster] <= 0) continue;
int start = cluster_offsets[target_cluster];
int end = start + cluster_sizes[target_cluster];
for (int p = start; p < end; p++) {
int vec_idx = cluster_indices[p];
const float* db_vector = database_vectors + vec_idx * vector_dims;
float dist = euclidean_distance_bounded(query, db_vector, vector_dims, min_dist);
if (dist < min_dist) {
min_dist = dist;
best_idx = vec_idx;
} else if (vector_dims >= 700 && best_idx != -1 && dist <= min_dist + 0.0015f) {
float d2 = euclidean_distance_precise_bounded(query, db_vector, vector_dims, min_dist);
if (d2 < min_dist) {
min_dist = d2;
best_idx = vec_idx;
}
}
}
}
if (min_dist == MAX_FLOAT) {
int base_stride = max(6, database_size / 1900);
int max_checks = min(database_size / base_stride, 1900);
for (int phase = 0; phase < 2; phase++) {
int offset = phase * (base_stride / 2);
for (int i = 0; i < max_checks / 2; i++) {
int db_idx = (offset + i * base_stride) % database_size;
const float* db_vector = database_vectors + db_idx * vector_dims;
float dist = euclidean_distance_bounded(query, db_vector, vector_dims, min_dist);
if (dist < min_dist) {
min_dist = dist;
best_idx = db_idx;
}
}
}
if (best_idx != -1) {
int radius = min(25, base_stride);
int start_local = max(0, best_idx - radius);
int end_local = min(database_size, best_idx + radius + 1);
for (int i = start_local; i < end_local; i++) {
if (i == best_idx) continue;
const float* db_vector = database_vectors + i * vector_dims;
float dist = euclidean_distance_bounded(query, db_vector, vector_dims, min_dist);
if (dist < min_dist) {
min_dist = dist;
best_idx = i;
}
}
}
}
if (min_dist == MAX_FLOAT) {
best_idx = 0;
}
results[query_idx] = best_idx;
} else {
int query_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (query_idx >= num_queries) return;
const float* query = query_vectors + query_idx * vector_dims;
float cluster_dists[16];
int cluster_order[16];
for (int cluster = 0; cluster < num_clusters; cluster++) {
const float* center = cluster_centers + cluster * vector_dims;
cluster_dists[cluster] = euclidean_distance_high(query, center, vector_dims);
cluster_order[cluster] = cluster;
}
int clusters_to_search = (num_queries <= 3500) ? min(num_clusters, 5) :
(num_queries <= 6000) ? min(num_clusters, 4) :
(num_queries <= 8000) ? min(num_clusters, 3) :
2;
if (num_queries <= 5000 && vector_dims >= 720) {
clusters_to_search = num_clusters;
} else if (vector_dims >= 720) {
clusters_to_search = (num_clusters * 3) / 4;
} else if (vector_dims >= 700) {
clusters_to_search = max(clusters_to_search, min(num_clusters, (num_clusters * 3) / 4 + 1));
}
for (int i = 0; i < clusters_to_search; i++) {
int best = i;
for (int j = i + 1; j < num_clusters; j++) {
if (cluster_dists[cluster_order[j]] < cluster_dists[cluster_order[best]]) {
best = j;
}
}
int temp = cluster_order[i];
cluster_order[i] = cluster_order[best];
cluster_order[best] = temp;
}
float min_dist = MAX_FLOAT;
int best_idx = -1;
for (int c_idx = 0; c_idx < clusters_to_search; c_idx++) {
int target_cluster = cluster_order[c_idx];
if (cluster_sizes[target_cluster] <= 0) continue;
int start = cluster_offsets[target_cluster];
int end = start + cluster_sizes[target_cluster];
for (int p = start; p < end; p++) {
int vec_idx = cluster_indices[p];
const float* db_vector = database_vectors + vec_idx * vector_dims;
float dist = euclidean_distance_high_bounded(query, db_vector, vector_dims, min_dist);
if (vector_dims >= 700 && best_idx != -1 && dist <= min_dist + 0.0015f) {
float d2 = euclidean_distance_precise_bounded(query, db_vector, vector_dims, min_dist);
if (d2 < dist) dist = d2;
}
if (dist < min_dist) {
min_dist = dist;
best_idx = vec_idx;
}
}
}
if (min_dist == MAX_FLOAT) {
int base_stride = (vector_dims >= 720) ? max(8, database_size / 850) : max(10, database_size / 1150);
int max_checks = min(database_size / base_stride, (vector_dims >= 720) ? 1500 : 1150);
for (int phase = 0; phase < 2; phase++) {
int offset = phase * (base_stride / 3);
int phase_checks = max_checks / 2;
for (int i = 0; i < phase_checks; i++) {
int db_idx = (offset + i * base_stride) % database_size;
const float* db_vector = database_vectors + db_idx * vector_dims;
float dist = euclidean_distance_high_bounded(query, db_vector, vector_dims, min_dist);
if (dist < min_dist) {
min_dist = dist;
best_idx = db_idx;
}
}
}
if (best_idx != -1) {
int radius = (vector_dims >= 720) ? min(32, (base_stride * 2) / 3) : min(18, base_stride / 2);
int start_local = max(0, best_idx - radius);
int end_local = min(database_size, best_idx + radius + 1);
for (int i = start_local; i < end_local; i++) {
if (i == best_idx) continue;
const float* db_vector = database_vectors + i * vector_dims;
float dist = euclidean_distance_high_bounded(query, db_vector, vector_dims, min_dist);
if (dist < min_dist) {
min_dist = dist;
best_idx = i;
}
}
}
}
if (min_dist == MAX_FLOAT) {
best_idx = 0;
}
results[query_idx] = best_idx;
}
}

View File

@ -0,0 +1,234 @@
use cudarc::{
driver::{safe::LaunchConfig, CudaModule, CudaStream, PushKernelArg},
runtime::sys::cudaDeviceProp,
};
use std::sync::Arc;
use serde_json::{Map, Value};
use tig_challenges::vector_search::*;
pub fn solve_challenge(
challenge: &Challenge,
save_solution: &dyn Fn(&Solution) -> anyhow::Result<()>,
hyperparameters: &Option<Map<String, Value>>,
module: Arc<CudaModule>,
stream: Arc<CudaStream>,
_prop: &cudaDeviceProp,
) -> anyhow::Result<()> {
let vector_dims = challenge.vector_dims as i32;
let database_size = challenge.database_size as i32;
let num_queries = challenge.num_queries as i32;
let block_size: u32 = 128;
fn calculate_optimal_clusters(num_queries: i32, database_size: i32, vector_dims: i32) -> i32 {
let base_clusters = if num_queries <= 3000 {
3
} else if num_queries <= 5000 {
if vector_dims >= 720 { 5 } else { 4 }
} else if num_queries <= 7000 {
4
} else if num_queries <= 9000 {
5
} else if num_queries <= 11000 {
6
} else if num_queries <= 15000 {
7
} else if num_queries <= 20000 {
8
} else {
((database_size as f32).sqrt() / 1000.0).max(7.0).min(10.0) as i32
};
let memory_factor = if vector_dims > 1000 { 0.8 } else { 1.0 };
((base_clusters as f32 * memory_factor) as i32).max(3).min(10)
}
let num_clusters = calculate_optimal_clusters(num_queries, database_size, vector_dims);
let deterministic_clustering = module.load_function("deterministic_clustering")?;
let assign_clusters = module.load_function("assign_clusters")?;
let cluster_search = module.load_function("cluster_search")?;
let exclusive_scan_sizes = module.load_function("exclusive_scan_sizes")?;
let count_block_cluster_sizes = module.load_function("count_block_cluster_sizes")?;
let reduce_block_counts = module.load_function("reduce_block_counts")?;
let exclusive_scan_block_counts = module.load_function("exclusive_scan_block_counts")?;
let parallel_build_cluster_index = module.load_function("parallel_build_cluster_index")?;
let mut d_cluster_centers = stream.alloc_zeros::<f32>((num_clusters * vector_dims) as usize)?;
let mut d_cluster_assignments = stream.alloc_zeros::<i32>(database_size as usize)?;
let mut d_cluster_sizes = stream.alloc_zeros::<i32>(num_clusters as usize)?;
let cluster_config = LaunchConfig {
grid_dim: (num_clusters as u32, 1, 1),
block_dim: (block_size, 1, 1),
shared_mem_bytes: 0,
};
unsafe {
stream
.launch_builder(&deterministic_clustering)
.arg(&challenge.d_database_vectors)
.arg(&mut d_cluster_centers)
.arg(&mut d_cluster_assignments)
.arg(&mut d_cluster_sizes)
.arg(&database_size)
.arg(&vector_dims)
.arg(&num_clusters)
.arg(&num_queries)
.launch(cluster_config)?;
}
let assign_threads: u32 = 256;
let assign_blocks: u32 = ((database_size as u32) + assign_threads - 1) / assign_threads;
let assign_config = LaunchConfig {
grid_dim: (assign_blocks, 1, 1),
block_dim: (assign_threads, 1, 1),
shared_mem_bytes: 0,
};
unsafe {
stream
.launch_builder(&assign_clusters)
.arg(&challenge.d_database_vectors)
.arg(&d_cluster_centers)
.arg(&mut d_cluster_assignments)
.arg(&mut d_cluster_sizes)
.arg(&database_size)
.arg(&vector_dims)
.arg(&num_clusters)
.arg(&num_queries)
.launch(assign_config)?;
}
let mut d_cluster_offsets = stream.alloc_zeros::<i32>(num_clusters as usize)?;
let mut d_write_offsets = stream.alloc_zeros::<i32>(num_clusters as usize)?;
let scan_config = LaunchConfig {
grid_dim: (1, 1, 1),
block_dim: (1, 1, 1),
shared_mem_bytes: 0,
};
{}
let mut d_cluster_indices = stream.alloc_zeros::<i32>(database_size as usize)?;
let db_u32 = database_size as u32;
let fill_blocks: u32 = (db_u32 + block_size - 1) / block_size;
let fill_blocks_i32: i32 = fill_blocks as i32;
let mut d_block_counts = stream.alloc_zeros::<i32>((fill_blocks as usize) * (num_clusters as usize))?;
let mut d_block_offsets = stream.alloc_zeros::<i32>((fill_blocks as usize) * (num_clusters as usize))?;
let count_config = LaunchConfig {
grid_dim: (fill_blocks, 1, 1),
block_dim: (block_size, 1, 1),
shared_mem_bytes: ((num_clusters as u32) * block_size * 4) as u32,
};
unsafe {
stream
.launch_builder(&count_block_cluster_sizes)
.arg(&d_cluster_assignments)
.arg(&mut d_block_counts)
.arg(&database_size)
.arg(&num_clusters)
.launch(count_config)?;
}
let scan_blocks_config = LaunchConfig {
grid_dim: (1, 1, 1),
block_dim: (1, 1, 1),
shared_mem_bytes: 0,
};
unsafe {
stream
.launch_builder(&reduce_block_counts)
.arg(&d_block_counts)
.arg(&mut d_cluster_sizes)
.arg(&fill_blocks_i32)
.arg(&num_clusters)
.launch(scan_blocks_config)?;
}
unsafe {
stream
.launch_builder(&exclusive_scan_sizes)
.arg(&d_cluster_sizes)
.arg(&mut d_cluster_offsets)
.arg(&mut d_write_offsets)
.arg(&num_clusters)
.launch(scan_config)?;
}
unsafe {
stream
.launch_builder(&exclusive_scan_block_counts)
.arg(&d_cluster_offsets)
.arg(&d_block_counts)
.arg(&mut d_block_offsets)
.arg(&fill_blocks_i32)
.arg(&num_clusters)
.launch(scan_blocks_config)?;
}
let build_config = LaunchConfig {
grid_dim: (fill_blocks, 1, 1),
block_dim: (block_size, 1, 1),
shared_mem_bytes: ((num_clusters as u32) * block_size * 4) as u32,
};
unsafe {
stream
.launch_builder(&parallel_build_cluster_index)
.arg(&d_cluster_assignments)
.arg(&d_block_offsets)
.arg(&mut d_cluster_indices)
.arg(&database_size)
.arg(&num_clusters)
.launch(build_config)?;
}
let mut d_results = stream.alloc_zeros::<i32>(num_queries as usize)?;
let search_config = if num_queries <= 3000 {
LaunchConfig {
grid_dim: (num_queries as u32, 1, 1),
block_dim: (1, 1, 1),
shared_mem_bytes: 0,
}
} else {
let threads_per_block = if vector_dims >= 720 {
256
} else {
128
};
let blocks = ((num_queries as u32) + threads_per_block - 1) / threads_per_block;
LaunchConfig {
grid_dim: (blocks.min(2048), 1, 1),
block_dim: (threads_per_block, 1, 1),
shared_mem_bytes: 0,
}
};
unsafe {
stream
.launch_builder(&cluster_search)
.arg(&challenge.d_query_vectors)
.arg(&challenge.d_database_vectors)
.arg(&d_cluster_centers)
.arg(&d_cluster_assignments)
.arg(&d_cluster_sizes)
.arg(&d_cluster_indices)
.arg(&d_cluster_offsets)
.arg(&mut d_results)
.arg(&num_queries)
.arg(&database_size)
.arg(&vector_dims)
.arg(&num_clusters)
.launch(search_config)?;
}
stream.synchronize()?;
let indices: Vec<i32> = stream.memcpy_dtov(&d_results)?;
let indexes = indices.iter().map(|&idx| idx as usize).collect();
let _ = save_solution(&Solution { indexes });
return Ok(());
}
pub fn help() {
println!("No help information available.");
}

View File

@ -148,7 +148,8 @@
// c004_a075
// c004_a076
pub mod cluster_new;
pub use cluster_new as c004_a076;
// c004_a077