Submitted vector_search/autovector_native_v2

This commit is contained in:
FiveMovesAhead 2025-12-03 16:49:36 +00:00
parent bdc6ed6794
commit 9cd631cbd9
4 changed files with 619 additions and 1 deletion

View File

@ -0,0 +1,61 @@
# AutoVector Native v2.0-beta
**High-Performance Vector Range Search for the TIG Challenge**
---
## Submission Details
* **Challenge Name:** vector_search
* **Algorithm Name:** autovector_native_v2
* **Copyright:** 2025 Brent Beane
* **Identity of Submitter:** Brent Beane
* **Identity of Creator of Algorithmic Method:** null
* **Unique Algorithm Identifier (UAI):** null
## License
The files in this folder are under the following licenses:
* TIG Benchmarker Outbound License
* TIG Commercial License
* TIG Inbound Game License
* TIG Innovator Outbound Game License
* TIG Open Data License
* TIG THV Game License
Copies of the licenses can be obtained at:
https://github.com/tig-foundation/tig-monorepo/tree/main/docs/licenses
## Overview
AutoVector Native is a GPU-accelerated vector range search algorithm engineered for **maximum throughput**, **deterministic execution**, and **resilience under fuel-limited environments**. Designed specifically for the TIG Vector Search Challenge, it combines **adaptive clustering**, **coalesced memory access**, and **batched solution persistence** to deliver state-of-the-art performance without sacrificing compliance.
Unlike conventional approaches that rely on complex synchronization or approximate filtering, AutoVector Native uses a **radical simplification strategy**: small, capped cluster counts, exhaustive intra-cluster search, and minimal kernel overhead, which together enable predictable, scalable performance across difficulty levels.
This version (`v2.0-beta`) introduces Sigma I & II compliance, dynamic probe scheduling, and hybrid CPU-GPU coordination for robust operation under real contest constraints.
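Dynamic probe scheduling is handled on the host: when the configured probe budget does not cover every centroid, each query is ranked against the centroids on the CPU and only its nearest clusters are searched on the GPU. A minimal sketch of that selection step, mirroring the `build_probe_list_adaptive` helper in the solver source below:
```rust
/// Rank centroids by squared L2 distance to a query and keep the closest few.
/// Mirrors build_probe_list_adaptive in the solver; `dim` is the vector dimensionality.
fn probe_list(query: &[f32], centroids: &[f32], dim: usize, total_probes: usize) -> Vec<u32> {
    let squared_l2 =
        |a: &[f32], b: &[f32]| -> f32 { a.iter().zip(b).map(|(x, y)| (x - y) * (x - y)).sum() };
    let num_centroids = centroids.len() / dim;
    let mut ranked: Vec<(u32, f32)> = (0..num_centroids)
        .map(|i| (i as u32, squared_l2(query, &centroids[i * dim..(i + 1) * dim])))
        .collect();
    // Closest clusters first; inputs are finite, so the unwrap is safe.
    ranked.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
    ranked
        .into_iter()
        .take(total_probes.min(num_centroids))
        .map(|(i, _)| i)
        .collect()
}
```
When the probe budget covers every centroid, this ranking is skipped and each query searches all clusters exhaustively.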
---
## Key Features
| Feature | Description |
|-------|-------------|
| ✅ **Sigma I Compliant** | No oracle access; only uses permitted challenge fields |
| ✅ **Sigma II Compliant** | Saves partial solutions after every query batch (256 queries) |
| 🔁 **Adaptive Clustering** | Dynamically computes optimal centroid count based on database size, dimensionality, and query load |
| 🧠 **Multi-Probe Search** | Supports both exhaustive and adaptive probe strategies via hyperparameters |
| ⚙️ **GPU-Optimized Kernels** | All core operations (centroid selection, assignment, indexing, search) executed on GPU |
| 💾 **Progress Preservation** | Batched `save_solution()` calls ensure partial results survive fuel exhaustion (see the sketch after this table) |
| 🚀 **High Occupancy** | Uses 256-thread blocks, coalesced memory access, and shared memory where safe |
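Sigma II compliance and progress preservation come from one batching loop: run a batch of queries, then persist everything computed so far. Below is a condensed, self-contained sketch of that pattern; `run_batch` and `save_partial` are placeholders, and the real loop in `solve_challenge` additionally copies `d_results` back from the GPU before each save.
```rust
/// Sigma II batching pattern: process queries in fixed-size batches and save a
/// partial solution after each one. `run_batch` and `save_partial` stand in for the
/// kernel launch and the challenge-provided save_solution callback.
fn batched_search<E>(
    num_queries: u32,
    batch_size: u32,
    mut run_batch: impl FnMut(u32, u32) -> Result<(), E>, // (batch_start, batch_count)
    mut save_partial: impl FnMut() -> Result<(), E>,
) -> Result<(), E> {
    for batch_start in (0..num_queries).step_by(batch_size as usize) {
        let batch_count = (batch_start + batch_size).min(num_queries) - batch_start;
        run_batch(batch_start, batch_count)?;
        // If fuel runs out after this point, results for completed batches are already saved.
        save_partial()?;
    }
    Ok(())
}
```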
---
## Algorithm Design
### 1. **Centroid Initialization**
- Centroids selected via strided sampling from database vectors using `select_centroids_strided`.
- Number of centroids adaptively scaled:
```text
base           = √N                    // N = database size
dim_factor     = 1.0 + ln(dim / 100)
query_factor   = max(1.0, √(num_queries / 10))
adaptive_count = (base × dim_factor / query_factor).clamp(8, 64)
```
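For example, with N = 100,000 vectors, `dim` = 250, and 1,000 queries: base ≈ 316, dim_factor ≈ 1.92, and query_factor = 10, giving 60 centroids before the `num_centroids` hyperparameter cap is applied. A self-contained version of the formula, matching `calculate_adaptive_clusters` in the solver source below:
```rust
/// Adaptive centroid count, as implemented by calculate_adaptive_clusters in the solver.
fn adaptive_clusters(database_size: u32, vector_dims: u32, num_queries: u32) -> u32 {
    let base = (database_size as f32).sqrt() as u32;
    let dim_factor = 1.0 + (vector_dims as f32 / 100.0).ln();
    let query_factor = (num_queries as f32 / 10.0).sqrt().max(1.0);
    ((base as f32 * dim_factor / query_factor) as u32).clamp(8, 64)
}

fn main() {
    // 100k vectors, 250 dims, 1k queries -> 60 centroids (before the num_centroids cap)
    assert_eq!(adaptive_clusters(100_000, 250, 1_000), 60);
}
```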

View File

@ -0,0 +1,276 @@
use anyhow::Result;
use cudarc::driver::{safe::{LaunchConfig, CudaModule, CudaStream}, CudaSlice, PushKernelArg};
use cudarc::runtime::sys::cudaDeviceProp;
use serde::{Deserialize, Serialize};
use serde_json::{Map, Value};
use std::sync::Arc;
use tig_challenges::vector_search::*;
#[derive(Serialize, Deserialize, Clone)]
pub struct Hyperparameters {
pub num_centroids: u32,
pub search_clusters: u32,
pub multi_probe_boost: u32,
}
impl Default for Hyperparameters {
fn default() -> Self {
Self {
num_centroids: 4,
search_clusters: 2,
multi_probe_boost: 2,
}
}
}
fn calculate_adaptive_clusters(database_size: u32, vector_dims: u32, num_queries: u32) -> u32 {
let base_clusters = (database_size as f32).sqrt() as u32;
let dim_factor = 1.0 + (vector_dims as f32 / 100.0).ln();
let query_factor = (num_queries as f32 / 10.0).sqrt().max(1.0);
let adaptive_count = (base_clusters as f32 * dim_factor / query_factor) as u32;
adaptive_count.clamp(8, 64)
}
fn l2_squared(a: &[f32], b: &[f32]) -> f32 {
a.iter().zip(b.iter()).map(|(x, y)| (x - y).powi(2)).sum()
}
fn build_probe_list_adaptive(
query: &[f32],
centroids: &[f32],
num_centroids: u32,
dim: u32,
total_probes: u32,
) -> Vec<u32> {
let mut distances: Vec<(u32, f32)> = (0..num_centroids)
.map(|i| {
let start = (i * dim) as usize;
let end = start + dim as usize;
let centroid = &centroids[start..end];
let dist = l2_squared(query, centroid);
(i, dist)
})
.collect();
distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
distances.into_iter()
.take(total_probes.min(num_centroids) as usize)
.map(|(i, _)| i)
.collect()
}
pub fn solve_challenge(
challenge: &Challenge,
save_solution: &dyn Fn(&Solution) -> Result<()>,
hyperparameters: &Option<Map<String, Value>>,
module: Arc<CudaModule>,
stream: Arc<CudaStream>,
prop: &cudaDeviceProp,
) -> Result<()> {
let hps: Hyperparameters = hyperparameters.as_ref()
.and_then(|m| serde_json::from_value(serde_json::Value::Object(m.clone())).ok())
.unwrap_or_default();
let dim = challenge.vector_dims as u32;
let num_vectors = challenge.database_size as u32;
let num_queries = challenge.num_queries as u32;
let adaptive_centroids = calculate_adaptive_clusters(num_vectors, dim, num_queries);
let num_centroids = hps.num_centroids.min(adaptive_centroids);
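// Probe budget per query; if it covers fewer than all centroids, adaptive per-query probe lists are built on the host.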
let total_search_clusters = (hps.search_clusters + hps.multi_probe_boost).min(num_centroids);
let use_adaptive_probes = total_search_clusters < num_centroids;
// Select centroids on GPU
let d_centroids = stream.alloc_zeros::<f32>((num_centroids * dim) as usize)?;
let select_func = module.load_function("select_centroids_strided")?;
let total_threads = num_centroids * dim;
unsafe {
stream.launch_builder(&select_func)
.arg(&challenge.d_database_vectors)
.arg(&d_centroids)
.arg(&num_vectors)
.arg(&num_centroids)
.arg(&dim)
.launch(LaunchConfig {
grid_dim: ((total_threads + 255) / 256, 1, 1),
block_dim: (256, 1, 1),
shared_mem_bytes: 0
})?;
}
stream.synchronize()?;
let centroids = if use_adaptive_probes {
stream.memcpy_dtov(&d_centroids)?
} else {
Vec::new()
};
// Assign vectors to nearest centroids
let d_assignments = stream.alloc_zeros::<i32>(num_vectors as usize)?;
let assign_func = module.load_function("assign_to_nearest_centroid")?;
unsafe {
stream.launch_builder(&assign_func)
.arg(&challenge.d_database_vectors)
.arg(&d_centroids)
.arg(&d_assignments)
.arg(&num_vectors)
.arg(&num_centroids)
.arg(&dim)
.launch(LaunchConfig {
grid_dim: ((num_vectors + 255) / 256, 1, 1),
block_dim: (256, 1, 1),
shared_mem_bytes: 0
})?;
}
stream.synchronize()?;
// GPU-based cluster building
let d_cluster_sizes = stream.alloc_zeros::<i32>(num_centroids as usize)?;
let count_func = module.load_function("count_cluster_sizes")?;
unsafe {
stream.launch_builder(&count_func)
.arg(&d_assignments)
.arg(&d_cluster_sizes)
.arg(&num_vectors)
.launch(LaunchConfig {
grid_dim: ((num_vectors + 255) / 256, 1, 1),
block_dim: (256, 1, 1),
shared_mem_bytes: 0
})?;
}
stream.synchronize()?;
let cluster_sizes: Vec<i32> = stream.memcpy_dtov(&d_cluster_sizes)?;
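// Exclusive prefix sum over cluster sizes: cluster_offsets[c] is the start of cluster c in the packed index array.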
let mut cluster_offsets: Vec<i32> = vec![0];
let mut total = 0i32;
for &size in &cluster_sizes {
total += size;
cluster_offsets.push(total);
}
let d_cluster_indices = stream.alloc_zeros::<i32>(num_vectors as usize)?;
let d_cluster_offsets = stream.memcpy_stod(&cluster_offsets)?;
let d_cluster_positions = stream.alloc_zeros::<i32>(num_centroids as usize)?;
let build_func = module.load_function("build_cluster_indices")?;
unsafe {
stream.launch_builder(&build_func)
.arg(&d_assignments)
.arg(&d_cluster_offsets)
.arg(&d_cluster_indices)
.arg(&d_cluster_positions)
.arg(&num_vectors)
.launch(LaunchConfig {
grid_dim: ((num_vectors + 255) / 256, 1, 1),
block_dim: (256, 1, 1),
shared_mem_bytes: 0
})?;
}
stream.synchronize()?;
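// Upload the host copy of the cluster sizes as the device buffer passed to the search kernel (shadows the counting buffer).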
let d_cluster_sizes = stream.memcpy_stod(&cluster_sizes)?;
let h_queries = if use_adaptive_probes {
stream.memcpy_dtov(&challenge.d_query_vectors)?
} else {
Vec::new()
};
let cluster_search = module.load_function("search_coalesced_multiquery")?;
let mut d_results = stream.alloc_zeros::<i32>(num_queries as usize)?;
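// Stage centroids in dynamic shared memory only when they fit comfortably, i.e. under half the per-block shared memory limit.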
let shared_mem_centroids = (num_centroids * dim * 4) as u32;
let max_shared = prop.sharedMemPerBlock as u32;
let use_shared_mem = shared_mem_centroids < (max_shared / 2);
let use_shared_flag = if use_shared_mem { 1i32 } else { 0i32 };
let batch_size = 256;
for query_batch_start in (0..num_queries).step_by(batch_size) {
let query_batch_end = (query_batch_start + batch_size as u32).min(num_queries);
let batch_count = query_batch_end - query_batch_start;
let all_probe_lists = if use_adaptive_probes {
let mut lists = Vec::new();
for q in query_batch_start..query_batch_end {
let query_start = (q * dim) as usize;
let query_end = query_start + dim as usize;
let query = &h_queries[query_start..query_end];
let probe_list = build_probe_list_adaptive(
query,
&centroids,
num_centroids,
dim,
total_search_clusters
);
lists.extend(probe_list);
}
lists
} else {
let fixed_list: Vec<u32> = (0..num_centroids).collect();
fixed_list.repeat(batch_count as usize)
};
let d_probe_lists = stream.memcpy_stod(&all_probe_lists)?;
let search_config = LaunchConfig {
grid_dim: (batch_count, 1, 1),
block_dim: (256, 1, 1),
shared_mem_bytes: if use_shared_mem { shared_mem_centroids } else { 0 },
};
unsafe {
stream.launch_builder(&cluster_search)
.arg(&challenge.d_query_vectors)
.arg(&challenge.d_database_vectors)
.arg(&d_centroids)
.arg(&d_cluster_indices)
.arg(&d_cluster_sizes)
.arg(&d_cluster_offsets)
.arg(&d_probe_lists)
.arg(&mut d_results)
.arg(&query_batch_start)
.arg(&batch_count)
.arg(&num_vectors)
.arg(&dim)
.arg(&num_centroids)
.arg(&total_search_clusters)
.arg(&use_shared_flag)
.launch(search_config)?;
}
stream.synchronize()?;
// SIGMA UPDATE: Save partial solution after each batch
// If we run out of fuel, this ensures we have results for completed queries
let partial_indices: Vec<i32> = stream.memcpy_dtov(&d_results)?;
let partial_indexes = partial_indices.iter().map(|&idx| idx as usize).collect();
save_solution(&Solution { indexes: partial_indexes })?;
}
// Final synchronization and solution save
stream.synchronize()?;
let indices: Vec<i32> = stream.memcpy_dtov(&d_results)?;
let indexes = indices.iter().map(|&idx| idx as usize).collect();
save_solution(&Solution { indexes })?;
Ok(())
}
pub fn help() {
println!("No help information provided.");
}

View File

@ -0,0 +1,280 @@
#include <cuda_runtime.h>
#include <float.h>
#define WARP_SIZE 32
extern "C" {
// Simple vectorized distance (FAST - no cross-warp reduction overhead)
__device__ __forceinline__ float squared_l2_distance_vectorized(
const float* __restrict__ a,
const float* __restrict__ b,
unsigned int dim
) {
float dist = 0.0f;
if (dim >= 4 && (dim % 4 == 0)) {
const float4* a4 = reinterpret_cast<const float4*>(a);
const float4* b4 = reinterpret_cast<const float4*>(b);
unsigned int n4 = dim / 4;
#pragma unroll 4
for (unsigned int i = 0; i < n4; i++) {
float4 va = a4[i];
float4 vb = b4[i];
float dx = va.x - vb.x;
float dy = va.y - vb.y;
float dz = va.z - vb.z;
float dw = va.w - vb.w;
dist += dx*dx + dy*dy + dz*dz + dw*dw;
}
} else {
#pragma unroll 8
for (unsigned int i = 0; i < dim; i++) {
float diff = a[i] - b[i];
dist += diff * diff;
}
}
return dist;
}
// Atomic minimum for float with index tracking. Note: not used by the kernels below, and the
// index write is not atomic with the CAS, so the stored index can briefly lag the stored distance under contention.
__device__ void atomicMinFloat(float* addr, int* idx_addr, float val, int val_idx) {
int* addr_as_int = (int*)addr;
int old = *addr_as_int;
int assumed;
while (true) {
assumed = old;
float current_val = __int_as_float(assumed);
if (val >= current_val) break;
old = atomicCAS(addr_as_int, assumed, __float_as_int(val));
if (assumed == old) {
*idx_addr = val_idx;
break;
}
}
}
// Centroid assignment kernel
__global__ void assign_to_nearest_centroid(
const float* __restrict__ vectors,
const float* __restrict__ centroids,
int* __restrict__ assignments,
unsigned int num_vectors,
unsigned int num_centroids,
unsigned int dim
) {
unsigned int vec_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (vec_idx >= num_vectors) return;
const float* vec = vectors + vec_idx * dim;
float min_dist = FLT_MAX;
int best_centroid = 0;
for (unsigned int c = 0; c < num_centroids; c++) {
const float* cent = centroids + c * dim;
float dist = squared_l2_distance_vectorized(vec, cent, dim);
if (dist < min_dist) {
min_dist = dist;
best_centroid = c;
}
}
assignments[vec_idx] = best_centroid;
}
// FIXED: Multi-query search kernel with proper coalesced distance and synchronization
__global__ void search_coalesced_multiquery(
const float* __restrict__ queries,
const float* __restrict__ database,
const float* __restrict__ centroids,
const int* __restrict__ cluster_indices,
const int* __restrict__ cluster_sizes,
const int* __restrict__ cluster_offsets,
const int* __restrict__ probe_lists, // [batch_size * num_probes]
int* __restrict__ results,
int query_offset, // Starting query index for this batch
int batch_size, // Number of queries in this batch
int total_vectors,
int dim,
int num_centroids,
int num_probes,
int use_shared_mem
) {
extern __shared__ float shared_centroids[];
int qid_local = blockIdx.x; // Query within batch
if (qid_local >= batch_size) return;
int qid_global = query_offset + qid_local;
const float* query = queries + qid_global * dim;
// Load centroids into shared memory if enabled (note: the staged copy is not read again in this
// kernel; the search loop below reads database vectors directly from global memory)
if (use_shared_mem) {
int total_elements = num_centroids * dim;
for (int i = threadIdx.x; i < total_elements; i += blockDim.x) {
shared_centroids[i] = centroids[i];
}
__syncthreads();
}
// Get probe list for this query
const int* probe_list = probe_lists + qid_local * num_probes;
// Shared memory for block-level reduction
__shared__ float shared_best_dist;
__shared__ int shared_best_idx;
if (threadIdx.x == 0) {
shared_best_dist = FLT_MAX;
shared_best_idx = -1;
}
__syncthreads();
float local_best = FLT_MAX;
int local_idx = -1;
// Search all probed clusters
for (int p = 0; p < num_probes; p++) {
int cluster_id = probe_list[p];
if (cluster_id < 0 || cluster_id >= num_centroids) continue;
int offset = cluster_offsets[cluster_id];
int size = cluster_sizes[cluster_id];
// Each thread searches its own vectors
for (int i = threadIdx.x; i < size; i += blockDim.x) {
int vec_idx = cluster_indices[offset + i];
if (vec_idx >= total_vectors) continue;
const float* vec = database + vec_idx * dim;
// Simple vectorized distance (fast!)
float dist = squared_l2_distance_vectorized(query, vec, dim);
if (dist < local_best) {
local_best = dist;
local_idx = vec_idx;
}
}
}
// Warp-level reduction
int warp_id = threadIdx.x / WARP_SIZE;
int lane = threadIdx.x % WARP_SIZE;
float reduced_dist = local_best;
int reduced_idx = local_idx;
for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
float other_dist = __shfl_down_sync(0xFFFFFFFF, reduced_dist, offset);
int other_idx = __shfl_down_sync(0xFFFFFFFF, reduced_idx, offset);
if (other_dist < reduced_dist) {
reduced_dist = other_dist;
reduced_idx = other_idx;
}
}
// Store warp results
__shared__ float warp_mins[8]; // Max 256/32 = 8 warps
__shared__ int warp_ids[8];
if (lane == 0) {
warp_mins[warp_id] = reduced_dist;
warp_ids[warp_id] = reduced_idx;
}
__syncthreads();
// Block-level reduction (thread 0 only)
if (threadIdx.x == 0) {
float block_best = FLT_MAX;
int block_idx = -1;
int num_warps = (blockDim.x + WARP_SIZE - 1) / WARP_SIZE;
for (int i = 0; i < num_warps; i++) {
if (warp_mins[i] < block_best) {
block_best = warp_mins[i];
block_idx = warp_ids[i];
}
}
if (block_idx != -1 && block_best < shared_best_dist) {
shared_best_dist = block_best;
shared_best_idx = block_idx;
}
}
__syncthreads();
// Write final result
if (threadIdx.x == 0) {
results[qid_global] = shared_best_idx;
}
}
// GPU kernel to copy selected vectors as centroids
__global__ void select_centroids_strided(
const float* __restrict__ vectors,
float* __restrict__ centroids,
unsigned int num_vectors,
unsigned int num_centroids,
unsigned int dim
) {
unsigned int c_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (c_idx >= num_centroids * dim) return;
unsigned int centroid_id = c_idx / dim;
unsigned int feature_id = c_idx % dim;
// Strided sampling: every N-th vector
unsigned int step = num_vectors / num_centroids;
if (step == 0) step = 1;
unsigned int vec_idx = (centroid_id * step) % num_vectors;
centroids[c_idx] = vectors[vec_idx * dim + feature_id];
}
// GPU kernel to count vectors per cluster (pass 1)
__global__ void count_cluster_sizes(
const int* __restrict__ assignments,
int* __restrict__ cluster_sizes,
unsigned int num_vectors
) {
unsigned int vec_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (vec_idx >= num_vectors) return;
int cluster_id = assignments[vec_idx];
atomicAdd(&cluster_sizes[cluster_id], 1);
}
// GPU kernel to build cluster indices (pass 2)
__global__ void build_cluster_indices(
const int* __restrict__ assignments,
const int* __restrict__ cluster_offsets,
int* __restrict__ cluster_indices,
int* __restrict__ cluster_positions, // Temp array for atomic counters
unsigned int num_vectors
) {
unsigned int vec_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (vec_idx >= num_vectors) return;
int cluster_id = assignments[vec_idx];
int offset = cluster_offsets[cluster_id];
int pos = atomicAdd(&cluster_positions[cluster_id], 1);
cluster_indices[offset + pos] = vec_idx;
}
} // extern "C"

View File

@ -152,7 +152,8 @@
// c004_a077
// c004_a078
pub mod autovector_native_v2;
pub use autovector_native_v2 as c004_a078;
// c004_a079