diff --git a/tig-algorithms/src/vector_search/autovector_native_v2/README.md b/tig-algorithms/src/vector_search/autovector_native_v2/README.md
new file mode 100644
index 00000000..e9bc3b51
--- /dev/null
+++ b/tig-algorithms/src/vector_search/autovector_native_v2/README.md
@@ -0,0 +1,61 @@
+# AutoVector Native v2.0-beta
+
+**High-Performance Vector Range Search for the TIG Challenge**
+
+---
+
+## Submission Details
+
+* **Challenge Name:** vector_search
+* **Algorithm Name:** autovector_native_v2
+* **Copyright:** 2025 Brent Beane
+* **Identity of Submitter:** Brent Beane
+* **Identity of Creator of Algorithmic Method:** null
+* **Unique Algorithm Identifier (UAI):** null
+
+## License
+
+The files in this folder are under the following licenses:
+
+* TIG Benchmarker Outbound License
+* TIG Commercial License
+* TIG Inbound Game License
+* TIG Innovator Outbound Game License
+* TIG Open Data License
+* TIG THV Game License
+
+Copies of the licenses can be obtained at:
+https://github.com/tig-foundation/tig-monorepo/tree/main/docs/licenses
+
+## Overview
+
+AutoVector Native is a GPU-accelerated vector range search algorithm engineered for **maximum throughput**, **deterministic execution**, and **resilience in fuel-limited environments**. Designed specifically for the TIG Vector Search Challenge, it combines **adaptive clustering**, **coalesced memory access**, and **batched solution persistence** to deliver state-of-the-art performance without sacrificing compliance.
+
+Unlike conventional approaches that rely on complex synchronization or approximate filtering, AutoVector Native uses a **radical simplification strategy**: fixed small cluster counts, exhaustive intra-cluster search, and minimal kernel overhead, enabling predictable, scalable performance across difficulty levels.
+
+This version (`v2.0-beta`) introduces Sigma I & II compliance, dynamic probe scheduling, and hybrid CPU-GPU coordination for robust operation under real contest constraints.
+
+---
+
+## Key Features
+
+| Feature | Description |
+|-------|-------------|
+| ✅ **Sigma I Compliant** | No oracle access; only uses permitted challenge fields |
+| ✅ **Sigma II Compliant** | Saves partial solutions after every query batch (256 queries) |
+| 🔁 **Adaptive Clustering** | Dynamically computes the optimal centroid count based on database size, dimensionality, and query load |
+| 🧠 **Multi-Probe Search** | Supports both exhaustive and adaptive probe strategies via hyperparameters |
+| ⚙️ **GPU-Optimized Kernels** | All core operations (centroid selection, assignment, indexing, search) are executed on the GPU |
+| 💾 **Progress Preservation** | Batched `save_solution()` ensures partial results survive fuel exhaustion |
+| 🚀 **High Occupancy** | Uses 256-thread blocks, coalesced memory access, and shared memory where safe |
+
+---
+
+## Algorithm Design
+
+### 1. **Centroid Initialization**
+
+- Centroids are selected via strided sampling from the database vectors using `select_centroids_strided`.
+- The number of centroids is adaptively scaled, as shown below:
+
+```rust
+base = √N
+dim_factor = 1.0 + ln(dim / 100)
+query_factor = √(num_queries / 10)
+adaptive_count = (base × dim_factor / query_factor).clamp(8, 64)
+```
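+
+For reference, the same heuristic as host-side Rust; this mirrors `calculate_adaptive_clusters` in `mod.rs`, and the effective count additionally respects the `num_centroids` hyperparameter (`hps.num_centroids.min(adaptive_count)`):
+
+```rust
+fn calculate_adaptive_clusters(database_size: u32, vector_dims: u32, num_queries: u32) -> u32 {
+    let base_clusters = (database_size as f32).sqrt() as u32;
+    let dim_factor = 1.0 + (vector_dims as f32 / 100.0).ln();
+    let query_factor = (num_queries as f32 / 10.0).sqrt().max(1.0);
+
+    // Example: database_size = 100_000, vector_dims = 250, num_queries = 1_000
+    // gives roughly (316 * 1.92 / 10) ≈ 60 centroids before clamping.
+    let adaptive_count = (base_clusters as f32 * dim_factor / query_factor) as u32;
+    adaptive_count.clamp(8, 64)
+}
+```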
diff --git a/tig-algorithms/src/vector_search/autovector_native_v2/mod.rs b/tig-algorithms/src/vector_search/autovector_native_v2/mod.rs
new file mode 100644
index 00000000..721ee41e
--- /dev/null
+++ b/tig-algorithms/src/vector_search/autovector_native_v2/mod.rs
@@ -0,0 +1,276 @@
+use anyhow::Result;
+use cudarc::driver::{safe::{LaunchConfig, CudaModule, CudaStream}, CudaSlice, PushKernelArg};
+use cudarc::runtime::sys::cudaDeviceProp;
+use serde::{Deserialize, Serialize};
+use serde_json::{Map, Value};
+use std::sync::Arc;
+use tig_challenges::vector_search::*;
+
+#[derive(Serialize, Deserialize, Clone)]
+pub struct Hyperparameters {
+    pub num_centroids: u32,
+    pub search_clusters: u32,
+    pub multi_probe_boost: u32,
+}
+
+impl Default for Hyperparameters {
+    fn default() -> Self {
+        Self {
+            num_centroids: 4,
+            search_clusters: 2,
+            multi_probe_boost: 2,
+        }
+    }
+}
+
+fn calculate_adaptive_clusters(database_size: u32, vector_dims: u32, num_queries: u32) -> u32 {
+    let base_clusters = (database_size as f32).sqrt() as u32;
+    let dim_factor = 1.0 + (vector_dims as f32 / 100.0).ln();
+    let query_factor = (num_queries as f32 / 10.0).sqrt().max(1.0);
+
+    let adaptive_count = (base_clusters as f32 * dim_factor / query_factor) as u32;
+    adaptive_count.clamp(8, 64)
+}
+
+fn l2_squared(a: &[f32], b: &[f32]) -> f32 {
+    a.iter().zip(b.iter()).map(|(x, y)| (x - y).powi(2)).sum()
+}
+
+fn build_probe_list_adaptive(
+    query: &[f32],
+    centroids: &[f32],
+    num_centroids: u32,
+    dim: u32,
+    total_probes: u32,
+) -> Vec<u32> {
+    let mut distances: Vec<(u32, f32)> = (0..num_centroids)
+        .map(|i| {
+            let start = (i * dim) as usize;
+            let end = start + dim as usize;
+            let centroid = &centroids[start..end];
+            let dist = l2_squared(query, centroid);
+            (i, dist)
+        })
+        .collect();
+
+    distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
+
+    distances.into_iter()
+        .take(total_probes.min(num_centroids) as usize)
+        .map(|(i, _)| i)
+        .collect()
+}
+
+pub fn solve_challenge(
+    challenge: &Challenge,
+    save_solution: &dyn Fn(&Solution) -> Result<()>,
+    hyperparameters: &Option<Map<String, Value>>,
+    module: Arc<CudaModule>,
+    stream: Arc<CudaStream>,
+    prop: &cudaDeviceProp,
+) -> Result<()> {
+    let hps: Hyperparameters = hyperparameters.as_ref()
+        .and_then(|m| serde_json::from_value(serde_json::Value::Object(m.clone())).ok())
+        .unwrap_or_default();
+
+    let dim = challenge.vector_dims as u32;
+    let num_vectors = challenge.database_size as u32;
+    let num_queries = challenge.num_queries as u32;
+
+    let adaptive_centroids = calculate_adaptive_clusters(num_vectors, dim, num_queries);
+    let num_centroids = hps.num_centroids.min(adaptive_centroids);
+    let total_search_clusters = (hps.search_clusters + hps.multi_probe_boost).min(num_centroids);
+
+    let use_adaptive_probes = total_search_clusters < num_centroids;
+
+    // Select centroids on GPU
+    let d_centroids = stream.alloc_zeros::<f32>((num_centroids * dim) as usize)?;
+    let select_func = module.load_function("select_centroids_strided")?;
+
+    let total_threads = num_centroids * dim;
+    unsafe {
+        stream.launch_builder(&select_func)
+            .arg(&challenge.d_database_vectors)
+            .arg(&d_centroids)
+            .arg(&num_vectors)
+            .arg(&num_centroids)
+            .arg(&dim)
+            .launch(LaunchConfig {
+                grid_dim: ((total_threads + 255) / 256, 1, 1),
+                block_dim: (256, 1, 1),
+                shared_mem_bytes: 0
+            })?;
+    }
+
+    stream.synchronize()?;
+
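+    // Note on launch geometry: the setup kernels in this function use 256-thread
+    // blocks with a grid of ceil(work_items / 256); the search kernel further down
+    // instead launches one 256-thread block per query in the batch. Centroids are
+    // copied back to the host only when adaptive probing is enabled, since per-query
+    // probe lists are then built on the CPU from host-side centroid distances.
+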
+    let centroids = if use_adaptive_probes {
+        stream.memcpy_dtov(&d_centroids)?
+    } else {
+        Vec::new()
+    };
+
+    // Assign vectors to nearest centroids
+    let d_assignments = stream.alloc_zeros::<i32>(num_vectors as usize)?;
+    let assign_func = module.load_function("assign_to_nearest_centroid")?;
+
+    unsafe {
+        stream.launch_builder(&assign_func)
+            .arg(&challenge.d_database_vectors)
+            .arg(&d_centroids)
+            .arg(&d_assignments)
+            .arg(&num_vectors)
+            .arg(&num_centroids)
+            .arg(&dim)
+            .launch(LaunchConfig {
+                grid_dim: ((num_vectors + 255) / 256, 1, 1),
+                block_dim: (256, 1, 1),
+                shared_mem_bytes: 0
+            })?;
+    }
+
+    stream.synchronize()?;
+
+    // GPU-based cluster building
+    let d_cluster_sizes = stream.alloc_zeros::<i32>(num_centroids as usize)?;
+    let count_func = module.load_function("count_cluster_sizes")?;
+
+    unsafe {
+        stream.launch_builder(&count_func)
+            .arg(&d_assignments)
+            .arg(&d_cluster_sizes)
+            .arg(&num_vectors)
+            .launch(LaunchConfig {
+                grid_dim: ((num_vectors + 255) / 256, 1, 1),
+                block_dim: (256, 1, 1),
+                shared_mem_bytes: 0
+            })?;
+    }
+
+    stream.synchronize()?;
+
+    let cluster_sizes: Vec<i32> = stream.memcpy_dtov(&d_cluster_sizes)?;
+
+    let mut cluster_offsets: Vec<i32> = vec![0];
+    let mut total = 0i32;
+    for &size in &cluster_sizes {
+        total += size;
+        cluster_offsets.push(total);
+    }
+
+    let d_cluster_indices = stream.alloc_zeros::<i32>(num_vectors as usize)?;
+    let d_cluster_offsets = stream.memcpy_stod(&cluster_offsets)?;
+    let d_cluster_positions = stream.alloc_zeros::<i32>(num_centroids as usize)?;
+
+    let build_func = module.load_function("build_cluster_indices")?;
+    unsafe {
+        stream.launch_builder(&build_func)
+            .arg(&d_assignments)
+            .arg(&d_cluster_offsets)
+            .arg(&d_cluster_indices)
+            .arg(&d_cluster_positions)
+            .arg(&num_vectors)
+            .launch(LaunchConfig {
+                grid_dim: ((num_vectors + 255) / 256, 1, 1),
+                block_dim: (256, 1, 1),
+                shared_mem_bytes: 0
+            })?;
+    }
+
+    stream.synchronize()?;
+
+    let d_cluster_sizes = stream.memcpy_stod(&cluster_sizes)?;
+
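+    // Note on the cluster layout: it is CSR-like. With cluster_sizes [3, 1, 2],
+    // cluster_offsets becomes [0, 3, 4, 6], and cluster c owns
+    // cluster_indices[offsets[c]..offsets[c + 1]]. build_cluster_indices fills those
+    // slots using per-cluster atomic counters, so ordering within a cluster depends
+    // on thread scheduling, but membership is exact.
+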
+    let h_queries = if use_adaptive_probes {
+        stream.memcpy_dtov(&challenge.d_query_vectors)?
+    } else {
+        Vec::new()
+    };
+
+    let cluster_search = module.load_function("search_coalesced_multiquery")?;
+    let mut d_results = stream.alloc_zeros::<i32>(num_queries as usize)?;
+
+    let shared_mem_centroids = (num_centroids * dim * 4) as u32;
+    let max_shared = prop.sharedMemPerBlock as u32;
+    let use_shared_mem = shared_mem_centroids < (max_shared / 2);
+    let use_shared_flag = if use_shared_mem { 1i32 } else { 0i32 };
+
+    let batch_size = 256;
+
+    for query_batch_start in (0..num_queries).step_by(batch_size) {
+        let query_batch_end = (query_batch_start + batch_size as u32).min(num_queries);
+        let batch_count = query_batch_end - query_batch_start;
+
+        let all_probe_lists = if use_adaptive_probes {
+            let mut lists = Vec::new();
+            for q in query_batch_start..query_batch_end {
+                let query_start = (q * dim) as usize;
+                let query_end = query_start + dim as usize;
+                let query = &h_queries[query_start..query_end];
+
+                let probe_list = build_probe_list_adaptive(
+                    query,
+                    &centroids,
+                    num_centroids,
+                    dim,
+                    total_search_clusters
+                );
+                lists.extend(probe_list);
+            }
+            lists
+        } else {
+            let fixed_list: Vec<u32> = (0..num_centroids).collect();
+            fixed_list.repeat(batch_count as usize)
+        };
+
+        let d_probe_lists = stream.memcpy_stod(&all_probe_lists)?;
+
+        let search_config = LaunchConfig {
+            grid_dim: (batch_count, 1, 1),
+            block_dim: (256, 1, 1),
+            shared_mem_bytes: if use_shared_mem { shared_mem_centroids } else { 0 },
+        };
+
+        unsafe {
+            stream.launch_builder(&cluster_search)
+                .arg(&challenge.d_query_vectors)
+                .arg(&challenge.d_database_vectors)
+                .arg(&d_centroids)
+                .arg(&d_cluster_indices)
+                .arg(&d_cluster_sizes)
+                .arg(&d_cluster_offsets)
+                .arg(&d_probe_lists)
+                .arg(&mut d_results)
+                .arg(&query_batch_start)
+                .arg(&batch_count)
+                .arg(&num_vectors)
+                .arg(&dim)
+                .arg(&num_centroids)
+                .arg(&total_search_clusters)
+                .arg(&use_shared_flag)
+                .launch(search_config)?;
+        }
+
+        stream.synchronize()?;
+
+        // SIGMA UPDATE: Save partial solution after each batch
+        // If we run out of fuel, this ensures we have results for completed queries
+        let partial_indices: Vec<i32> = stream.memcpy_dtov(&d_results)?;
+        let partial_indexes = partial_indices.iter().map(|&idx| idx as usize).collect();
+        save_solution(&Solution { indexes: partial_indexes })?;
+    }
+
+    // Final synchronization and solution save
+    stream.synchronize()?;
+
+    let indices: Vec<i32> = stream.memcpy_dtov(&d_results)?;
+    let indexes = indices.iter().map(|&idx| idx as usize).collect();
+
+    save_solution(&Solution { indexes })?;
+
+    Ok(())
+}
+
+pub fn help() {
+    println!("No help information provided.");
+}
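+
+// Illustrative sanity tests (a sketch) for the CPU-side helpers above; they
+// exercise no GPU code and can run with a plain `cargo test`.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn adaptive_cluster_count_stays_in_bounds() {
+        // sqrt(100) = 10, dim_factor = 1, query_factor = 1
+        assert_eq!(calculate_adaptive_clusters(100, 100, 10), 10);
+        // Large and tiny inputs are clamped into the [8, 64] range
+        assert!(calculate_adaptive_clusters(1_000_000, 512, 1_000) <= 64);
+        assert!(calculate_adaptive_clusters(16, 32, 1) >= 8);
+    }
+
+    #[test]
+    fn probe_list_prefers_closest_centroid() {
+        // Two 2-d centroids at (0,0) and (10,10); the query sits next to the second.
+        let centroids = vec![0.0, 0.0, 10.0, 10.0];
+        let query = vec![9.0, 9.0];
+        let probes = build_probe_list_adaptive(&query, &centroids, 2, 2, 1);
+        assert_eq!(probes, vec![1]);
+    }
+}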
diff --git a/tig-algorithms/src/vector_search/autovector_native_v2/my_first_algo.cu b/tig-algorithms/src/vector_search/autovector_native_v2/my_first_algo.cu
new file mode 100644
index 00000000..137f98ba
--- /dev/null
+++ b/tig-algorithms/src/vector_search/autovector_native_v2/my_first_algo.cu
@@ -0,0 +1,280 @@
+#include <cuda_runtime.h>
+#include <float.h>
+
+#define WARP_SIZE 32
+
+extern "C" {
+
+// Simple vectorized distance (FAST - no cross-warp reduction overhead)
+__device__ __forceinline__ float squared_l2_distance_vectorized(
+    const float* __restrict__ a,
+    const float* __restrict__ b,
+    unsigned int dim
+) {
+    float dist = 0.0f;
+
+    if (dim >= 4 && (dim % 4 == 0)) {
+        const float4* a4 = reinterpret_cast<const float4*>(a);
+        const float4* b4 = reinterpret_cast<const float4*>(b);
+        unsigned int n4 = dim / 4;
+
+        #pragma unroll 4
+        for (unsigned int i = 0; i < n4; i++) {
+            float4 va = a4[i];
+            float4 vb = b4[i];
+
+            float dx = va.x - vb.x;
+            float dy = va.y - vb.y;
+            float dz = va.z - vb.z;
+            float dw = va.w - vb.w;
+
+            dist += dx*dx + dy*dy + dz*dz + dw*dw;
+        }
+    } else {
+        #pragma unroll 8
+        for (unsigned int i = 0; i < dim; i++) {
+            float diff = a[i] - b[i];
+            dist += diff * diff;
+        }
+    }
+
+    return dist;
+}
+
+// Atomic minimum for float with index tracking
+__device__ void atomicMinFloat(float* addr, int* idx_addr, float val, int val_idx) {
+    int* addr_as_int = (int*)addr;
+    int old = *addr_as_int;
+    int assumed;
+
+    while (true) {
+        assumed = old;
+        float current_val = __int_as_float(assumed);
+
+        if (val >= current_val) break;
+
+        old = atomicCAS(addr_as_int, assumed, __float_as_int(val));
+
+        if (assumed == old) {
+            *idx_addr = val_idx;
+            break;
+        }
+    }
+}
+
+// Centroid assignment kernel
+__global__ void assign_to_nearest_centroid(
+    const float* __restrict__ vectors,
+    const float* __restrict__ centroids,
+    int* __restrict__ assignments,
+    unsigned int num_vectors,
+    unsigned int num_centroids,
+    unsigned int dim
+) {
+    unsigned int vec_idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (vec_idx >= num_vectors) return;
+
+    const float* vec = vectors + vec_idx * dim;
+    float min_dist = FLT_MAX;
+    int best_centroid = 0;
+
+    for (unsigned int c = 0; c < num_centroids; c++) {
+        const float* cent = centroids + c * dim;
+        float dist = squared_l2_distance_vectorized(vec, cent, dim);
+
+        if (dist < min_dist) {
+            min_dist = dist;
+            best_centroid = c;
+        }
+    }
+
+    assignments[vec_idx] = best_centroid;
+}
+
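+// Note on the kernel below: one block serves one query. Each thread scans a strided
+// slice of every probed cluster and keeps a private best (distance, index) pair; the
+// block then reduces with warp shuffles followed by a shared-memory pass over the
+// per-warp minima, and thread 0 writes the winning database index for the query.
+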
+// FIXED: Multi-query search kernel with proper coalesced distance and synchronization
+__global__ void search_coalesced_multiquery(
+    const float* __restrict__ queries,
+    const float* __restrict__ database,
+    const float* __restrict__ centroids,
+    const int* __restrict__ cluster_indices,
+    const int* __restrict__ cluster_sizes,
+    const int* __restrict__ cluster_offsets,
+    const int* __restrict__ probe_lists,   // [batch_size * num_probes]
+    int* __restrict__ results,
+    int query_offset,      // Starting query index for this batch
+    int batch_size,        // Number of queries in this batch
+    int total_vectors,
+    int dim,
+    int num_centroids,
+    int num_probes,
+    int use_shared_mem
+) {
+    extern __shared__ float shared_centroids[];
+
+    int qid_local = blockIdx.x;  // Query within batch
+    if (qid_local >= batch_size) return;
+
+    int qid_global = query_offset + qid_local;
+    const float* query = queries + qid_global * dim;
+
+    // Load centroids into shared memory if enabled
+    if (use_shared_mem) {
+        int total_elements = num_centroids * dim;
+        for (int i = threadIdx.x; i < total_elements; i += blockDim.x) {
+            shared_centroids[i] = centroids[i];
+        }
+        __syncthreads();
+    }
+
+    // Get probe list for this query
+    const int* probe_list = probe_lists + qid_local * num_probes;
+
+    // Shared memory for block-level reduction
+    __shared__ float shared_best_dist;
+    __shared__ int shared_best_idx;
+
+    if (threadIdx.x == 0) {
+        shared_best_dist = FLT_MAX;
+        shared_best_idx = -1;
+    }
+    __syncthreads();
+
+    float local_best = FLT_MAX;
+    int local_idx = -1;
+
+    // Search all probed clusters
+    for (int p = 0; p < num_probes; p++) {
+        int cluster_id = probe_list[p];
+        if (cluster_id < 0 || cluster_id >= num_centroids) continue;
+
+        int offset = cluster_offsets[cluster_id];
+        int size = cluster_sizes[cluster_id];
+
+        // Each thread searches its own vectors
+        for (int i = threadIdx.x; i < size; i += blockDim.x) {
+            int vec_idx = cluster_indices[offset + i];
+            if (vec_idx >= total_vectors) continue;
+
+            const float* vec = database + vec_idx * dim;
+
+            // Simple vectorized distance (fast!)
+            float dist = squared_l2_distance_vectorized(query, vec, dim);
+
+            if (dist < local_best) {
+                local_best = dist;
+                local_idx = vec_idx;
+            }
+        }
+    }
+
+    // Warp-level reduction
+    int warp_id = threadIdx.x / WARP_SIZE;
+    int lane = threadIdx.x % WARP_SIZE;
+
+    float reduced_dist = local_best;
+    int reduced_idx = local_idx;
+
+    for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
+        float other_dist = __shfl_down_sync(0xFFFFFFFF, reduced_dist, offset);
+        int other_idx = __shfl_down_sync(0xFFFFFFFF, reduced_idx, offset);
+
+        if (other_dist < reduced_dist) {
+            reduced_dist = other_dist;
+            reduced_idx = other_idx;
+        }
+    }
+
+    // Store warp results
+    __shared__ float warp_mins[8];  // Max 256/32 = 8 warps
+    __shared__ int warp_ids[8];
+
+    if (lane == 0) {
+        warp_mins[warp_id] = reduced_dist;
+        warp_ids[warp_id] = reduced_idx;
+    }
+    __syncthreads();
+
+    // Block-level reduction (thread 0 only)
+    if (threadIdx.x == 0) {
+        float block_best = FLT_MAX;
+        int block_idx = -1;
+        int num_warps = (blockDim.x + WARP_SIZE - 1) / WARP_SIZE;
+
+        for (int i = 0; i < num_warps; i++) {
+            if (warp_mins[i] < block_best) {
+                block_best = warp_mins[i];
+                block_idx = warp_ids[i];
+            }
+        }
+
+        if (block_idx != -1 && block_best < shared_best_dist) {
+            shared_best_dist = block_best;
+            shared_best_idx = block_idx;
+        }
+    }
+    __syncthreads();
+
+    // Write final result
+    if (threadIdx.x == 0) {
+        results[qid_global] = shared_best_idx;
+    }
+}
+
+// GPU kernel to copy selected vectors as centroids
+__global__ void select_centroids_strided(
+    const float* __restrict__ vectors,
+    float* __restrict__ centroids,
+    unsigned int num_vectors,
+    unsigned int num_centroids,
+    unsigned int dim
+) {
+    unsigned int c_idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (c_idx >= num_centroids * dim) return;
+
+    unsigned int centroid_id = c_idx / dim;
+    unsigned int feature_id = c_idx % dim;
+
+    // Strided sampling: every N-th vector
+    unsigned int step = num_vectors / num_centroids;
+    if (step == 0) step = 1;
+    unsigned int vec_idx = (centroid_id * step) % num_vectors;
+
+    centroids[c_idx] = vectors[vec_idx * dim + feature_id];
+}
+
+// GPU kernel to count vectors per cluster (pass 1)
+__global__ void count_cluster_sizes(
+    const int* __restrict__ assignments,
+    int* __restrict__ cluster_sizes,
+    unsigned int num_vectors
+) {
+    unsigned int vec_idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (vec_idx >= num_vectors) return;
+
+    int cluster_id = assignments[vec_idx];
+    atomicAdd(&cluster_sizes[cluster_id], 1);
+}
+
+// GPU kernel to build cluster indices (pass 2)
+__global__ void build_cluster_indices(
+    const int* __restrict__ assignments,
+    const int* __restrict__ cluster_offsets,
+    int* __restrict__ cluster_indices,
+    int* __restrict__ cluster_positions,  // Temp array for atomic counters
+    unsigned int num_vectors
+) {
+    unsigned int vec_idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (vec_idx >= num_vectors) return;
+
+    int cluster_id = assignments[vec_idx];
+    int offset = cluster_offsets[cluster_id];
+    int pos = atomicAdd(&cluster_positions[cluster_id], 1);
+
+    cluster_indices[offset + pos] = vec_idx;
+}
+
+}  // extern "C"
diff --git a/tig-algorithms/src/vector_search/mod.rs b/tig-algorithms/src/vector_search/mod.rs
index ae3472ac..7b4d0bb9 100644
--- a/tig-algorithms/src/vector_search/mod.rs
+++ b/tig-algorithms/src/vector_search/mod.rs
@@ -152,7 +152,8 @@
 // c004_a077
-// c004_a078
+pub mod autovector_native_v2;
+pub use autovector_native_v2 as c004_a078;
 // c004_a079
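
As a usage illustration: `solve_challenge` deserializes its optional hyperparameter map into the `Hyperparameters` struct and falls back to the defaults (`num_centroids: 4`, `search_clusters: 2`, `multi_probe_boost: 2`) when the map is absent or does not contain all three fields. A map equivalent to the sketch below would therefore be a valid override; how such a map is supplied at runtime depends on the TIG benchmarker.

```rust
// Illustrative only: the shape of the hyperparameter map solve_challenge accepts.
// If any of the three fields is missing or mistyped, the defaults are used instead.
use serde_json::{json, Map, Value};

fn example_overrides() -> Map<String, Value> {
    match json!({
        "num_centroids": 16,
        "search_clusters": 4,
        "multi_probe_boost": 2
    }) {
        Value::Object(map) => map,
        _ => unreachable!(),
    }
}
```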