Submitted vector_search/autovector_native_v2

This commit is contained in:
FiveMovesAhead 2025-12-03 16:49:36 +00:00
parent bdc6ed6794
commit 9cd631cbd9
4 changed files with 619 additions and 1 deletion

View File

@ -0,0 +1,61 @@
# AutoVector Native v2.0-beta
**High-Performance Vector Range Search for the TIG Challenge**
---
## Submission Details
* **Challenge Name:** vector_search
* **Algorithm Name:** autovector_native_v2
* **Copyright:** 2025 Brent Beane
* **Identity of Submitter:** Brent Beane
* **Identity of Creator of Algorithmic Method:** null
* **Unique Algorithm Identifier (UAI):** null
## License
The files in this folder are under the following licenses:
* TIG Benchmarker Outbound License
* TIG Commercial License
* TIG Inbound Game License
* TIG Innovator Outbound Game License
* TIG Open Data License
* TIG THV Game License
Copies of the licenses can be obtained at:
https://github.com/tig-foundation/tig-monorepo/tree/main/docs/licenses
## Overview
AutoVector Native is a GPU-accelerated vector range search algorithm engineered for **maximum throughput**, **deterministic execution**, and **resilience under fuel-limited environments**. Designed specifically for the TIG Vector Search Challenge, it combines **adaptive clustering**, **coalesced memory access**, and **batched solution persistence** to deliver state-of-the-art performance without sacrificing compliance.
Unlike conventional approaches that rely on complex synchronization or approximate filtering, AutoVector Native uses a **radical simplification strategy**: small, capped cluster counts, exhaustive intra-cluster search, and minimal kernel overhead, which together enable predictable, scalable performance across difficulty levels.
This version (`v2.0-beta`) introduces Sigma I & II compliance, dynamic probe scheduling, and hybrid CPU-GPU coordination for robust operation under real contest constraints.
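Dynamic probe scheduling is handled on the host: when the configured probe budget does not cover every centroid, each query is ranked against the centroids on the CPU and only its nearest clusters are searched on the GPU. A minimal sketch of that selection step, mirroring the `build_probe_list_adaptive` helper in the solver source below:
```rust
/// Rank centroids by squared L2 distance to a query and keep the closest few.
/// Mirrors build_probe_list_adaptive in the solver; `dim` is the vector dimensionality.
fn probe_list(query: &[f32], centroids: &[f32], dim: usize, total_probes: usize) -> Vec<u32> {
    let squared_l2 =
        |a: &[f32], b: &[f32]| -> f32 { a.iter().zip(b).map(|(x, y)| (x - y) * (x - y)).sum() };
    let num_centroids = centroids.len() / dim;
    let mut ranked: Vec<(u32, f32)> = (0..num_centroids)
        .map(|i| (i as u32, squared_l2(query, &centroids[i * dim..(i + 1) * dim])))
        .collect();
    // Closest clusters first; inputs are finite, so the unwrap is safe.
    ranked.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
    ranked
        .into_iter()
        .take(total_probes.min(num_centroids))
        .map(|(i, _)| i)
        .collect()
}
```
When the probe budget covers every centroid, this ranking is skipped and each query searches all clusters exhaustively.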
---
## Key Features
| Feature | Description |
|-------|-------------|
| ✅ **Sigma I Compliant** | No oracle access; only uses permitted challenge fields |
| ✅ **Sigma II Compliant** | Saves partial solutions after every query batch (256 queries) |
| 🔁 **Adaptive Clustering** | Dynamically computes optimal centroid count based on database size, dimensionality, and query load |
| 🧠 **Multi-Probe Search** | Supports both exhaustive and adaptive probe strategies via hyperparameters |
| ⚙️ **GPU-Optimized Kernels** | All core operations (centroid selection, assignment, indexing, search) executed on GPU |
| 💾 **Progress Preservation** | Batched `save_solution()` calls ensure partial results survive fuel exhaustion (see the sketch after this table) |
| 🚀 **High Occupancy** | Uses 256-thread blocks, coalesced memory access, and shared memory where safe |
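Sigma II compliance and progress preservation come from one batching loop: run a batch of queries, then persist everything computed so far. Below is a condensed, self-contained sketch of that pattern; `run_batch` and `save_partial` are placeholders, and the real loop in `solve_challenge` additionally copies `d_results` back from the GPU before each save.
```rust
/// Sigma II batching pattern: process queries in fixed-size batches and save a
/// partial solution after each one. `run_batch` and `save_partial` stand in for the
/// kernel launch and the challenge-provided save_solution callback.
fn batched_search<E>(
    num_queries: u32,
    batch_size: u32,
    mut run_batch: impl FnMut(u32, u32) -> Result<(), E>, // (batch_start, batch_count)
    mut save_partial: impl FnMut() -> Result<(), E>,
) -> Result<(), E> {
    for batch_start in (0..num_queries).step_by(batch_size as usize) {
        let batch_count = (batch_start + batch_size).min(num_queries) - batch_start;
        run_batch(batch_start, batch_count)?;
        // If fuel runs out after this point, results for completed batches are already saved.
        save_partial()?;
    }
    Ok(())
}
```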
---
## Algorithm Design
### 1. **Centroid Initialization**
- Centroids selected via strided sampling from database vectors using `select_centroids_strided`.
- Number of centroids adaptively scaled:
```text
base           = √N                    // N = database size
dim_factor     = 1.0 + ln(dim / 100)
query_factor   = max(1.0, √(num_queries / 10))
adaptive_count = (base × dim_factor / query_factor).clamp(8, 64)
```
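For example, with N = 100,000 vectors, `dim` = 250, and 1,000 queries: base ≈ 316, dim_factor ≈ 1.92, and query_factor = 10, giving 60 centroids before the `num_centroids` hyperparameter cap is applied. A self-contained version of the formula, matching `calculate_adaptive_clusters` in the solver source below:
```rust
/// Adaptive centroid count, as implemented by calculate_adaptive_clusters in the solver.
fn adaptive_clusters(database_size: u32, vector_dims: u32, num_queries: u32) -> u32 {
    let base = (database_size as f32).sqrt() as u32;
    let dim_factor = 1.0 + (vector_dims as f32 / 100.0).ln();
    let query_factor = (num_queries as f32 / 10.0).sqrt().max(1.0);
    ((base as f32 * dim_factor / query_factor) as u32).clamp(8, 64)
}

fn main() {
    // 100k vectors, 250 dims, 1k queries -> 60 centroids (before the num_centroids cap)
    assert_eq!(adaptive_clusters(100_000, 250, 1_000), 60);
}
```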

View File

@ -0,0 +1,276 @@
use anyhow::Result;
use cudarc::driver::{safe::{LaunchConfig, CudaModule, CudaStream}, CudaSlice, PushKernelArg};
use cudarc::runtime::sys::cudaDeviceProp;
use serde::{Deserialize, Serialize};
use serde_json::{Map, Value};
use std::sync::Arc;
use tig_challenges::vector_search::*;
#[derive(Serialize, Deserialize, Clone)]
pub struct Hyperparameters {
pub num_centroids: u32,
pub search_clusters: u32,
pub multi_probe_boost: u32,
}
impl Default for Hyperparameters {
fn default() -> Self {
Self {
num_centroids: 4,
search_clusters: 2,
multi_probe_boost: 2,
}
}
}
fn calculate_adaptive_clusters(database_size: u32, vector_dims: u32, num_queries: u32) -> u32 {
let base_clusters = (database_size as f32).sqrt() as u32;
let dim_factor = 1.0 + (vector_dims as f32 / 100.0).ln();
let query_factor = (num_queries as f32 / 10.0).sqrt().max(1.0);
let adaptive_count = (base_clusters as f32 * dim_factor / query_factor) as u32;
adaptive_count.clamp(8, 64)
}
fn l2_squared(a: &[f32], b: &[f32]) -> f32 {
a.iter().zip(b.iter()).map(|(x, y)| (x - y).powi(2)).sum()
}
fn build_probe_list_adaptive(
query: &[f32],
centroids: &[f32],
num_centroids: u32,
dim: u32,
total_probes: u32,
) -> Vec<u32> {
let mut distances: Vec<(u32, f32)> = (0..num_centroids)
.map(|i| {
let start = (i * dim) as usize;
let end = start + dim as usize;
let centroid = &centroids[start..end];
let dist = l2_squared(query, centroid);
(i, dist)
})
.collect();
distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
distances.into_iter()
.take(total_probes.min(num_centroids) as usize)
.map(|(i, _)| i)
.collect()
}
pub fn solve_challenge(
challenge: &Challenge,
save_solution: &dyn Fn(&Solution) -> Result<()>,
hyperparameters: &Option<Map<String, Value>>,
module: Arc<CudaModule>,
stream: Arc<CudaStream>,
prop: &cudaDeviceProp,
) -> Result<()> {
let hps: Hyperparameters = hyperparameters.as_ref()
.and_then(|m| serde_json::from_value(serde_json::Value::Object(m.clone())).ok())
.unwrap_or_default();
let dim = challenge.vector_dims as u32;
let num_vectors = challenge.database_size as u32;
let num_queries = challenge.num_queries as u32;
let adaptive_centroids = calculate_adaptive_clusters(num_vectors, dim, num_queries);
let num_centroids = hps.num_centroids.min(adaptive_centroids);
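// Probe budget per query; if it covers fewer than all centroids, adaptive per-query probe lists are built on the host.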
let total_search_clusters = (hps.search_clusters + hps.multi_probe_boost).min(num_centroids);
let use_adaptive_probes = total_search_clusters < num_centroids;
// Select centroids on GPU
let d_centroids = stream.alloc_zeros::<f32>((num_centroids * dim) as usize)?;
let select_func = module.load_function("select_centroids_strided")?;
let total_threads = num_centroids * dim;
unsafe {
stream.launch_builder(&select_func)
.arg(&challenge.d_database_vectors)
.arg(&d_centroids)
.arg(&num_vectors)
.arg(&num_centroids)
.arg(&dim)
.launch(LaunchConfig {
grid_dim: ((total_threads + 255) / 256, 1, 1),
block_dim: (256, 1, 1),
shared_mem_bytes: 0
})?;
}
stream.synchronize()?;
let centroids = if use_adaptive_probes {
stream.memcpy_dtov(&d_centroids)?
} else {
Vec::new()
};
// Assign vectors to nearest centroids
let d_assignments = stream.alloc_zeros::<i32>(num_vectors as usize)?;
let assign_func = module.load_function("assign_to_nearest_centroid")?;
unsafe {
stream.launch_builder(&assign_func)
.arg(&challenge.d_database_vectors)
.arg(&d_centroids)
.arg(&d_assignments)
.arg(&num_vectors)
.arg(&num_centroids)
.arg(&dim)
.launch(LaunchConfig {
grid_dim: ((num_vectors + 255) / 256, 1, 1),
block_dim: (256, 1, 1),
shared_mem_bytes: 0
})?;
}
stream.synchronize()?;
// GPU-based cluster building
let d_cluster_sizes = stream.alloc_zeros::<i32>(num_centroids as usize)?;
let count_func = module.load_function("count_cluster_sizes")?;
unsafe {
stream.launch_builder(&count_func)
.arg(&d_assignments)
.arg(&d_cluster_sizes)
.arg(&num_vectors)
.launch(LaunchConfig {
grid_dim: ((num_vectors + 255) / 256, 1, 1),
block_dim: (256, 1, 1),
shared_mem_bytes: 0
})?;
}
stream.synchronize()?;
let cluster_sizes: Vec<i32> = stream.memcpy_dtov(&d_cluster_sizes)?;
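// Exclusive prefix sum over cluster sizes: cluster_offsets[c] is the start of cluster c in the packed index array.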
let mut cluster_offsets: Vec<i32> = vec![0];
let mut total = 0i32;
for &size in &cluster_sizes {
total += size;
cluster_offsets.push(total);
}
let d_cluster_indices = stream.alloc_zeros::<i32>(num_vectors as usize)?;
let d_cluster_offsets = stream.memcpy_stod(&cluster_offsets)?;
let d_cluster_positions = stream.alloc_zeros::<i32>(num_centroids as usize)?;
let build_func = module.load_function("build_cluster_indices")?;
unsafe {
stream.launch_builder(&build_func)
.arg(&d_assignments)
.arg(&d_cluster_offsets)
.arg(&d_cluster_indices)
.arg(&d_cluster_positions)
.arg(&num_vectors)
.launch(LaunchConfig {
grid_dim: ((num_vectors + 255) / 256, 1, 1),
block_dim: (256, 1, 1),
shared_mem_bytes: 0
})?;
}
stream.synchronize()?;
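// Upload the host copy of the cluster sizes as the device buffer passed to the search kernel (shadows the counting buffer).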
let d_cluster_sizes = stream.memcpy_stod(&cluster_sizes)?;
let h_queries = if use_adaptive_probes {
stream.memcpy_dtov(&challenge.d_query_vectors)?
} else {
Vec::new()
};
let cluster_search = module.load_function("search_coalesced_multiquery")?;
let mut d_results = stream.alloc_zeros::<i32>(num_queries as usize)?;
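// Stage centroids in dynamic shared memory only when they fit comfortably, i.e. under half the per-block shared memory limit.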
let shared_mem_centroids = (num_centroids * dim * 4) as u32;
let max_shared = prop.sharedMemPerBlock as u32;
let use_shared_mem = shared_mem_centroids < (max_shared / 2);
let use_shared_flag = if use_shared_mem { 1i32 } else { 0i32 };
let batch_size = 256;
for query_batch_start in (0..num_queries).step_by(batch_size) {
let query_batch_end = (query_batch_start + batch_size as u32).min(num_queries);
let batch_count = query_batch_end - query_batch_start;
let all_probe_lists = if use_adaptive_probes {
let mut lists = Vec::new();
for q in query_batch_start..query_batch_end {
let query_start = (q * dim) as usize;
let query_end = query_start + dim as usize;
let query = &h_queries[query_start..query_end];
let probe_list = build_probe_list_adaptive(
query,
&centroids,
num_centroids,
dim,
total_search_clusters
);
lists.extend(probe_list);
}
lists
} else {
let fixed_list: Vec<u32> = (0..num_centroids).collect();
fixed_list.repeat(batch_count as usize)
};
let d_probe_lists = stream.memcpy_stod(&all_probe_lists)?;
let search_config = LaunchConfig {
grid_dim: (batch_count, 1, 1),
block_dim: (256, 1, 1),
shared_mem_bytes: if use_shared_mem { shared_mem_centroids } else { 0 },
};
unsafe {
stream.launch_builder(&cluster_search)
.arg(&challenge.d_query_vectors)
.arg(&challenge.d_database_vectors)
.arg(&d_centroids)
.arg(&d_cluster_indices)
.arg(&d_cluster_sizes)
.arg(&d_cluster_offsets)
.arg(&d_probe_lists)
.arg(&mut d_results)
.arg(&query_batch_start)
.arg(&batch_count)
.arg(&num_vectors)
.arg(&dim)
.arg(&num_centroids)
.arg(&total_search_clusters)
.arg(&use_shared_flag)
.launch(search_config)?;
}
stream.synchronize()?;
// SIGMA UPDATE: Save partial solution after each batch
// If we run out of fuel, this ensures we have results for completed queries
let partial_indices: Vec<i32> = stream.memcpy_dtov(&d_results)?;
let partial_indexes = partial_indices.iter().map(|&idx| idx as usize).collect();
save_solution(&Solution { indexes: partial_indexes })?;
}
// Final synchronization and solution save
stream.synchronize()?;
let indices: Vec<i32> = stream.memcpy_dtov(&d_results)?;
let indexes = indices.iter().map(|&idx| idx as usize).collect();
save_solution(&Solution { indexes })?;
Ok(())
}
pub fn help() {
println!("No help information provided.");
}

View File

@ -0,0 +1,280 @@
#include <cuda_runtime.h>
#include <float.h>
#define WARP_SIZE 32
extern "C" {
// Simple vectorized distance (FAST - no cross-warp reduction overhead)
__device__ __forceinline__ float squared_l2_distance_vectorized(
const float* __restrict__ a,
const float* __restrict__ b,
unsigned int dim
) {
float dist = 0.0f;
if (dim >= 4 && (dim % 4 == 0)) {
const float4* a4 = reinterpret_cast<const float4*>(a);
const float4* b4 = reinterpret_cast<const float4*>(b);
unsigned int n4 = dim / 4;
#pragma unroll 4
for (unsigned int i = 0; i < n4; i++) {
float4 va = a4[i];
float4 vb = b4[i];
float dx = va.x - vb.x;
float dy = va.y - vb.y;
float dz = va.z - vb.z;
float dw = va.w - vb.w;
dist += dx*dx + dy*dy + dz*dz + dw*dw;
}
} else {
#pragma unroll 8
for (unsigned int i = 0; i < dim; i++) {
float diff = a[i] - b[i];
dist += diff * diff;
}
}
return dist;
}
// Atomic minimum for float with index tracking. Note: not used by the kernels below, and the
// index write is not atomic with the CAS, so the stored index can briefly lag the stored distance under contention.
__device__ void atomicMinFloat(float* addr, int* idx_addr, float val, int val_idx) {
int* addr_as_int = (int*)addr;
int old = *addr_as_int;
int assumed;
while (true) {
assumed = old;
float current_val = __int_as_float(assumed);
if (val >= current_val) break;
old = atomicCAS(addr_as_int, assumed, __float_as_int(val));
if (assumed == old) {
*idx_addr = val_idx;
break;
}
}
}
// Centroid assignment kernel
__global__ void assign_to_nearest_centroid(
const float* __restrict__ vectors,
const float* __restrict__ centroids,
int* __restrict__ assignments,
unsigned int num_vectors,
unsigned int num_centroids,
unsigned int dim
) {
unsigned int vec_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (vec_idx >= num_vectors) return;
const float* vec = vectors + vec_idx * dim;
float min_dist = FLT_MAX;
int best_centroid = 0;
for (unsigned int c = 0; c < num_centroids; c++) {
const float* cent = centroids + c * dim;
float dist = squared_l2_distance_vectorized(vec, cent, dim);
if (dist < min_dist) {
min_dist = dist;
best_centroid = c;
}
}
assignments[vec_idx] = best_centroid;
}
// FIXED: Multi-query search kernel with proper coalesced distance and synchronization
__global__ void search_coalesced_multiquery(
const float* __restrict__ queries,
const float* __restrict__ database,
const float* __restrict__ centroids,
const int* __restrict__ cluster_indices,
const int* __restrict__ cluster_sizes,
const int* __restrict__ cluster_offsets,
const int* __restrict__ probe_lists, // [batch_size * num_probes]
int* __restrict__ results,
int query_offset, // Starting query index for this batch
int batch_size, // Number of queries in this batch
int total_vectors,
int dim,
int num_centroids,
int num_probes,
int use_shared_mem
) {
extern __shared__ float shared_centroids[];
int qid_local = blockIdx.x; // Query within batch
if (qid_local >= batch_size) return;
int qid_global = query_offset + qid_local;
const float* query = queries + qid_global * dim;
// Load centroids into shared memory if enabled (note: the staged copy is not read again in this
// kernel; the search loop below reads database vectors directly from global memory)
if (use_shared_mem) {
int total_elements = num_centroids * dim;
for (int i = threadIdx.x; i < total_elements; i += blockDim.x) {
shared_centroids[i] = centroids[i];
}
__syncthreads();
}
// Get probe list for this query
const int* probe_list = probe_lists + qid_local * num_probes;
// Shared memory for block-level reduction
__shared__ float shared_best_dist;
__shared__ int shared_best_idx;
if (threadIdx.x == 0) {
shared_best_dist = FLT_MAX;
shared_best_idx = -1;
}
__syncthreads();
float local_best = FLT_MAX;
int local_idx = -1;
// Search all probed clusters
for (int p = 0; p < num_probes; p++) {
int cluster_id = probe_list[p];
if (cluster_id < 0 || cluster_id >= num_centroids) continue;
int offset = cluster_offsets[cluster_id];
int size = cluster_sizes[cluster_id];
// Each thread searches its own vectors
for (int i = threadIdx.x; i < size; i += blockDim.x) {
int vec_idx = cluster_indices[offset + i];
if (vec_idx >= total_vectors) continue;
const float* vec = database + vec_idx * dim;
// Simple vectorized distance (fast!)
float dist = squared_l2_distance_vectorized(query, vec, dim);
if (dist < local_best) {
local_best = dist;
local_idx = vec_idx;
}
}
}
// Warp-level reduction
int warp_id = threadIdx.x / WARP_SIZE;
int lane = threadIdx.x % WARP_SIZE;
float reduced_dist = local_best;
int reduced_idx = local_idx;
for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
float other_dist = __shfl_down_sync(0xFFFFFFFF, reduced_dist, offset);
int other_idx = __shfl_down_sync(0xFFFFFFFF, reduced_idx, offset);
if (other_dist < reduced_dist) {
reduced_dist = other_dist;
reduced_idx = other_idx;
}
}
// Store warp results
__shared__ float warp_mins[8]; // Max 256/32 = 8 warps
__shared__ int warp_ids[8];
if (lane == 0) {
warp_mins[warp_id] = reduced_dist;
warp_ids[warp_id] = reduced_idx;
}
__syncthreads();
// Block-level reduction (thread 0 only)
if (threadIdx.x == 0) {
float block_best = FLT_MAX;
int block_idx = -1;
int num_warps = (blockDim.x + WARP_SIZE - 1) / WARP_SIZE;
for (int i = 0; i < num_warps; i++) {
if (warp_mins[i] < block_best) {
block_best = warp_mins[i];
block_idx = warp_ids[i];
}
}
if (block_idx != -1 && block_best < shared_best_dist) {
shared_best_dist = block_best;
shared_best_idx = block_idx;
}
}
__syncthreads();
// Write final result
if (threadIdx.x == 0) {
results[qid_global] = shared_best_idx;
}
}
// GPU kernel to copy selected vectors as centroids
__global__ void select_centroids_strided(
const float* __restrict__ vectors,
float* __restrict__ centroids,
unsigned int num_vectors,
unsigned int num_centroids,
unsigned int dim
) {
unsigned int c_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (c_idx >= num_centroids * dim) return;
unsigned int centroid_id = c_idx / dim;
unsigned int feature_id = c_idx % dim;
// Strided sampling: every N-th vector
unsigned int step = num_vectors / num_centroids;
if (step == 0) step = 1;
unsigned int vec_idx = (centroid_id * step) % num_vectors;
centroids[c_idx] = vectors[vec_idx * dim + feature_id];
}
// GPU kernel to count vectors per cluster (pass 1)
__global__ void count_cluster_sizes(
const int* __restrict__ assignments,
int* __restrict__ cluster_sizes,
unsigned int num_vectors
) {
unsigned int vec_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (vec_idx >= num_vectors) return;
int cluster_id = assignments[vec_idx];
atomicAdd(&cluster_sizes[cluster_id], 1);
}
// GPU kernel to build cluster indices (pass 2)
__global__ void build_cluster_indices(
const int* __restrict__ assignments,
const int* __restrict__ cluster_offsets,
int* __restrict__ cluster_indices,
int* __restrict__ cluster_positions, // Temp array for atomic counters
unsigned int num_vectors
) {
unsigned int vec_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (vec_idx >= num_vectors) return;
int cluster_id = assignments[vec_idx];
int offset = cluster_offsets[cluster_id];
int pos = atomicAdd(&cluster_positions[cluster_id], 1);
cluster_indices[offset + pos] = vec_idx;
}
} // extern "C"

View File

@ -152,7 +152,8 @@
// c004_a077
// c004_a078
pub mod autovector_native_v2;
pub use autovector_native_v2 as c004_a078;
// c004_a079