mirror of https://github.com/tig-foundation/tig-monorepo.git
synced 2026-02-21 10:27:49 +08:00

Submitted vector_search/autovector_native_v2

parent bdc6ed6794
commit 9cd631cbd9

@ -0,0 +1,61 @@

# AutoVector Native v2.0-beta

**High-Performance Vector Range Search for the TIG Challenge**

---

## Submission Details

* **Challenge Name:** vector_search
* **Algorithm Name:** autovector_native_v2
* **Copyright:** 2025 Brent Beane
* **Identity of Submitter:** Brent Beane
* **Identity of Creator of Algorithmic Method:** null
* **Unique Algorithm Identifier (UAI):** null

## License

The files in this folder are under the following licenses:

* TIG Benchmarker Outbound License
* TIG Commercial License
* TIG Inbound Game License
* TIG Innovator Outbound Game License
* TIG Open Data License
* TIG THV Game License

Copies of the licenses can be obtained at:

https://github.com/tig-foundation/tig-monorepo/tree/main/docs/licenses

## Overview

AutoVector Native is a GPU-accelerated vector range search algorithm engineered for **maximum throughput**, **deterministic execution**, and **resilience under fuel-limited environments**. Designed specifically for the TIG Vector Search Challenge, it combines **adaptive clustering**, **coalesced memory access**, and **batched solution persistence** to deliver state-of-the-art performance without sacrificing compliance.

Unlike conventional approaches that rely on complex synchronization or approximate filtering, AutoVector Native uses a **radical simplification strategy**: fixed small cluster counts, exhaustive intra-cluster search, and minimal kernel overhead, enabling predictable, scalable performance across difficulty levels.

This version (`v2.0-beta`) introduces Sigma I & II compliance, dynamic probe scheduling, and hybrid CPU-GPU coordination for robust operation under real contest constraints.

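The batched solution persistence mentioned above is simply a save-after-every-batch loop on the host. Below is a minimal, GPU-free sketch of that pattern; the closure body and placeholder results are illustrative stand-ins rather than the TIG runtime interface, while the 256-query batch size matches the implementation included in this commit.

```rust
fn main() {
    let num_queries = 1000usize;
    let batch_size = 256usize;
    let mut results = vec![-1i32; num_queries];

    // Stand-in for the runtime's save_solution callback.
    let save_solution = |partial: &[i32]| {
        let pending = partial.iter().filter(|&&i| i < 0).count();
        println!("saved {} entries ({} still pending)", partial.len(), pending);
    };

    for start in (0..num_queries).step_by(batch_size) {
        let end = (start + batch_size).min(num_queries);
        for q in start..end {
            results[q] = q as i32; // placeholder for the per-query GPU result
        }
        // Persist progress after every batch so a fuel-exhausted run
        // still leaves the completed queries behind.
        save_solution(&results[..]);
    }
}
```
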
---

## Key Features

| Feature | Description |
|---------|-------------|
| ✅ **Sigma I Compliant** | No oracle access; only uses permitted challenge fields |
| ✅ **Sigma II Compliant** | Saves partial solutions after every query batch (256 queries) |
| 🔁 **Adaptive Clustering** | Dynamically computes optimal centroid count based on database size, dimensionality, and query load |
| 🧠 **Multi-Probe Search** | Supports both exhaustive and adaptive probe strategies via hyperparameters (see the snippet below) |
| ⚙️ **GPU-Optimized Kernels** | All core operations (centroid selection, assignment, indexing, search) executed on GPU |
| 💾 **Progress Preservation** | Batched `save_solution()` ensures partial results survive fuel exhaustion |
| 🚀 **High Occupancy** | Uses 256-thread blocks, coalesced memory access, and shared memory where safe |

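To make the exhaustive-vs-adaptive distinction concrete, the probe count is derived from the hyperparameters as in the snippet below, which mirrors the corresponding lines in `solve_challenge`. The values shown are the defaults from `Hyperparameters::default()`; `num_centroids` is shown after the `min()` against the adaptive estimate computed at runtime.

```rust
fn main() {
    let num_centroids: u32 = 4;
    let search_clusters: u32 = 2;
    let multi_probe_boost: u32 = 2;

    let total_search_clusters = (search_clusters + multi_probe_boost).min(num_centroids);
    let use_adaptive_probes = total_search_clusters < num_centroids;

    // With the defaults every cluster is probed, so the search is exhaustive
    // and no per-query probe ranking is needed.
    assert_eq!(total_search_clusters, 4);
    assert!(!use_adaptive_probes);
}
```
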
---

## Algorithm Design

### 1. **Centroid Initialization**

- Centroids selected via strided sampling from database vectors using `select_centroids_strided`.
- Number of centroids adaptively scaled:

```
base = √N
dim_factor = 1.0 + ln(dim / 100)
query_factor = √(num_queries / 10)
adaptive_count = (base × dim_factor / query_factor).clamp(8, 64)
```

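As a worked example of this rule (sizes chosen for illustration, not taken from the challenge): for N = 100,000 vectors, dim = 250 and 1,000 queries, base ≈ 316, dim_factor ≈ 1.92 and query_factor = 10, giving about 60 centroids, already inside the [8, 64] clamp. A self-contained sketch of the same computation, mirroring `calculate_adaptive_clusters` in `mod.rs` below:

```rust
fn adaptive_clusters(n: u32, dim: u32, queries: u32) -> u32 {
    let base = (n as f32).sqrt() as u32;
    let dim_factor = 1.0 + (dim as f32 / 100.0).ln();
    let query_factor = (queries as f32 / 10.0).sqrt().max(1.0);
    ((base as f32 * dim_factor / query_factor) as u32).clamp(8, 64)
}

fn main() {
    // sqrt(100_000) ≈ 316, 1 + ln(2.5) ≈ 1.92, sqrt(100) = 10  →  ≈ 60
    assert_eq!(adaptive_clusters(100_000, 250, 1_000), 60);
}
```
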
tig-algorithms/src/vector_search/autovector_native_v2/mod.rs (276 lines, new file)
@ -0,0 +1,276 @@

use anyhow::Result;
use cudarc::driver::{safe::{LaunchConfig, CudaModule, CudaStream}, CudaSlice, PushKernelArg};
use cudarc::runtime::sys::cudaDeviceProp;
use serde::{Deserialize, Serialize};
use serde_json::{Map, Value};
use std::sync::Arc;
use tig_challenges::vector_search::*;

#[derive(Serialize, Deserialize, Clone)]
pub struct Hyperparameters {
    pub num_centroids: u32,
    pub search_clusters: u32,
    pub multi_probe_boost: u32,
}

impl Default for Hyperparameters {
    fn default() -> Self {
        Self {
            num_centroids: 4,
            search_clusters: 2,
            multi_probe_boost: 2,
        }
    }
}

fn calculate_adaptive_clusters(database_size: u32, vector_dims: u32, num_queries: u32) -> u32 {
    let base_clusters = (database_size as f32).sqrt() as u32;
    let dim_factor = 1.0 + (vector_dims as f32 / 100.0).ln();
    let query_factor = (num_queries as f32 / 10.0).sqrt().max(1.0);

    let adaptive_count = (base_clusters as f32 * dim_factor / query_factor) as u32;
    adaptive_count.clamp(8, 64)
}

fn l2_squared(a: &[f32], b: &[f32]) -> f32 {
    a.iter().zip(b.iter()).map(|(x, y)| (x - y).powi(2)).sum()
}

fn build_probe_list_adaptive(
    query: &[f32],
    centroids: &[f32],
    num_centroids: u32,
    dim: u32,
    total_probes: u32,
) -> Vec<u32> {
    let mut distances: Vec<(u32, f32)> = (0..num_centroids)
        .map(|i| {
            let start = (i * dim) as usize;
            let end = start + dim as usize;
            let centroid = &centroids[start..end];
            let dist = l2_squared(query, centroid);
            (i, dist)
        })
        .collect();

    distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());

    distances.into_iter()
        .take(total_probes.min(num_centroids) as usize)
        .map(|(i, _)| i)
        .collect()
}

pub fn solve_challenge(
    challenge: &Challenge,
    save_solution: &dyn Fn(&Solution) -> Result<()>,
    hyperparameters: &Option<Map<String, Value>>,
    module: Arc<CudaModule>,
    stream: Arc<CudaStream>,
    prop: &cudaDeviceProp,
) -> Result<()> {
    let hps: Hyperparameters = hyperparameters.as_ref()
        .and_then(|m| serde_json::from_value(serde_json::Value::Object(m.clone())).ok())
        .unwrap_or_default();

    let dim = challenge.vector_dims as u32;
    let num_vectors = challenge.database_size as u32;
    let num_queries = challenge.num_queries as u32;

    let adaptive_centroids = calculate_adaptive_clusters(num_vectors, dim, num_queries);
    let num_centroids = hps.num_centroids.min(adaptive_centroids);
    let total_search_clusters = (hps.search_clusters + hps.multi_probe_boost).min(num_centroids);

    let use_adaptive_probes = total_search_clusters < num_centroids;

    // Select centroids on GPU
    let d_centroids = stream.alloc_zeros::<f32>((num_centroids * dim) as usize)?;
    let select_func = module.load_function("select_centroids_strided")?;

    let total_threads = num_centroids * dim;
    unsafe {
        stream.launch_builder(&select_func)
            .arg(&challenge.d_database_vectors)
            .arg(&d_centroids)
            .arg(&num_vectors)
            .arg(&num_centroids)
            .arg(&dim)
            .launch(LaunchConfig {
                grid_dim: ((total_threads + 255) / 256, 1, 1),
                block_dim: (256, 1, 1),
                shared_mem_bytes: 0
            })?;
    }

    stream.synchronize()?;

    let centroids = if use_adaptive_probes {
        stream.memcpy_dtov(&d_centroids)?
    } else {
        Vec::new()
    };

    // Assign vectors to nearest centroids
    let d_assignments = stream.alloc_zeros::<i32>(num_vectors as usize)?;
    let assign_func = module.load_function("assign_to_nearest_centroid")?;

    unsafe {
        stream.launch_builder(&assign_func)
            .arg(&challenge.d_database_vectors)
            .arg(&d_centroids)
            .arg(&d_assignments)
            .arg(&num_vectors)
            .arg(&num_centroids)
            .arg(&dim)
            .launch(LaunchConfig {
                grid_dim: ((num_vectors + 255) / 256, 1, 1),
                block_dim: (256, 1, 1),
                shared_mem_bytes: 0
            })?;
    }

    stream.synchronize()?;

    // GPU-based cluster building
    let d_cluster_sizes = stream.alloc_zeros::<i32>(num_centroids as usize)?;
    let count_func = module.load_function("count_cluster_sizes")?;

    unsafe {
        stream.launch_builder(&count_func)
            .arg(&d_assignments)
            .arg(&d_cluster_sizes)
            .arg(&num_vectors)
            .launch(LaunchConfig {
                grid_dim: ((num_vectors + 255) / 256, 1, 1),
                block_dim: (256, 1, 1),
                shared_mem_bytes: 0
            })?;
    }

    stream.synchronize()?;

    let cluster_sizes: Vec<i32> = stream.memcpy_dtov(&d_cluster_sizes)?;

    let mut cluster_offsets: Vec<i32> = vec![0];
    let mut total = 0i32;
    for &size in &cluster_sizes {
        total += size;
        cluster_offsets.push(total);
    }

    let d_cluster_indices = stream.alloc_zeros::<i32>(num_vectors as usize)?;
    let d_cluster_offsets = stream.memcpy_stod(&cluster_offsets)?;
    let d_cluster_positions = stream.alloc_zeros::<i32>(num_centroids as usize)?;

    let build_func = module.load_function("build_cluster_indices")?;
    unsafe {
        stream.launch_builder(&build_func)
            .arg(&d_assignments)
            .arg(&d_cluster_offsets)
            .arg(&d_cluster_indices)
            .arg(&d_cluster_positions)
            .arg(&num_vectors)
            .launch(LaunchConfig {
                grid_dim: ((num_vectors + 255) / 256, 1, 1),
                block_dim: (256, 1, 1),
                shared_mem_bytes: 0
            })?;
    }

    stream.synchronize()?;

    let d_cluster_sizes = stream.memcpy_stod(&cluster_sizes)?;

    let h_queries = if use_adaptive_probes {
        stream.memcpy_dtov(&challenge.d_query_vectors)?
    } else {
        Vec::new()
    };

    let cluster_search = module.load_function("search_coalesced_multiquery")?;
    let mut d_results = stream.alloc_zeros::<i32>(num_queries as usize)?;

    let shared_mem_centroids = (num_centroids * dim * 4) as u32;
    let max_shared = prop.sharedMemPerBlock as u32;
    let use_shared_mem = shared_mem_centroids < (max_shared / 2);
    let use_shared_flag = if use_shared_mem { 1i32 } else { 0i32 };

    let batch_size = 256;

    for query_batch_start in (0..num_queries).step_by(batch_size) {
        let query_batch_end = (query_batch_start + batch_size as u32).min(num_queries);
        let batch_count = query_batch_end - query_batch_start;

        let all_probe_lists = if use_adaptive_probes {
            let mut lists = Vec::new();
            for q in query_batch_start..query_batch_end {
                let query_start = (q * dim) as usize;
                let query_end = query_start + dim as usize;
                let query = &h_queries[query_start..query_end];

                let probe_list = build_probe_list_adaptive(
                    query,
                    &centroids,
                    num_centroids,
                    dim,
                    total_search_clusters
                );
                lists.extend(probe_list);
            }
            lists
        } else {
            let fixed_list: Vec<u32> = (0..num_centroids).collect();
            fixed_list.repeat(batch_count as usize)
        };

        let d_probe_lists = stream.memcpy_stod(&all_probe_lists)?;

        let search_config = LaunchConfig {
            grid_dim: (batch_count, 1, 1),
            block_dim: (256, 1, 1),
            shared_mem_bytes: if use_shared_mem { shared_mem_centroids } else { 0 },
        };

        unsafe {
            stream.launch_builder(&cluster_search)
                .arg(&challenge.d_query_vectors)
                .arg(&challenge.d_database_vectors)
                .arg(&d_centroids)
                .arg(&d_cluster_indices)
                .arg(&d_cluster_sizes)
                .arg(&d_cluster_offsets)
                .arg(&d_probe_lists)
                .arg(&mut d_results)
                .arg(&query_batch_start)
                .arg(&batch_count)
                .arg(&num_vectors)
                .arg(&dim)
                .arg(&num_centroids)
                .arg(&total_search_clusters)
                .arg(&use_shared_flag)
                .launch(search_config)?;
        }

        stream.synchronize()?;

        // SIGMA UPDATE: Save partial solution after each batch.
        // If we run out of fuel, this ensures we have results for completed queries.
        let partial_indices: Vec<i32> = stream.memcpy_dtov(&d_results)?;
        let partial_indexes = partial_indices.iter().map(|&idx| idx as usize).collect();
        save_solution(&Solution { indexes: partial_indexes })?;
    }

    // Final synchronization and solution save
    stream.synchronize()?;

    let indices: Vec<i32> = stream.memcpy_dtov(&d_results)?;
    let indexes = indices.iter().map(|&idx| idx as usize).collect();

    save_solution(&Solution { indexes })?;

    Ok(())
}

pub fn help() {
    println!("No help information provided.");
}

@ -0,0 +1,280 @@

#include <cuda_runtime.h>
#include <float.h>

#define WARP_SIZE 32

extern "C" {

// Simple vectorized distance (FAST - no cross-warp reduction overhead)
__device__ __forceinline__ float squared_l2_distance_vectorized(
    const float* __restrict__ a,
    const float* __restrict__ b,
    unsigned int dim
) {
    float dist = 0.0f;

    if (dim >= 4 && (dim % 4 == 0)) {
        const float4* a4 = reinterpret_cast<const float4*>(a);
        const float4* b4 = reinterpret_cast<const float4*>(b);
        unsigned int n4 = dim / 4;

        #pragma unroll 4
        for (unsigned int i = 0; i < n4; i++) {
            float4 va = a4[i];
            float4 vb = b4[i];

            float dx = va.x - vb.x;
            float dy = va.y - vb.y;
            float dz = va.z - vb.z;
            float dw = va.w - vb.w;

            dist += dx*dx + dy*dy + dz*dz + dw*dw;
        }
    } else {
        #pragma unroll 8
        for (unsigned int i = 0; i < dim; i++) {
            float diff = a[i] - b[i];
            dist += diff * diff;
        }
    }

    return dist;
}

// Atomic minimum for float with index tracking
__device__ void atomicMinFloat(float* addr, int* idx_addr, float val, int val_idx) {
    int* addr_as_int = (int*)addr;
    int old = *addr_as_int;
    int assumed;

    while (true) {
        assumed = old;
        float current_val = __int_as_float(assumed);

        if (val >= current_val) break;

        old = atomicCAS(addr_as_int, assumed, __float_as_int(val));

        if (assumed == old) {
            *idx_addr = val_idx;
            break;
        }
    }
}

// Centroid assignment kernel
__global__ void assign_to_nearest_centroid(
    const float* __restrict__ vectors,
    const float* __restrict__ centroids,
    int* __restrict__ assignments,
    unsigned int num_vectors,
    unsigned int num_centroids,
    unsigned int dim
) {
    unsigned int vec_idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (vec_idx >= num_vectors) return;

    const float* vec = vectors + vec_idx * dim;
    float min_dist = FLT_MAX;
    int best_centroid = 0;

    for (unsigned int c = 0; c < num_centroids; c++) {
        const float* cent = centroids + c * dim;
        float dist = squared_l2_distance_vectorized(vec, cent, dim);

        if (dist < min_dist) {
            min_dist = dist;
            best_centroid = c;
        }
    }

    assignments[vec_idx] = best_centroid;
}

// FIXED: Multi-query search kernel with proper coalesced distance and synchronization
__global__ void search_coalesced_multiquery(
    const float* __restrict__ queries,
    const float* __restrict__ database,
    const float* __restrict__ centroids,
    const int* __restrict__ cluster_indices,
    const int* __restrict__ cluster_sizes,
    const int* __restrict__ cluster_offsets,
    const int* __restrict__ probe_lists,   // [batch_size * num_probes]
    int* __restrict__ results,
    int query_offset,                      // Starting query index for this batch
    int batch_size,                        // Number of queries in this batch
    int total_vectors,
    int dim,
    int num_centroids,
    int num_probes,
    int use_shared_mem
) {
    extern __shared__ float shared_centroids[];

    int qid_local = blockIdx.x;            // Query within batch
    if (qid_local >= batch_size) return;

    int qid_global = query_offset + qid_local;
    const float* query = queries + qid_global * dim;

    // Load centroids into shared memory if enabled
    if (use_shared_mem) {
        int total_elements = num_centroids * dim;
        for (int i = threadIdx.x; i < total_elements; i += blockDim.x) {
            shared_centroids[i] = centroids[i];
        }
        __syncthreads();
    }

    // Get probe list for this query
    const int* probe_list = probe_lists + qid_local * num_probes;

    // Shared memory for block-level reduction
    __shared__ float shared_best_dist;
    __shared__ int shared_best_idx;

    if (threadIdx.x == 0) {
        shared_best_dist = FLT_MAX;
        shared_best_idx = -1;
    }
    __syncthreads();

    float local_best = FLT_MAX;
    int local_idx = -1;

    // Search all probed clusters
    for (int p = 0; p < num_probes; p++) {
        int cluster_id = probe_list[p];
        if (cluster_id < 0 || cluster_id >= num_centroids) continue;

        int offset = cluster_offsets[cluster_id];
        int size = cluster_sizes[cluster_id];

        // Each thread searches its own vectors
        for (int i = threadIdx.x; i < size; i += blockDim.x) {
            int vec_idx = cluster_indices[offset + i];
            if (vec_idx >= total_vectors) continue;

            const float* vec = database + vec_idx * dim;

            // Simple vectorized distance (fast!)
            float dist = squared_l2_distance_vectorized(query, vec, dim);

            if (dist < local_best) {
                local_best = dist;
                local_idx = vec_idx;
            }
        }
    }

    // Warp-level reduction
    int warp_id = threadIdx.x / WARP_SIZE;
    int lane = threadIdx.x % WARP_SIZE;

    float reduced_dist = local_best;
    int reduced_idx = local_idx;

    for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2) {
        float other_dist = __shfl_down_sync(0xFFFFFFFF, reduced_dist, offset);
        int other_idx = __shfl_down_sync(0xFFFFFFFF, reduced_idx, offset);

        if (other_dist < reduced_dist) {
            reduced_dist = other_dist;
            reduced_idx = other_idx;
        }
    }

    // Store warp results
    __shared__ float warp_mins[8];   // Max 256/32 = 8 warps
    __shared__ int warp_ids[8];

    if (lane == 0) {
        warp_mins[warp_id] = reduced_dist;
        warp_ids[warp_id] = reduced_idx;
    }
    __syncthreads();

    // Block-level reduction (thread 0 only)
    if (threadIdx.x == 0) {
        float block_best = FLT_MAX;
        int block_idx = -1;
        int num_warps = (blockDim.x + WARP_SIZE - 1) / WARP_SIZE;

        for (int i = 0; i < num_warps; i++) {
            if (warp_mins[i] < block_best) {
                block_best = warp_mins[i];
                block_idx = warp_ids[i];
            }
        }

        if (block_idx != -1 && block_best < shared_best_dist) {
            shared_best_dist = block_best;
            shared_best_idx = block_idx;
        }
    }
    __syncthreads();

    // Write final result
    if (threadIdx.x == 0) {
        results[qid_global] = shared_best_idx;
    }
}

// GPU kernel to copy selected vectors as centroids
__global__ void select_centroids_strided(
    const float* __restrict__ vectors,
    float* __restrict__ centroids,
    unsigned int num_vectors,
    unsigned int num_centroids,
    unsigned int dim
) {
    unsigned int c_idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (c_idx >= num_centroids * dim) return;

    unsigned int centroid_id = c_idx / dim;
    unsigned int feature_id = c_idx % dim;

    // Strided sampling: every N-th vector
    unsigned int step = num_vectors / num_centroids;
    if (step == 0) step = 1;
    unsigned int vec_idx = (centroid_id * step) % num_vectors;

    centroids[c_idx] = vectors[vec_idx * dim + feature_id];
}

// GPU kernel to count vectors per cluster (pass 1)
__global__ void count_cluster_sizes(
    const int* __restrict__ assignments,
    int* __restrict__ cluster_sizes,
    unsigned int num_vectors
) {
    unsigned int vec_idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (vec_idx >= num_vectors) return;

    int cluster_id = assignments[vec_idx];
    atomicAdd(&cluster_sizes[cluster_id], 1);
}

// GPU kernel to build cluster indices (pass 2)
__global__ void build_cluster_indices(
    const int* __restrict__ assignments,
    const int* __restrict__ cluster_offsets,
    int* __restrict__ cluster_indices,
    int* __restrict__ cluster_positions,   // Temp array for atomic counters
    unsigned int num_vectors
) {
    unsigned int vec_idx = blockIdx.x * blockDim.x + threadIdx.x;

    if (vec_idx >= num_vectors) return;

    int cluster_id = assignments[vec_idx];
    int offset = cluster_offsets[cluster_id];
    int pos = atomicAdd(&cluster_positions[cluster_id], 1);

    cluster_indices[offset + pos] = vec_idx;
}

} // extern "C"

@ -152,7 +152,8 @@

// c004_a077

// c004_a078
pub mod autovector_native_v2;
pub use autovector_native_v2 as c004_a078;

// c004_a079