Submitted vector_search/fast_search_cuda

2026-03-05 00:08:13 +08:00 · 2025-12-02 16:54:11 +00:00 · 2025-12-02 16:54:11 +00:00 · 0c8aaf1cd2
commit 0c8aaf1cd2
parent bdc6ed6794
4 changed files with 267 additions and 1 deletions
--- a/tig-algorithms/src/vector_search/fast_search_cuda/README.md
+++ b/tig-algorithms/src/vector_search/fast_search_cuda/README.md
@ -0,0 +1,23 @@
+# TIG Code Submission
+
+## Submission Details
+
+* **Challenge Name:** vector_search
+* **Algorithm Name:** fast_search_cuda
+* **Copyright:** 2024 Cortex & Haz
+* **Identity of Submitter:** Cortex & Haz
+* **Identity of Creator of Algorithmic Method:** null
+* **Unique Algorithm Identifier (UAI):** null
+
+## License
+
+The files in this folder are under the following licenses:
+* TIG Benchmarker Outbound License
+* TIG Commercial License
+* TIG Inbound Game License
+* TIG Innovator Outbound Game License
+* TIG Open Data License
+* TIG THV Game License
+
+Copies of the licenses can be obtained at:
+https://github.com/tig-foundation/tig-monorepo/tree/main/docs/licenses
--- a/tig-algorithms/src/vector_search/fast_search_cuda/kernels.cu
+++ b/tig-algorithms/src/vector_search/fast_search_cuda/kernels.cu
@ -0,0 +1,19 @@
+/*!
+Copyright 2024 Cortex & Haz
+
+Licensed under the TIG Inbound Game License v1.0 or (at your option) any later
+version (the "License"); you may not use this file except in compliance with the
+License. You may obtain a copy of the License at
+
+https://github.com/tig-foundation/tig-monorepo/tree/main/docs/licenses
+
+Unless required by applicable law or agreed to in writing, software distributed
+under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+CONDITIONS OF ANY KIND, either express or implied. See the License for the specific
+language governing permissions and limitations under the License.
+ */
+
+extern "C" __global__ void do_nothing()
+{
+    // This kernel does nothing
+}             
--- a/tig-algorithms/src/vector_search/fast_search_cuda/mod.rs
+++ b/tig-algorithms/src/vector_search/fast_search_cuda/mod.rs
@ -0,0 +1,223 @@
+use anyhow::{anyhow, Result};
+use cudarc::{
+    driver::{safe::LaunchConfig, CudaModule, CudaStream, PushKernelArg},
+    runtime::sys::cudaDeviceProp,
+};
+use serde_json::{Map, Value};
+use std::sync::Arc;
+use tig_challenges::vector_search::{Challenge, Solution};
+
+pub fn solve_challenge(
+    challenge: &Challenge,
+    save_solution: &dyn Fn(&Solution) -> anyhow::Result<()>,
+    hyperparameters: &Option<Map<String, Value>>,
+    module: Arc<CudaModule>,
+    stream: Arc<CudaStream>,
+    prop: &cudaDeviceProp,
+) -> anyhow::Result<()> {
+    Err(anyhow!("This algorithm is no longer compatible."))
+}
+
+// Old code that is no longer compatible
+#[cfg(none)]
+mod dead_code {
+    use anyhow::Result;
+    use tig_challenges::vector_search::{Challenge, Solution};
+
+    fn squared_distance(v1: &[f32], v2: &[f32]) -> f32 {
+        v1.iter()
+            .zip(v2.iter())
+            .map(|(a, b)| (a - b) * (a - b))
+            .sum()
+    }
+
+    pub fn solve_challenge(challenge: &Challenge) -> Result<Option<Solution>> {
+        let max_distance_sq = challenge.max_distance * challenge.max_distance;
+
+        let indexes: Vec<usize> = challenge
+            .query_vectors
+            .iter()
+            .filter_map(|query| {
+                challenge
+                    .vector_database
+                    .iter()
+                    .enumerate()
+                    .find_map(|(i, vector)| {
+                        if squared_distance(query, vector) <= max_distance_sq {
+                            Some(i)
+                        } else {
+                            None
+                        }
+                    })
+            })
+            .collect();
+
+        if indexes.len() == challenge.query_vectors.len() {
+            Ok(Some(Solution { indexes }))
+        } else {
+            Ok(None)
+        }
+    }
+
+    #[cfg(feature = "cuda")]
+    mod gpu_optimisation {
+        use super::*;
+        use cudarc::driver::*;
+        use std::{collections::HashMap, ptr, sync::Arc, time::Instant};
+        use tig_challenges::CudaKernel;
+
+        const BLOCK_SIZE: u32 = 256;
+
+        pub const KERNEL: Option<CudaKernel> = Some(CudaKernel {
+            src: r#"
+   #define INT_MAX     (2147483647U)
+   #define FLT_MAX     (3.402823466e+38F)
+
+   __device__ float squared_distance(const float * v1, const float * v2, int dim) {
+       float sum = 0.0f;
+       for (int i = 0; i < dim; i++) {
+           float diff = v1[i] - v2[i];
+           sum += diff * diff;
+       }
+       return sum;
+   }
+
+   extern "C" __global__ void fast_search(
+       const float * query_vectors,
+       const float * vector_database,
+       int query_count,
+       int vector_count,
+       int vector_dim,
+       float max_distance_sq,
+       int shared_mem_size,
+       int * result_indexes
+   ) {
+       extern __shared__ int sh_mem[];
+       int queryIdx = blockIdx.x;
+       int thIdx = threadIdx.x;
+       int * sh_indexes = sh_mem;
+       assert(shared_mem_size >= sizeof(sh_indexes[0]) * blockDim.x);
+
+       if (queryIdx >= query_count) {
+           return;
+       } 
+
+       const float * query = &query_vectors[queryIdx * vector_dim];
+       float minDistance = FLT_MAX;
+       int minIndex = -1;
+       for (int i = 0; i < vector_count; i += blockDim.x) {
+           int vectorIdx = i + thIdx;
+           const float * vector = &vector_database[vectorIdx * vector_dim];
+           float distance = squared_distance(query, vector, vector_dim);
+           if ((distance < minDistance) && (distance <= max_distance_sq)) {
+               minDistance = distance;
+               minIndex = vectorIdx;
+               break;
+           }
+       }
+
+       sh_indexes[thIdx] = minIndex;
+       __syncthreads();
+
+       if (thIdx == 0) {
+           minIndex = INT_MAX;
+           for (int i = 0; i < blockDim.x; i++) {
+               if ((sh_indexes[i] > -1) && (sh_indexes[i] < minIndex)) {
+                   minIndex = sh_indexes[i];
+               }
+           }
+           result_indexes[queryIdx] = minIndex;
+       }
+   }
+           "#,
+            funcs: &["fast_search"],
+        });
+
+        fn flatten_vector_database(database: &[Vec<f32>]) -> Vec<f32> {
+            let total_elements = database.len() * database[0].len();
+            let mut database_flat: Vec<f32> = Vec::with_capacity(total_elements);
+            let v_len = database[0].len();
+            let mut offset = 0;
+            unsafe {
+                database_flat.set_len(total_elements);
+                for v in database.iter() {
+                    let src = v.as_ptr();
+                    let dest = database_flat.as_mut_ptr().add(offset);
+                    ptr::copy_nonoverlapping(src, dest, v_len);
+                    offset += v_len;
+                }
+            }
+
+            database_flat
+        }
+
+        pub fn cuda_solve_challenge(
+            challenge: &Challenge,
+            dev: &Arc<CudaDevice>,
+            mut funcs: HashMap<&'static str, CudaFunction>,
+        ) -> anyhow::Result<Option<Solution>> {
+            let vector_count = challenge.vector_database.len();
+            let query_count = challenge.query_vectors.len();
+            let vector_dim = challenge.query_vectors[0].len();
+            let max_distance_sq = challenge.max_distance * challenge.max_distance;
+
+            /* Prepare databases */
+            let vector_database_flat = flatten_vector_database(&challenge.vector_database);
+            let query_vectors_flat = flatten_vector_database(&challenge.query_vectors);
+
+            /* Copy data to GPU */
+            let d_vector_database = dev.htod_copy(vector_database_flat)?;
+            let d_query_vectors = dev.htod_copy(query_vectors_flat)?;
+            let d_result_indexes = dev.alloc_zeros::<i32>(query_count)?;
+            dev.synchronize()?;
+
+            /* Fire Kernel */
+            unsafe {
+                const SHARED_MEM_SIZE: u32 = BLOCK_SIZE * std::mem::size_of::<i32>() as u32 * 2;
+                funcs.remove("fast_search").unwrap().launch(
+                    LaunchConfig {
+                        grid_dim: (query_count as u32, 1, 1),
+                        block_dim: (BLOCK_SIZE, 1, 1),
+                        shared_mem_bytes: SHARED_MEM_SIZE,
+                    },
+                    (
+                        &d_query_vectors,
+                        &d_vector_database,
+                        query_count as i32,
+                        vector_count as i32,
+                        vector_dim as i32,
+                        max_distance_sq,
+                        SHARED_MEM_SIZE as i32,
+                        &d_result_indexes,
+                    ),
+                )?;
+            }
+
+            /* Get data from kernel */
+            let result_indexes = dev.dtoh_sync_copy(&d_result_indexes)?;
+            let indexes: Vec<usize> = result_indexes
+                .into_iter()
+                .filter_map(|i| {
+                    if i != std::i32::MAX {
+                        Some(i as usize)
+                    } else {
+                        None
+                    }
+                })
+                .collect();
+
+            if indexes.len() == query_count {
+                Ok(Some(Solution { indexes }))
+            } else {
+                Ok(None)
+            }
+        }
+    }
+
+    #[cfg(feature = "cuda")]
+    pub use gpu_optimisation::{cuda_solve_challenge, KERNEL};
+}
+
+pub fn help() {
+    println!("No help information available.");
+}
--- a/tig-algorithms/src/vector_search/mod.rs
+++ b/tig-algorithms/src/vector_search/mod.rs
@ -56,7 +56,8 @@

 // c004_a029

-// c004_a030
+pub mod fast_search_cuda;
+pub use fast_search_cuda as c004_a030;

 // c004_a031