From d10de9c07abf7f003207d5c7f24955dbe95fed60 Mon Sep 17 00:00:00 2001 From: 0x8e671c8f04fdcd2a519c984784a136594d5e8cd2 Date: Mon, 19 Jan 2026 22:15:05 +0000 Subject: [PATCH] "Player 0x8e671c8f04fdcd2a519c984784a136594d5e8cd2 submitted code hypergraph/freud_opt" --- .../src/hypergraph/freud_opt/README.md | 23 + .../src/hypergraph/freud_opt/freud_opt.cu | 483 +++++++++++++ .../src/hypergraph/freud_opt/mod.rs | 654 ++++++++++++++++++ tig-algorithms/src/hypergraph/mod.rs | 3 +- 4 files changed, 1162 insertions(+), 1 deletion(-) create mode 100644 tig-algorithms/src/hypergraph/freud_opt/README.md create mode 100644 tig-algorithms/src/hypergraph/freud_opt/freud_opt.cu create mode 100644 tig-algorithms/src/hypergraph/freud_opt/mod.rs diff --git a/tig-algorithms/src/hypergraph/freud_opt/README.md b/tig-algorithms/src/hypergraph/freud_opt/README.md new file mode 100644 index 00000000..8cbcdb7d --- /dev/null +++ b/tig-algorithms/src/hypergraph/freud_opt/README.md @@ -0,0 +1,23 @@ +# TIG Code Submission + +## Submission Details + +* **Challenge Name:** hypergraph +* **Algorithm Name:** freud_opt +* **Copyright:** 2025 ChervovNikita +* **Identity of Submitter:** ChervovNikita +* **Identity of Creator of Algorithmic Method:** null +* **Unique Algorithm Identifier (UAI):** null + +## License + +The files in this folder are under the following licenses: +* TIG Benchmarker Outbound License +* TIG Commercial License +* TIG Inbound Game License +* TIG Innovator Outbound Game License +* TIG Open Data License +* TIG THV Game License + +Copies of the licenses can be obtained at: +https://github.com/tig-foundation/tig-monorepo/tree/main/docs/licenses \ No newline at end of file diff --git a/tig-algorithms/src/hypergraph/freud_opt/freud_opt.cu b/tig-algorithms/src/hypergraph/freud_opt/freud_opt.cu new file mode 100644 index 00000000..6c162648 --- /dev/null +++ b/tig-algorithms/src/hypergraph/freud_opt/freud_opt.cu @@ -0,0 +1,483 @@ +#include +#include + +extern "C" __global__ void hyperedge_clustering( + const int num_hyperedges, + const int num_clusters, + const int *hyperedge_offsets, + int *hyperedge_clusters +) { + int hedge = blockIdx.x * blockDim.x + threadIdx.x; + + if (hedge < num_hyperedges) { + int start = hyperedge_offsets[hedge]; + int end = hyperedge_offsets[hedge + 1]; + int hedge_size = end - start; + + int quarter_clusters = num_clusters >> 2; + int cluster_mask = quarter_clusters - 1; + + int bucket = (hedge_size > 8) ? 3 : + (hedge_size > 4) ? 2 : + (hedge_size > 2) ? 
1 : 0; + int cluster = bucket * quarter_clusters + (hedge & cluster_mask); + + hyperedge_clusters[hedge] = cluster; + } +} + +extern "C" __global__ void compute_node_preferences( + const int num_nodes, + const int num_parts, + const int num_hedge_clusters, + const int *node_hyperedges, + const int *node_offsets, + const int *hyperedge_clusters, + const int *hyperedge_offsets, + int *pref_parts, + int *pref_priorities +) { + int node = blockIdx.x * blockDim.x + threadIdx.x; + + if (node < num_nodes) { + int start = node_offsets[node]; + int end = node_offsets[node + 1]; + int node_degree = end - start; + + int cluster_votes[64]; + int max_clusters = min(num_hedge_clusters, 64); + for (int i = 0; i < max_clusters; i++) { + cluster_votes[i] = 0; + } + + int max_votes = 0; + int best_cluster = 0; + + for (int j = start; j < end; j++) { + int hyperedge = node_hyperedges[j]; + int cluster = hyperedge_clusters[hyperedge]; + + if (cluster >= 0 && cluster < max_clusters) { + int hedge_start = hyperedge_offsets[hyperedge]; + int hedge_end = hyperedge_offsets[hyperedge + 1]; + int hedge_size = hedge_end - hedge_start; + int weight = (hedge_size <= 2) ? 6 : + (hedge_size <= 4) ? 4 : + (hedge_size <= 8) ? 2 : 1; + + cluster_votes[cluster] += weight; + + if (cluster_votes[cluster] > max_votes || + (cluster_votes[cluster] == max_votes && cluster < best_cluster)) { + max_votes = cluster_votes[cluster]; + best_cluster = cluster; + } + } + } + + int base_part = (num_parts > 0) ? (best_cluster % num_parts) : 0; + int target_partition = base_part; + + pref_parts[node] = target_partition; + int degree_weight = node_degree > 255 ? 255 : node_degree; + pref_priorities[node] = (max_votes << 16) + (degree_weight << 8) + (num_parts - (node % num_parts)); + } +} + +extern "C" __global__ void execute_node_assignments( + const int num_nodes, + const int num_parts, + const int max_part_size, + const int *sorted_nodes, + const int *sorted_parts, + int *partition, + int *nodes_in_part +) { + if (blockIdx.x == 0 && threadIdx.x == 0) { + for (int i = 0; i < num_nodes; i++) { + int node = sorted_nodes[i]; + int preferred_part = sorted_parts[i]; + + if (node >= 0 && node < num_nodes && preferred_part >= 0 && preferred_part < num_parts) { + bool assigned = false; + for (int attempt = 0; attempt < num_parts; attempt++) { + int try_part = (preferred_part + attempt) % num_parts; + if (nodes_in_part[try_part] < max_part_size) { + partition[node] = try_part; + nodes_in_part[try_part]++; + assigned = true; + break; + } + } + + if (!assigned) { + int fallback_part = node % num_parts; + partition[node] = fallback_part; + nodes_in_part[fallback_part]++; + } + } + } + } +} + +extern "C" __global__ void precompute_edge_flags( + const int num_hyperedges, + const int num_nodes, + const int *hyperedge_nodes, + const int *hyperedge_offsets, + const int *partition, + unsigned long long *edge_flags_all, + unsigned long long *edge_flags_double +) { + int hedge = blockIdx.x * blockDim.x + threadIdx.x; + + if (hedge < num_hyperedges) { + int start = hyperedge_offsets[hedge]; + int end = hyperedge_offsets[hedge + 1]; + + unsigned long long flags_all = 0; + unsigned long long flags_double = 0; + + for (int k = start; k < end; k++) { + int node = hyperedge_nodes[k]; + if (node >= 0 && node < num_nodes) { + int part = partition[node]; + if (part >= 0 && part < 64) { + unsigned long long bit = 1ULL << part; + flags_double |= (flags_all & bit); + flags_all |= bit; + } + } + } + + edge_flags_all[hedge] = flags_all; + edge_flags_double[hedge] = 
flags_double; + } +} + +extern "C" __global__ void compute_refinement_moves( + const int num_nodes, + const int num_parts, + const int max_part_size, + const int *node_hyperedges, + const int *node_offsets, + const int *partition, + const int *nodes_in_part, + const unsigned long long *edge_flags_all, + const unsigned long long *edge_flags_double, + int *move_parts, + int *move_priorities, + int *num_valid_moves, + unsigned long long *global_edge_flags +) { + int node = blockIdx.x * blockDim.x + threadIdx.x; + + if (node < num_nodes) { + move_parts[node] = partition[node]; + move_priorities[node] = 0; + + int current_part = partition[node]; + if (current_part < 0 || current_part >= num_parts || nodes_in_part[current_part] <= 1) return; + + int start = node_offsets[node]; + int end = node_offsets[node + 1]; + int node_degree = end - start; + int degree_weight = node_degree > 255 ? 255 : node_degree; + int used_degree = node_degree > 1024 ? 1024 : node_degree; + + unsigned long long *edge_flags = &global_edge_flags[node * 1024]; + unsigned long long cur_node_bit = 1ULL << current_part; + + for (int j = 0; j < used_degree; j++) { + int rel = (int)(((long long)j * node_degree) / used_degree); + int hyperedge = node_hyperedges[start + rel]; + + unsigned long long flags_all = edge_flags_all[hyperedge]; + unsigned long long flags_double = edge_flags_double[hyperedge]; + + edge_flags[j] = (flags_all & ~cur_node_bit) | (flags_double & cur_node_bit); + } + + int original_cost = 0; + for (int j = 0; j < used_degree; j++) { + int lambda = __popcll(edge_flags[j] | cur_node_bit); + if (lambda > 1) { + original_cost += (lambda - 1); + } + } + + int candidates[64]; + int num_candidates = 0; + bool seen[64] = {false}; + + for (int j = 0; j < used_degree; j++) { + unsigned long long flags = edge_flags[j]; + + while (flags) { + int bit = __ffsll(flags) - 1; + flags &= ~(1ULL << bit); + if (bit != current_part && !seen[bit] && num_candidates < 64) { + candidates[num_candidates++] = bit; + seen[bit] = true; + } + } + } + + int best_gain = 0; + int best_target = current_part; + + for (int i = 0; i < num_candidates; i++) { + int target_part = candidates[i]; + if (target_part < 0 || target_part >= num_parts) continue; + if (nodes_in_part[target_part] >= max_part_size) continue; + + int new_cost = 0; + for (int j = 0; j < used_degree; j++) { + int lambda = __popcll(edge_flags[j] | (1ULL << target_part)); + if (lambda > 1) { + new_cost += (lambda - 1); + } + } + + int basic_gain = original_cost - new_cost; + + int current_size = nodes_in_part[current_part]; + int target_size = nodes_in_part[target_part]; + int balance_bonus = 0; + + if (current_size > target_size + 1) { + balance_bonus = 4; + } + + int total_gain = basic_gain + balance_bonus; + + if (total_gain > best_gain || + (total_gain == best_gain && target_part < best_target)) { + best_gain = total_gain; + best_target = target_part; + } + } + + if (best_gain > 0 && best_target != current_part) { + move_parts[node] = best_target; + move_priorities[node] = (best_gain << 16) + (degree_weight << 8) + (num_parts - (node % num_parts)); + atomicAdd(num_valid_moves, 1); + } + } +} + +extern "C" __global__ void execute_refinement_moves( + const int num_valid_moves, + const int *sorted_nodes, + const int *sorted_parts, + const int max_part_size, + int *partition, + int *nodes_in_part, + int *moves_executed +) { + if (blockIdx.x == 0 && threadIdx.x == 0) { + for (int i = 0; i < num_valid_moves; i++) { + int node = sorted_nodes[i]; + int target_part = sorted_parts[i]; + 
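+            // Moves are applied sequentially by a single thread so the balance
+            // constraints can be re-checked against the current partition sizes
+            // (and against moves already committed earlier in this pass) before
+            // each move is executed.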
+ if (node >= 0 && target_part >= 0) { + int current_part = partition[node]; + + if (current_part >= 0 && + nodes_in_part[target_part] < max_part_size && + nodes_in_part[current_part] > 1 && + partition[node] == current_part) { + + partition[node] = target_part; + nodes_in_part[current_part]--; + nodes_in_part[target_part]++; + (*moves_executed)++; + } + } + } + } +} + +extern "C" __global__ void radix_histogram_chunked( + const int n, + const int num_chunks, + const int *keys, + const int shift, + int *chunk_histograms +) { + int chunk = blockIdx.x; + if (chunk >= num_chunks) return; + + __shared__ int local_hist[256]; + + for (int i = threadIdx.x; i < 256; i += blockDim.x) { + local_hist[i] = 0; + } + __syncthreads(); + + int chunk_start = chunk * 256; + int chunk_end = min(chunk_start + 256, n); + + for (int i = chunk_start + threadIdx.x; i < chunk_end; i += blockDim.x) { + int digit = (keys[i] >> shift) & 0xFF; + atomicAdd(&local_hist[digit], 1); + } + __syncthreads(); + + for (int d = threadIdx.x; d < 256; d += blockDim.x) { + chunk_histograms[chunk * 256 + d] = local_hist[d]; + } +} + +extern "C" __global__ void radix_prefix_and_scatter( + const int n, + const int num_chunks, + const int *keys_in, + const int *vals_in, + const int shift, + const int *chunk_histograms, + int *chunk_offsets, + int *keys_out, + int *vals_out, + int *ready_flag +) { + if (blockIdx.x == 0 && threadIdx.x == 0) { + int digit_totals[256]; + for (int d = 0; d < 256; d++) { + digit_totals[d] = 0; + for (int c = 0; c < num_chunks; c++) { + digit_totals[d] += chunk_histograms[c * 256 + d]; + } + } + + int digit_starts[256]; + int sum = 0; + for (int d = 0; d < 256; d++) { + digit_starts[d] = sum; + sum += digit_totals[d]; + } + + int running[256]; + for (int d = 0; d < 256; d++) running[d] = digit_starts[d]; + + for (int c = 0; c < num_chunks; c++) { + for (int d = 0; d < 256; d++) { + chunk_offsets[c * 256 + d] = running[d]; + running[d] += chunk_histograms[c * 256 + d]; + } + } + + __threadfence(); + atomicExch(ready_flag, 1); + } + + if (threadIdx.x == 0) { + while (atomicAdd(ready_flag, 0) == 0) {} + } + __syncthreads(); + + int chunk = blockIdx.x; + if (chunk >= num_chunks) return; + + __shared__ int offsets[256]; + + for (int d = threadIdx.x; d < 256; d += blockDim.x) { + offsets[d] = chunk_offsets[chunk * 256 + d]; + } + __syncthreads(); + + int chunk_start = chunk * 256; + int chunk_end = min(chunk_start + 256, n); + + if (threadIdx.x == 0) { + for (int i = chunk_start; i < chunk_end; i++) { + int key = keys_in[i]; + int digit = (key >> shift) & 0xFF; + int pos = offsets[digit]++; + keys_out[pos] = key; + vals_out[pos] = vals_in[i]; + } + } +} + +extern "C" __global__ void init_indices( + const int n, + int *indices +) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) { + indices[i] = i; + } +} + +extern "C" __global__ void invert_keys( + const int n, + const int max_key, + const int *keys_in, + int *keys_out +) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) { + keys_out[i] = max_key - keys_in[i]; + } +} + +extern "C" __global__ void gather_sorted( + const int n, + const int *sorted_indices, + const int *src, + int *dst +) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) { + dst[i] = src[sorted_indices[i]]; + } +} + +extern "C" __global__ void balance_final( + const int num_nodes, + const int num_parts, + const int min_part_size, + const int max_part_size, + int *partition, + 
int *nodes_in_part +) { + if (blockIdx.x == 0 && threadIdx.x == 0) { + for (int part = 0; part < num_parts; part++) { + while (nodes_in_part[part] < min_part_size) { + bool moved = false; + for (int other_part = 0; other_part < num_parts && !moved; other_part++) { + if (other_part != part && nodes_in_part[other_part] > min_part_size) { + for (int node = 0; node < num_nodes; node++) { + if (partition[node] == other_part) { + partition[node] = part; + nodes_in_part[other_part]--; + nodes_in_part[part]++; + moved = true; + break; + } + } + } + } + if (!moved) break; + } + } + + for (int part = 0; part < num_parts; part++) { + while (nodes_in_part[part] > max_part_size) { + bool moved = false; + for (int other_part = 0; other_part < num_parts && !moved; other_part++) { + if (other_part != part && nodes_in_part[other_part] < max_part_size) { + for (int node = 0; node < num_nodes; node++) { + if (partition[node] == part) { + partition[node] = other_part; + nodes_in_part[part]--; + nodes_in_part[other_part]++; + moved = true; + break; + } + } + } + } + if (!moved) break; + } + } + } +} diff --git a/tig-algorithms/src/hypergraph/freud_opt/mod.rs b/tig-algorithms/src/hypergraph/freud_opt/mod.rs new file mode 100644 index 00000000..721e991d --- /dev/null +++ b/tig-algorithms/src/hypergraph/freud_opt/mod.rs @@ -0,0 +1,654 @@ +use cudarc::{ + driver::{safe::LaunchConfig, CudaModule, CudaStream, PushKernelArg}, + runtime::sys::cudaDeviceProp, +}; +use std::sync::Arc; +use std::time::Instant; +use serde_json::{Map, Value}; +use tig_challenges::hypergraph::*; + +pub fn help() { + println!("Hypergraph Partitioning Algorithm"); + println!("Adaptive clustering with GPU-accelerated refinement"); + println!(); + println!("Hyperparameters:"); + println!(" refinement - Number of refinement rounds (default: 500, range: 50-5000)"); + println!(); + println!("Usage:"); + println!(" Set the 'refinement' parameter in your benchmarker config"); + println!(" to balance between solution quality and runtime."); +} + +pub fn solve_challenge( + challenge: &Challenge, + save_solution: &dyn Fn(&Solution) -> anyhow::Result<()>, + hyperparameters: &Option>, + module: Arc, + stream: Arc, + prop: &cudaDeviceProp, +) -> anyhow::Result<()> { + println!(">>> solve_challenge START"); + let total_start = Instant::now(); + + let dummy_partition: Vec = (0..challenge.num_nodes as u32) + .map(|i| i % challenge.num_parts as u32) + .collect(); + save_solution(&Solution { partition: dummy_partition })?; + + let block_size = std::cmp::min(128, prop.maxThreadsPerBlock as u32); + + let t_load = Instant::now(); + let hyperedge_cluster_kernel = module.load_function("hyperedge_clustering")?; + let compute_preferences_kernel = module.load_function("compute_node_preferences")?; + let execute_assignments_kernel = module.load_function("execute_node_assignments")?; + let precompute_edge_flags_kernel = module.load_function("precompute_edge_flags")?; + let compute_moves_kernel = module.load_function("compute_refinement_moves")?; + let execute_moves_kernel = module.load_function("execute_refinement_moves")?; + let balance_kernel = module.load_function("balance_final")?; + let radix_hist_kernel = module.load_function("radix_histogram_chunked")?; + let radix_prefix_scatter_kernel = module.load_function("radix_prefix_and_scatter")?; + let init_indices_kernel = module.load_function("init_indices")?; + let invert_keys_kernel = module.load_function("invert_keys")?; + let gather_sorted_kernel = module.load_function("gather_sorted")?; + let t_load_elapsed = 
t_load.elapsed(); + + let cfg = LaunchConfig { + grid_dim: ((challenge.num_nodes as u32 + block_size - 1) / block_size, 1, 1), + block_dim: (block_size, 1, 1), + shared_mem_bytes: 0, + }; + + let one_thread_cfg = LaunchConfig { + grid_dim: (1, 1, 1), + block_dim: (1, 1, 1), + shared_mem_bytes: 0, + }; + + let hedge_cfg = LaunchConfig { + grid_dim: ((challenge.num_hyperedges as u32 + block_size - 1) / block_size, 1, 1), + block_dim: (block_size, 1, 1), + shared_mem_bytes: 0, + }; + + let mut num_hedge_clusters = 64; + + let t_alloc = Instant::now(); + let mut d_hyperedge_clusters = stream.alloc_zeros::(challenge.num_hyperedges as usize)?; + let mut d_partition = stream.alloc_zeros::(challenge.num_nodes as usize)?; + let mut d_nodes_in_part = stream.alloc_zeros::(challenge.num_parts as usize)?; + + let mut d_pref_parts = stream.alloc_zeros::(challenge.num_nodes as usize)?; + let mut d_pref_priorities = stream.alloc_zeros::(challenge.num_nodes as usize)?; + + let mut d_move_parts = stream.alloc_zeros::(challenge.num_nodes as usize)?; + let mut d_move_priorities = stream.alloc_zeros::(challenge.num_nodes as usize)?; + + let buffer_size = (challenge.num_nodes as usize) * 1024; + let mut d_global_edge_flags = stream.alloc_zeros::(buffer_size)?; + + let mut d_edge_flags_all = stream.alloc_zeros::(challenge.num_hyperedges as usize)?; + let mut d_edge_flags_double = stream.alloc_zeros::(challenge.num_hyperedges as usize)?; + + let n = challenge.num_nodes as usize; + let mut d_sort_keys_a = stream.alloc_zeros::(n)?; + let mut d_sort_keys_b = stream.alloc_zeros::(n)?; + let mut d_sort_vals_a = stream.alloc_zeros::(n)?; + let mut d_sort_vals_b = stream.alloc_zeros::(n)?; + let mut d_sorted_move_parts = stream.alloc_zeros::(n)?; + + let num_chunks: i32 = ((n + 255) / 256) as i32; + let mut d_chunk_histograms = stream.alloc_zeros::((num_chunks as usize) * 256)?; + let mut d_chunk_offsets = stream.alloc_zeros::((num_chunks as usize) * 256)?; + let mut d_ready_flag = stream.alloc_zeros::(1)?; + let t_alloc_elapsed = t_alloc.elapsed(); + + let radix_cfg = LaunchConfig { + grid_dim: (num_chunks as u32, 1, 1), + block_dim: (256, 1, 1), + shared_mem_bytes: 0, + }; + + let mut sorted_move_nodes: Vec = Vec::with_capacity(n); + let mut sorted_move_parts_cpu: Vec = Vec::with_capacity(n); + let mut valid_indices: Vec = Vec::with_capacity(n); + + let default_refinement = if challenge.num_hyperedges < 20_000 { + 400usize + } else { + 500usize + }; + + println!("refinement: {:?}", hyperparameters.as_ref().and_then(|p| p.get("refinement"))); + + let refinement_rounds = if let Some(params) = hyperparameters { + params.get("refinement") + .and_then(|v| v.as_i64()) + .map(|v| v.clamp(50, 5000) as usize) + .unwrap_or(default_refinement) + } else { + default_refinement + }; + + let t_init = Instant::now(); + unsafe { + stream.launch_builder(&hyperedge_cluster_kernel) + .arg(&(challenge.num_hyperedges as i32)) + .arg(&(num_hedge_clusters as i32)) + .arg(&challenge.d_hyperedge_offsets) + .arg(&mut d_hyperedge_clusters) + .launch(LaunchConfig { + grid_dim: ((challenge.num_hyperedges as u32 + block_size - 1) / block_size, 1, 1), + block_dim: (block_size, 1, 1), + shared_mem_bytes: 0, + })?; + } + + unsafe { + stream.launch_builder(&compute_preferences_kernel) + .arg(&(challenge.num_nodes as i32)) + .arg(&(challenge.num_parts as i32)) + .arg(&(num_hedge_clusters as i32)) + .arg(&challenge.d_node_hyperedges) + .arg(&challenge.d_node_offsets) + .arg(&d_hyperedge_clusters) + .arg(&challenge.d_hyperedge_offsets) + .arg(&mut 
d_pref_parts) + .arg(&mut d_pref_priorities) + .launch(cfg.clone())?; + } + stream.synchronize()?; + + let pref_parts = stream.memcpy_dtov(&d_pref_parts)?; + let pref_priorities = stream.memcpy_dtov(&d_pref_priorities)?; + + let mut indices: Vec = (0..challenge.num_nodes as usize).collect(); + indices.sort_unstable_by(|&a, &b| pref_priorities[b].cmp(&pref_priorities[a])); + + let sorted_nodes: Vec = indices.iter().map(|&i| i as i32).collect(); + let sorted_parts: Vec = indices.iter().map(|&i| pref_parts[i]).collect(); + + let d_sorted_nodes = stream.memcpy_stod(&sorted_nodes)?; + let d_sorted_parts = stream.memcpy_stod(&sorted_parts)?; + + unsafe { + stream.launch_builder(&execute_assignments_kernel) + .arg(&(challenge.num_nodes as i32)) + .arg(&(challenge.num_parts as i32)) + .arg(&(challenge.max_part_size as i32)) + .arg(&d_sorted_nodes) + .arg(&d_sorted_parts) + .arg(&mut d_partition) + .arg(&mut d_nodes_in_part) + .launch(one_thread_cfg.clone())?; + } + stream.synchronize()?; + let t_init_elapsed = t_init.elapsed(); + + let mut stagnant_rounds = 0; + let early_exit_round = if challenge.num_hyperedges < 20_000 { 90 } else { 70 }; + let max_stagnant_rounds = if challenge.num_hyperedges < 20_000 { 30 } else { 20 }; + + let t_refine1 = Instant::now(); + let mut t_gpu_kernels = 0u128; + let mut t_gpu_sort = 0u128; + let mut t_cpu_sort = 0u128; + let mut t_execute = 0u128; + let mut actual_rounds = 0usize; + let mut gpu_sort_count = 0usize; + let mut cpu_sort_count = 0usize; + + for round in 0..refinement_rounds { + actual_rounds = round + 1; + let zero = vec![0i32]; + let mut d_num_valid_moves = stream.memcpy_stod(&zero)?; + + let t0 = Instant::now(); + unsafe { + stream.launch_builder(&precompute_edge_flags_kernel) + .arg(&(challenge.num_hyperedges as i32)) + .arg(&(challenge.num_nodes as i32)) + .arg(&challenge.d_hyperedge_nodes) + .arg(&challenge.d_hyperedge_offsets) + .arg(&d_partition) + .arg(&mut d_edge_flags_all) + .arg(&mut d_edge_flags_double) + .launch(hedge_cfg.clone())?; + } + + unsafe { + stream.launch_builder(&compute_moves_kernel) + .arg(&(challenge.num_nodes as i32)) + .arg(&(challenge.num_parts as i32)) + .arg(&(challenge.max_part_size as i32)) + .arg(&challenge.d_node_hyperedges) + .arg(&challenge.d_node_offsets) + .arg(&d_partition) + .arg(&d_nodes_in_part) + .arg(&d_edge_flags_all) + .arg(&d_edge_flags_double) + .arg(&mut d_move_parts) + .arg(&mut d_move_priorities) + .arg(&mut d_num_valid_moves) + .arg(&mut d_global_edge_flags) + .launch(cfg.clone())?; + } + stream.synchronize()?; + t_gpu_kernels += t0.elapsed().as_micros(); + + let num_valid_moves = stream.memcpy_dtov(&d_num_valid_moves)?[0]; + if num_valid_moves == 0 { + break; + } + + let t2 = Instant::now(); + let move_priorities_vec = stream.memcpy_dtov(&d_move_priorities)?; + let max_priority = move_priorities_vec.iter().copied().max().unwrap_or(0); + + let num_passes = if max_priority == 0 { + 0 + } else if max_priority < 256 { + 1 + } else if max_priority < 65536 { + 2 + } else if max_priority < 16777216 { + 3 + } else { + 4 + }; + + let use_gpu_sort = num_passes > 0 && num_passes <= 3; + + let (d_sorted_nodes_ref, d_sorted_parts_ref): (&cudarc::driver::CudaSlice, &cudarc::driver::CudaSlice); + let d_sorted_nodes_tmp: cudarc::driver::CudaSlice; + let d_sorted_parts_tmp: cudarc::driver::CudaSlice; + let num_to_process: i32; + + if use_gpu_sort { + unsafe { + stream.launch_builder(&invert_keys_kernel) + .arg(&(n as i32)) + .arg(&max_priority) + .arg(&d_move_priorities) + .arg(&mut d_sort_keys_a) + 
.launch(cfg.clone())?; + + stream.launch_builder(&init_indices_kernel) + .arg(&(n as i32)) + .arg(&mut d_sort_vals_a) + .launch(cfg.clone())?; + } + + for pass in 0..num_passes { + let shift = pass * 8; + + stream.memset_zeros(&mut d_ready_flag)?; + + if pass % 2 == 0 { + unsafe { + stream.launch_builder(&radix_hist_kernel) + .arg(&(n as i32)) + .arg(&num_chunks) + .arg(&d_sort_keys_a) + .arg(&shift) + .arg(&mut d_chunk_histograms) + .launch(radix_cfg.clone())?; + + stream.launch_builder(&radix_prefix_scatter_kernel) + .arg(&(n as i32)) + .arg(&num_chunks) + .arg(&d_sort_keys_a) + .arg(&d_sort_vals_a) + .arg(&shift) + .arg(&d_chunk_histograms) + .arg(&mut d_chunk_offsets) + .arg(&mut d_sort_keys_b) + .arg(&mut d_sort_vals_b) + .arg(&mut d_ready_flag) + .launch(radix_cfg.clone())?; + } + } else { + unsafe { + stream.launch_builder(&radix_hist_kernel) + .arg(&(n as i32)) + .arg(&num_chunks) + .arg(&d_sort_keys_b) + .arg(&shift) + .arg(&mut d_chunk_histograms) + .launch(radix_cfg.clone())?; + + stream.launch_builder(&radix_prefix_scatter_kernel) + .arg(&(n as i32)) + .arg(&num_chunks) + .arg(&d_sort_keys_b) + .arg(&d_sort_vals_b) + .arg(&shift) + .arg(&d_chunk_histograms) + .arg(&mut d_chunk_offsets) + .arg(&mut d_sort_keys_a) + .arg(&mut d_sort_vals_a) + .arg(&mut d_ready_flag) + .launch(radix_cfg.clone())?; + } + } + } + + let sorted_vals = if num_passes % 2 == 0 { &d_sort_vals_a } else { &d_sort_vals_b }; + + unsafe { + stream.launch_builder(&gather_sorted_kernel) + .arg(&(n as i32)) + .arg(sorted_vals) + .arg(&d_move_parts) + .arg(&mut d_sorted_move_parts) + .launch(cfg.clone())?; + } + stream.synchronize()?; + + d_sorted_nodes_ref = sorted_vals; + d_sorted_parts_ref = &d_sorted_move_parts; + num_to_process = n as i32; + t_gpu_sort += t2.elapsed().as_micros(); + gpu_sort_count += 1; + } else { + let t_cpu = Instant::now(); + let move_parts = stream.memcpy_dtov(&d_move_parts)?; + + valid_indices.clear(); + valid_indices.extend( + move_priorities_vec + .iter() + .enumerate() + .filter(|(_, &priority)| priority > 0) + .map(|(i, _)| i), + ); + + if valid_indices.is_empty() { + break; + } + + valid_indices.sort_unstable_by(|&a, &b| move_priorities_vec[b].cmp(&move_priorities_vec[a])); + + sorted_move_nodes.clear(); + sorted_move_parts_cpu.clear(); + sorted_move_nodes.extend(valid_indices.iter().map(|&i| i as i32)); + sorted_move_parts_cpu.extend(valid_indices.iter().map(|&i| move_parts[i])); + + d_sorted_nodes_tmp = stream.memcpy_stod(&sorted_move_nodes)?; + d_sorted_parts_tmp = stream.memcpy_stod(&sorted_move_parts_cpu)?; + d_sorted_nodes_ref = &d_sorted_nodes_tmp; + d_sorted_parts_ref = &d_sorted_parts_tmp; + num_to_process = sorted_move_nodes.len() as i32; + t_cpu_sort += t_cpu.elapsed().as_micros(); + cpu_sort_count += 1; + } + + let mut d_moves_executed = stream.alloc_zeros::(1)?; + + let t4 = Instant::now(); + unsafe { + stream.launch_builder(&execute_moves_kernel) + .arg(&num_to_process) + .arg(d_sorted_nodes_ref) + .arg(d_sorted_parts_ref) + .arg(&(challenge.max_part_size as i32)) + .arg(&mut d_partition) + .arg(&mut d_nodes_in_part) + .arg(&mut d_moves_executed) + .launch(one_thread_cfg.clone())?; + } + stream.synchronize()?; + t_execute += t4.elapsed().as_micros(); + + let moves_executed = stream.memcpy_dtov(&d_moves_executed)?[0]; + if moves_executed == 0 { + break; + } + + if moves_executed == 1 && round > early_exit_round { + stagnant_rounds += 1; + if stagnant_rounds > max_stagnant_rounds { + break; + } + } else { + stagnant_rounds = 0; + } + } + + let t_refine1_elapsed = 
t_refine1.elapsed(); + + let t_balance = Instant::now(); + unsafe { + stream.launch_builder(&balance_kernel) + .arg(&(challenge.num_nodes as i32)) + .arg(&(challenge.num_parts as i32)) + .arg(&1) + .arg(&(challenge.max_part_size as i32)) + .arg(&mut d_partition) + .arg(&mut d_nodes_in_part) + .launch(one_thread_cfg.clone())?; + } + stream.synchronize()?; + let t_balance_elapsed = t_balance.elapsed(); + + let t_refine2 = Instant::now(); + for _ in 0..24 { + let zero = vec![0i32]; + let mut d_num_valid_moves = stream.memcpy_stod(&zero)?; + + unsafe { + stream.launch_builder(&precompute_edge_flags_kernel) + .arg(&(challenge.num_hyperedges as i32)) + .arg(&(challenge.num_nodes as i32)) + .arg(&challenge.d_hyperedge_nodes) + .arg(&challenge.d_hyperedge_offsets) + .arg(&d_partition) + .arg(&mut d_edge_flags_all) + .arg(&mut d_edge_flags_double) + .launch(hedge_cfg.clone())?; + } + + unsafe { + stream.launch_builder(&compute_moves_kernel) + .arg(&(challenge.num_nodes as i32)) + .arg(&(challenge.num_parts as i32)) + .arg(&(challenge.max_part_size as i32)) + .arg(&challenge.d_node_hyperedges) + .arg(&challenge.d_node_offsets) + .arg(&d_partition) + .arg(&d_nodes_in_part) + .arg(&d_edge_flags_all) + .arg(&d_edge_flags_double) + .arg(&mut d_move_parts) + .arg(&mut d_move_priorities) + .arg(&mut d_num_valid_moves) + .arg(&mut d_global_edge_flags) + .launch(cfg.clone())?; + } + stream.synchronize()?; + + let num_valid_moves = stream.memcpy_dtov(&d_num_valid_moves)?[0]; + if num_valid_moves == 0 { + break; + } + + let move_priorities_vec2 = stream.memcpy_dtov(&d_move_priorities)?; + let max_priority2 = move_priorities_vec2.iter().copied().max().unwrap_or(0); + + let num_passes2 = if max_priority2 == 0 { + 0 + } else if max_priority2 < 256 { + 1 + } else if max_priority2 < 65536 { + 2 + } else if max_priority2 < 16777216 { + 3 + } else { + 4 + }; + + let use_gpu_sort = num_passes2 > 0 && num_passes2 <= 3; + + let d_sorted_nodes_ref2: &cudarc::driver::CudaSlice; + let d_sorted_parts_ref2: &cudarc::driver::CudaSlice; + let d_sorted_nodes_tmp2: cudarc::driver::CudaSlice; + let d_sorted_parts_tmp2: cudarc::driver::CudaSlice; + let num_to_process2: i32; + + if use_gpu_sort { + unsafe { + stream.launch_builder(&invert_keys_kernel) + .arg(&(n as i32)) + .arg(&max_priority2) + .arg(&d_move_priorities) + .arg(&mut d_sort_keys_a) + .launch(cfg.clone())?; + + stream.launch_builder(&init_indices_kernel) + .arg(&(n as i32)) + .arg(&mut d_sort_vals_a) + .launch(cfg.clone())?; + } + + for pass in 0..num_passes2 { + let shift = pass * 8; + + stream.memset_zeros(&mut d_ready_flag)?; + + if pass % 2 == 0 { + unsafe { + stream.launch_builder(&radix_hist_kernel) + .arg(&(n as i32)) + .arg(&num_chunks) + .arg(&d_sort_keys_a) + .arg(&shift) + .arg(&mut d_chunk_histograms) + .launch(radix_cfg.clone())?; + + stream.launch_builder(&radix_prefix_scatter_kernel) + .arg(&(n as i32)) + .arg(&num_chunks) + .arg(&d_sort_keys_a) + .arg(&d_sort_vals_a) + .arg(&shift) + .arg(&d_chunk_histograms) + .arg(&mut d_chunk_offsets) + .arg(&mut d_sort_keys_b) + .arg(&mut d_sort_vals_b) + .arg(&mut d_ready_flag) + .launch(radix_cfg.clone())?; + } + } else { + unsafe { + stream.launch_builder(&radix_hist_kernel) + .arg(&(n as i32)) + .arg(&num_chunks) + .arg(&d_sort_keys_b) + .arg(&shift) + .arg(&mut d_chunk_histograms) + .launch(radix_cfg.clone())?; + + stream.launch_builder(&radix_prefix_scatter_kernel) + .arg(&(n as i32)) + .arg(&num_chunks) + .arg(&d_sort_keys_b) + .arg(&d_sort_vals_b) + .arg(&shift) + .arg(&d_chunk_histograms) + .arg(&mut 
d_chunk_offsets) + .arg(&mut d_sort_keys_a) + .arg(&mut d_sort_vals_a) + .arg(&mut d_ready_flag) + .launch(radix_cfg.clone())?; + } + } + } + + let sorted_vals2 = if num_passes2 % 2 == 0 { &d_sort_vals_a } else { &d_sort_vals_b }; + + unsafe { + stream.launch_builder(&gather_sorted_kernel) + .arg(&(n as i32)) + .arg(sorted_vals2) + .arg(&d_move_parts) + .arg(&mut d_sorted_move_parts) + .launch(cfg.clone())?; + } + stream.synchronize()?; + + d_sorted_nodes_ref2 = sorted_vals2; + d_sorted_parts_ref2 = &d_sorted_move_parts; + num_to_process2 = n as i32; + } else { + let move_parts = stream.memcpy_dtov(&d_move_parts)?; + + valid_indices.clear(); + valid_indices.extend( + move_priorities_vec2 + .iter() + .enumerate() + .filter(|(_, &priority)| priority > 0) + .map(|(i, _)| i), + ); + + if valid_indices.is_empty() { + break; + } + + valid_indices.sort_unstable_by(|&a, &b| move_priorities_vec2[b].cmp(&move_priorities_vec2[a])); + + sorted_move_nodes.clear(); + sorted_move_parts_cpu.clear(); + sorted_move_nodes.extend(valid_indices.iter().map(|&i| i as i32)); + sorted_move_parts_cpu.extend(valid_indices.iter().map(|&i| move_parts[i])); + + d_sorted_nodes_tmp2 = stream.memcpy_stod(&sorted_move_nodes)?; + d_sorted_parts_tmp2 = stream.memcpy_stod(&sorted_move_parts_cpu)?; + d_sorted_nodes_ref2 = &d_sorted_nodes_tmp2; + d_sorted_parts_ref2 = &d_sorted_parts_tmp2; + num_to_process2 = sorted_move_nodes.len() as i32; + } + + let mut d_moves_executed = stream.alloc_zeros::(1)?; + + unsafe { + stream.launch_builder(&execute_moves_kernel) + .arg(&num_to_process2) + .arg(d_sorted_nodes_ref2) + .arg(d_sorted_parts_ref2) + .arg(&(challenge.max_part_size as i32)) + .arg(&mut d_partition) + .arg(&mut d_nodes_in_part) + .arg(&mut d_moves_executed) + .launch(one_thread_cfg.clone())?; + } + stream.synchronize()?; + + let moves_executed = stream.memcpy_dtov(&d_moves_executed)?[0]; + if moves_executed == 0 { + break; + } + } + let t_refine2_elapsed = t_refine2.elapsed(); + + let partition = stream.memcpy_dtov(&d_partition)?; + let partition_u32: Vec = partition.iter().map(|&x| x as u32).collect(); + + save_solution(&Solution { partition: partition_u32 })?; + + let total_elapsed = total_start.elapsed(); + println!("=== FULL PROFILING ==="); + println!("load_function: {:.2}ms", t_load_elapsed.as_micros() as f64 / 1000.0); + println!("alloc_zeros: {:.2}ms", t_alloc_elapsed.as_micros() as f64 / 1000.0); + println!("init (cluster+assign): {:.2}ms", t_init_elapsed.as_micros() as f64 / 1000.0); + println!("refine1 ({} rounds): {:.2}ms", actual_rounds, t_refine1_elapsed.as_micros() as f64 / 1000.0); + println!(" - GPU kernels: {:.2}ms", t_gpu_kernels as f64 / 1000.0); + println!(" - GPU sort: {:.2}ms ({} times)", t_gpu_sort as f64 / 1000.0, gpu_sort_count); + println!(" - CPU sort: {:.2}ms ({} times)", t_cpu_sort as f64 / 1000.0, cpu_sort_count); + println!(" - execute_moves: {:.2}ms", t_execute as f64 / 1000.0); + println!("balance: {:.2}ms", t_balance_elapsed.as_micros() as f64 / 1000.0); + println!("refine2 (24 rounds): {:.2}ms", t_refine2_elapsed.as_micros() as f64 / 1000.0); + println!("TOTAL: {:.2}ms", total_elapsed.as_micros() as f64 / 1000.0); + println!(">>> solve_challenge END"); + + Ok(()) +} diff --git a/tig-algorithms/src/hypergraph/mod.rs b/tig-algorithms/src/hypergraph/mod.rs index 9beec594..84b2ad14 100644 --- a/tig-algorithms/src/hypergraph/mod.rs +++ b/tig-algorithms/src/hypergraph/mod.rs @@ -20,7 +20,8 @@ // c005_a011 -// c005_a012 +pub mod freud_opt; +pub use freud_opt as c005_a012; // c005_a013