From bb2bc7f6158044ee9d2efea73c7eaf5f8c612e7b Mon Sep 17 00:00:00 2001
From: FiveMovesAhead
Date: Tue, 2 Dec 2025 16:54:19 +0000
Subject: [PATCH] Submitted hypergraph/hyper_improved

---
 .../src/hypergraph/hyper_improved/README.md  |  23 +
 .../src/hypergraph/hyper_improved/kernels.cu | 395 ++++++++++++++++++
 .../src/hypergraph/hyper_improved/mod.rs     | 240 +++++++++++
 tig-algorithms/src/hypergraph/mod.rs         |   3 +-
 4 files changed, 660 insertions(+), 1 deletion(-)
 create mode 100644 tig-algorithms/src/hypergraph/hyper_improved/README.md
 create mode 100644 tig-algorithms/src/hypergraph/hyper_improved/kernels.cu
 create mode 100644 tig-algorithms/src/hypergraph/hyper_improved/mod.rs

diff --git a/tig-algorithms/src/hypergraph/hyper_improved/README.md b/tig-algorithms/src/hypergraph/hyper_improved/README.md
new file mode 100644
index 00000000..e530ff36
--- /dev/null
+++ b/tig-algorithms/src/hypergraph/hyper_improved/README.md
@@ -0,0 +1,23 @@
# TIG Code Submission

## Submission Details

* **Challenge Name:** hypergraph
* **Algorithm Name:** hyper_improved
* **Copyright:** 2025 Rootz
* **Identity of Submitter:** Rootz
* **Identity of Creator of Algorithmic Method:** null
* **Unique Algorithm Identifier (UAI):** null

## License

The files in this folder are under the following licenses:
* TIG Benchmarker Outbound License
* TIG Commercial License
* TIG Inbound Game License
* TIG Innovator Outbound Game License
* TIG Open Data License
* TIG THV Game License

Copies of the licenses can be obtained at:
https://github.com/tig-foundation/tig-monorepo/tree/main/docs/licenses
\ No newline at end of file

diff --git a/tig-algorithms/src/hypergraph/hyper_improved/kernels.cu b/tig-algorithms/src/hypergraph/hyper_improved/kernels.cu
new file mode 100644
index 00000000..50426474
--- /dev/null
+++ b/tig-algorithms/src/hypergraph/hyper_improved/kernels.cu
@@ -0,0 +1,395 @@
/*!Copyright 2025 Rootz

Identity of Submitter Rootz

UAI null

Licensed under the TIG Inbound Game License v2.0 or (at your option) any later
version (the "License"); you may not use this file except in compliance with the
License. You may obtain a copy of the License at

https://github.com/tig-foundation/tig-monorepo/tree/main/docs/licenses

Unless required by applicable law or agreed to in writing, software distributed
under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
CONDITIONS OF ANY KIND, either express or implied. See the License for the specific
language governing permissions and limitations under the License.
+*/ +#include +#include + +extern "C" __global__ void hyperedge_clustering( + const int num_hyperedges, + const int num_clusters, + const int *hyperedge_nodes, + const int *hyperedge_offsets, + int *hyperedge_clusters +) { + int hedge = blockIdx.x * blockDim.x + threadIdx.x; + + if (hedge < num_hyperedges) { + int start = hyperedge_offsets[hedge]; + int end = hyperedge_offsets[hedge + 1]; + int hedge_size = end - start; + + int quarter_clusters = num_clusters >> 2; + int cluster_mask = quarter_clusters - 1; + + int cluster; + if (hedge_size <= 2) { + cluster = hedge & cluster_mask; + } else if (hedge_size <= 4) { + cluster = quarter_clusters + (hedge & cluster_mask); + } else if (hedge_size <= 8) { + cluster = (quarter_clusters << 1) + (hedge & cluster_mask); + } else { + cluster = (quarter_clusters * 3) + (hedge & cluster_mask); + } + + hyperedge_clusters[hedge] = cluster; + } +} + +extern "C" __global__ void compute_node_preferences( + const int num_nodes, + const int num_parts, + const int num_hedge_clusters, + const int *node_hyperedges, + const int *node_offsets, + const int *hyperedge_clusters, + const int *hyperedge_offsets, + int *pref_nodes, + int *pref_parts, + int *pref_gains, + int *pref_priorities +) { + int node = blockIdx.x * blockDim.x + threadIdx.x; + + if (node < num_nodes) { + int start = node_offsets[node]; + int end = node_offsets[node + 1]; + int node_degree = end - start; + + int cluster_votes[256]; + int max_clusters = min(num_hedge_clusters, 256); + for (int i = 0; i < max_clusters; i++) { + cluster_votes[i] = 0; + } + + int max_votes = 0; + int best_cluster = 0; + + for (int j = start; j < end; j++) { + int hyperedge = node_hyperedges[j]; + int cluster = hyperedge_clusters[hyperedge]; + + if (cluster >= 0 && cluster < max_clusters) { + int hedge_start = hyperedge_offsets[hyperedge]; + int hedge_end = hyperedge_offsets[hyperedge + 1]; + int hedge_size = hedge_end - hedge_start; + int weight = (hedge_size <= 3) ? 4 : (hedge_size <= 6) ? 
// Serial, deterministic placement: walk nodes in priority order, put each in
// its preferred partition, and probe forward when a partition is full.
extern "C" __global__ void execute_node_assignments(
    const int num_nodes,
    const int num_parts,
    const int max_part_size,
    const int *sorted_nodes,
    const int *sorted_parts,
    int *partition,
    int *nodes_in_part
) {
    if (blockIdx.x == 0 && threadIdx.x == 0) {
        for (int i = 0; i < num_nodes; i++) {
            int node = sorted_nodes[i];
            int preferred_part = sorted_parts[i];

            if (node >= 0 && node < num_nodes && preferred_part >= 0 && preferred_part < num_parts) {
                bool assigned = false;
                for (int attempt = 0; attempt < num_parts; attempt++) {
                    int try_part = (preferred_part + attempt) % num_parts;
                    if (nodes_in_part[try_part] < max_part_size) {
                        partition[node] = try_part;
                        nodes_in_part[try_part]++;
                        assigned = true;
                        break;
                    }
                }

                if (!assigned) {
                    // All partitions are full: overfill deterministically and
                    // let balance_final repair the violation.
                    int fallback_part = node % num_parts;
                    partition[node] = fallback_part;
                    nodes_in_part[fallback_part]++;
                }
            }
        }
    }
}
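execute_node_assignments is deliberately launched on a single thread: placement is order-dependent, and a serial scan over the pre-sorted preference list keeps the result deterministic. The policy is plain first-fit with wraparound probing, sketched below as a hypothetical CPU helper (the name and signature are illustrative, not part of the patch).

// First-fit placement with wraparound probing, mirroring the kernel's policy.
fn assign(node: usize, preferred: usize, num_parts: usize, max_part_size: i32, counts: &mut [i32]) -> usize {
    for attempt in 0..num_parts {
        let p = (preferred + attempt) % num_parts;
        if counts[p] < max_part_size {
            counts[p] += 1;
            return p;
        }
    }
    // Every partition is full: overfill deterministically; the final
    // balancing pass is expected to fix the violation.
    let p = node % num_parts;
    counts[p] += 1;
    p
}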
// One FM-style refinement pass: for every node, score moving it to each other
// partition by the change in (lambda - 1) connectivity cost, using one
// partition bitset per incident hyperedge held in per-node scratch buffers.
extern "C" __global__ void compute_refinement_moves(
    const int num_nodes,
    const int num_parts,
    const int max_part_size,
    const int num_hyperedges,
    const int *node_hyperedges,
    const int *node_offsets,
    const int *hyperedge_nodes,
    const int *hyperedge_offsets,
    const int *partition,
    const int *nodes_in_part,
    int *move_nodes,
    int *move_parts,
    int *move_gains,
    int *move_priorities,
    int *num_valid_moves,
    const int round,
    unsigned long long *global_edge_flags_low,
    unsigned long long *global_edge_flags_high
) {
    int node = blockIdx.x * blockDim.x + threadIdx.x;

    if (node < num_nodes) {
        move_nodes[node] = node;
        move_parts[node] = partition[node];
        move_gains[node] = 0;
        move_priorities[node] = 0;

        int current_part = partition[node];
        if (current_part < 0 || current_part >= num_parts || nodes_in_part[current_part] <= 1) return;

        int start = node_offsets[node];
        int end = node_offsets[node + 1];
        int node_degree = end - start;

        // Scratch rows hold one bitset per incident hyperedge; skip very
        // high-degree nodes that would overflow the 3000-slot budget.
        if (node_degree > 3000) return;

        // One 64-bit word covers up to 64 partitions; beyond that a second
        // word tracks partitions 64..127.
        bool use_dual_buffer = (num_parts > 64);

        unsigned long long *edge_flags_low = &global_edge_flags_low[node * 3000];
        unsigned long long *edge_flags_high = &global_edge_flags_high[node * 3000];

        for (int j = 0; j < node_degree; j++) {
            edge_flags_low[j] = 0;
            if (use_dual_buffer) {
                edge_flags_high[j] = 0;
            }

            int hyperedge = node_hyperedges[start + j];
            int hedge_start = hyperedge_offsets[hyperedge];
            int hedge_end = hyperedge_offsets[hyperedge + 1];

            // Record which partitions the *other* pins of this hyperedge occupy.
            for (int k = hedge_start; k < hedge_end; k++) {
                int other_node = hyperedge_nodes[k];
                if (other_node != node && other_node >= 0 && other_node < num_nodes) {
                    int part = partition[other_node];
                    if (part >= 0 && part < num_parts) {
                        if (use_dual_buffer) {
                            if (part < 64) {
                                edge_flags_low[j] |= 1ULL << part;
                            } else {
                                edge_flags_high[j] |= 1ULL << (part - 64);
                            }
                        } else {
                            if (part < min(num_parts, 64)) {
                                edge_flags_low[j] |= 1ULL << part;
                            }
                        }
                    }
                }
            }
        }

        // Cost of the current placement: sum over incident hyperedges of
        // (number of partitions spanned - 1).
        int original_cost = 0;
        for (int j = 0; j < node_degree; j++) {
            unsigned long long current_low = edge_flags_low[j];
            unsigned long long current_high = use_dual_buffer ? edge_flags_high[j] : 0;

            int lambda;
            if (use_dual_buffer) {
                if (current_part < 64) {
                    current_low |= 1ULL << current_part;
                } else {
                    current_high |= 1ULL << (current_part - 64);
                }
                lambda = __popcll(current_low) + __popcll(current_high);
            } else {
                lambda = __popcll(current_low | (1ULL << current_part));
            }

            if (lambda > 1) {
                original_cost += (lambda - 1);
            }
        }

        int best_gain = 0;
        int best_target = current_part;

        // Rotate the scan origin with the round number so successive rounds
        // do not always favor the same low-numbered partitions.
        for (int offset = 0; offset < num_parts; offset++) {
            int target_part = (node + round + offset) % num_parts;
            if (target_part == current_part) continue;
            if (target_part < 0 || target_part >= num_parts) continue;
            if (nodes_in_part[target_part] >= max_part_size) continue;

            int new_cost = 0;
            for (int j = 0; j < node_degree; j++) {
                unsigned long long target_low = edge_flags_low[j];
                unsigned long long target_high = use_dual_buffer ? edge_flags_high[j] : 0;

                int lambda;
                if (use_dual_buffer) {
                    if (target_part < 64) {
                        target_low |= 1ULL << target_part;
                    } else {
                        target_high |= 1ULL << (target_part - 64);
                    }
                    lambda = __popcll(target_low) + __popcll(target_high);
                } else {
                    lambda = __popcll(target_low | (1ULL << target_part));
                }

                if (lambda > 1) {
                    new_cost += (lambda - 1);
                }
            }

            int basic_gain = original_cost - new_cost;

            // Nudge moves that also improve balance, more aggressively when
            // there are fewer partitions.
            int current_size = nodes_in_part[current_part];
            int target_size = nodes_in_part[target_part];
            int balance_bonus = 0;

            if (current_size > target_size + 1) {
                if (num_parts >= 120) {
                    balance_bonus = 2;
                } else if (num_parts >= 100) {
                    balance_bonus = 3;
                } else {
                    balance_bonus = 4;
                }
            }

            int total_gain = basic_gain + balance_bonus;

            if (total_gain > best_gain ||
                (total_gain == best_gain && target_part < best_target)) {
                best_gain = total_gain;
                best_target = target_part;
            }
        }

        if (best_gain > 0 && best_target != current_part) {
            move_parts[node] = best_target;
            move_gains[node] = best_gain;
            move_priorities[node] = (best_gain << 16) + (num_parts - (node % num_parts));
            atomicAdd(num_valid_moves, 1);
        }
    }
}
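The scorer above prices a placement with the (λ − 1) connectivity metric: each incident hyperedge pays one unit for every partition it spans beyond the first, and a move's gain is the current cost minus the candidate cost. Because the kernel's two-word bitsets cover at most 128 partitions, λ reduces to two popcounts. A minimal Rust sketch under the same encoding follows; the helper names and the pre-collected (low, high) pairs of other-pin partitions are assumptions for illustration.

// (lambda - 1) cost of putting the node in `part`, given bitsets of the
// partitions occupied by the other pins of each incident hyperedge.
fn lambda_minus_one(part: u32, edge_flags: &[(u64, u64)]) -> i32 {
    let mut cost = 0i32;
    for &(mut low, mut high) in edge_flags {
        if part < 64 {
            low |= 1u64 << part;
        } else {
            high |= 1u64 << (part - 64);
        }
        // lambda = number of distinct partitions the hyperedge now spans.
        let lambda = (low.count_ones() + high.count_ones()) as i32;
        cost += lambda - 1;
    }
    cost
}

// Positive gain means the move shrinks total connectivity.
fn move_gain(from: u32, to: u32, edge_flags: &[(u64, u64)]) -> i32 {
    lambda_minus_one(from, edge_flags) - lambda_minus_one(to, edge_flags)
}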
// Apply moves serially in priority order, re-checking balance constraints so
// earlier moves cannot invalidate later ones.
extern "C" __global__ void execute_refinement_moves(
    const int num_valid_moves,
    const int *sorted_nodes,
    const int *sorted_parts,
    const int max_part_size,
    int *partition,
    int *nodes_in_part,
    int *moves_executed
) {
    if (blockIdx.x == 0 && threadIdx.x == 0) {
        for (int i = 0; i < num_valid_moves; i++) {
            int node = sorted_nodes[i];
            int target_part = sorted_parts[i];

            if (node >= 0 && target_part >= 0) {
                int current_part = partition[node];

                if (current_part >= 0 &&
                    nodes_in_part[target_part] < max_part_size &&
                    nodes_in_part[current_part] > 1 &&
                    partition[node] == current_part) {

                    partition[node] = target_part;
                    nodes_in_part[current_part]--;
                    nodes_in_part[target_part]++;
                    (*moves_executed)++;
                }
            }
        }
    }
}

// Final repair pass: top up partitions below min_part_size, then drain any
// above max_part_size, moving arbitrary nodes between partitions.
extern "C" __global__ void balance_final(
    const int num_nodes,
    const int num_parts,
    const int min_part_size,
    const int max_part_size,
    int *partition,
    int *nodes_in_part
) {
    if (blockIdx.x == 0 && threadIdx.x == 0) {
        for (int part = 0; part < num_parts; part++) {
            while (nodes_in_part[part] < min_part_size) {
                bool moved = false;
                for (int other_part = 0; other_part < num_parts && !moved; other_part++) {
                    if (other_part != part && nodes_in_part[other_part] > min_part_size) {
                        for (int node = 0; node < num_nodes; node++) {
                            if (partition[node] == other_part) {
                                partition[node] = part;
                                nodes_in_part[other_part]--;
                                nodes_in_part[part]++;
                                moved = true;
                                break;
                            }
                        }
                    }
                }
                if (!moved) break;
            }
        }

        for (int part = 0; part < num_parts; part++) {
            while (nodes_in_part[part] > max_part_size) {
                bool moved = false;
                for (int other_part = 0; other_part < num_parts && !moved; other_part++) {
                    if (other_part != part && nodes_in_part[other_part] < max_part_size) {
                        for (int node = 0; node < num_nodes; node++) {
                            if (partition[node] == part) {
                                partition[node] = other_part;
                                nodes_in_part[part]--;
                                nodes_in_part[other_part]++;
                                moved = true;
                                break;
                            }
                        }
                    }
                }
                if (!moved) break;
            }
        }
    }
}
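balance_final is a last-resort repair pass: it moves arbitrary nodes with no regard for connectivity, so refinement should ideally leave it nothing to do. The invariant it exists to establish is simply that every partition size lands inside [min_part_size, max_part_size], which is cheap to sanity-check host-side with a hypothetical helper like this:

// True when every partition size respects the balance bounds.
fn is_balanced(counts: &[i32], min_part_size: i32, max_part_size: i32) -> bool {
    counts.iter().all(|&c| (min_part_size..=max_part_size).contains(&c))
}

For example, `is_balanced(&[4, 5, 3], 1, 5)` holds, while `is_balanced(&[6, 5, 1], 1, 5)` does not.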
diff --git a/tig-algorithms/src/hypergraph/hyper_improved/mod.rs b/tig-algorithms/src/hypergraph/hyper_improved/mod.rs
new file mode 100644
index 00000000..d782f84a
--- /dev/null
+++ b/tig-algorithms/src/hypergraph/hyper_improved/mod.rs
@@ -0,0 +1,240 @@
use cudarc::{
    driver::{safe::LaunchConfig, CudaModule, CudaStream, PushKernelArg},
    runtime::sys::cudaDeviceProp,
};
use serde_json::{Map, Value};
use std::sync::Arc;
use tig_challenges::hypergraph::*;


pub fn solve_challenge(
    challenge: &Challenge,
    save_solution: &dyn Fn(&Solution) -> anyhow::Result<()>,
    hyperparameters: &Option<Map<String, Value>>,
    module: Arc<CudaModule>,
    stream: Arc<CudaStream>,
    prop: &cudaDeviceProp,
) -> anyhow::Result<()> {
    let block_size = std::cmp::min(256, prop.maxThreadsPerBlock as u32);

    let hyperedge_cluster_kernel = module.load_function("hyperedge_clustering")?;
    let compute_preferences_kernel = module.load_function("compute_node_preferences")?;
    let execute_assignments_kernel = module.load_function("execute_node_assignments")?;
    let compute_moves_kernel = module.load_function("compute_refinement_moves")?;
    let execute_moves_kernel = module.load_function("execute_refinement_moves")?;
    let balance_kernel = module.load_function("balance_final")?;

    let cfg = LaunchConfig {
        grid_dim: ((challenge.num_nodes as u32 + block_size - 1) / block_size, 1, 1),
        block_dim: (block_size, 1, 1),
        shared_mem_bytes: 0,
    };

    let one_thread_cfg = LaunchConfig {
        grid_dim: (1, 1, 1),
        block_dim: (1, 1, 1),
        shared_mem_bytes: 0,
    };

    // Empirically tuned hyperedge-cluster counts, keyed by instance size in
    // thousands of hyperedges; larger instances fall back to an env override.
    let num_hedge_clusters = if challenge.num_hyperedges < 50000 {
        let thousands = challenge.num_hyperedges / 1000;
        match thousands {
            5 => 2, 6 => 2, 7 => 2, 8 => 2, 9 => 2, 10 => 2, 11 => 4, 12 => 6, 13 => 2, 14 => 2,
            15 => 6, 16 => 2, 17 => 2, 18 => 2, 19 => 4, 20 => 2, 21 => 4, 22 => 4, 23 => 4, 24 => 2,
            25 => 6, 26 => 2, 27 => 2, 28 => 2, 29 => 2, 30 => 2, 31 => 2, 32 => 2, 33 => 8, 34 => 8,
            35 => 8, 36 => 4, 37 => 8, 38 => 4, 39 => 6, 40 => 2, 41 => 2, 42 => 2, 43 => 6, 44 => 2,
            45 => 2, 46 => 2, 47 => 2, 48 => 2, 49 => 2,
            _ => if thousands < 5 { 2 } else { 8 }
        }
    } else {
        std::env::var("CLUSTER_SIZE")
            .unwrap_or_else(|_| "8".to_string())
            .parse::<i32>()
            .unwrap_or(8)
    };

    let mut d_hyperedge_clusters = stream.alloc_zeros::<i32>(challenge.num_hyperedges as usize)?;
    let mut d_partition = stream.alloc_zeros::<i32>(challenge.num_nodes as usize)?;
    let mut d_nodes_in_part = stream.alloc_zeros::<i32>(challenge.num_parts as usize)?;

    let mut d_pref_nodes = stream.alloc_zeros::<i32>(challenge.num_nodes as usize)?;
    let mut d_pref_parts = stream.alloc_zeros::<i32>(challenge.num_nodes as usize)?;
    let mut d_pref_gains = stream.alloc_zeros::<i32>(challenge.num_nodes as usize)?;
    let mut d_pref_priorities = stream.alloc_zeros::<i32>(challenge.num_nodes as usize)?;

    let mut d_move_nodes = stream.alloc_zeros::<i32>(challenge.num_nodes as usize)?;
    let mut d_move_parts = stream.alloc_zeros::<i32>(challenge.num_nodes as usize)?;
    let mut d_move_gains = stream.alloc_zeros::<i32>(challenge.num_nodes as usize)?;
    let mut d_move_priorities = stream.alloc_zeros::<i32>(challenge.num_nodes as usize)?;

    // One 3000-slot scratch row per potential thread, matching the kernel's
    // fixed per-node degree budget; usize math avoids u32 overflow on big inputs.
    let num_threads = ((challenge.num_nodes as u32 + block_size - 1) / block_size) * block_size;
    let buffer_size = num_threads as usize * 3000;
    let mut d_global_edge_flags_low = stream.alloc_zeros::<u64>(buffer_size)?;

    let mut d_global_edge_flags_high = if challenge.num_parts > 64 {
        stream.alloc_zeros::<u64>(buffer_size)?
    } else {
        // Dummy allocation: the kernel never touches the high words here.
        stream.alloc_zeros::<u64>(1)?
    };

    unsafe {
        stream.launch_builder(&hyperedge_cluster_kernel)
            .arg(&(challenge.num_hyperedges as i32))
            .arg(&(num_hedge_clusters as i32))
            .arg(&challenge.d_hyperedge_nodes)
            .arg(&challenge.d_hyperedge_offsets)
            .arg(&mut d_hyperedge_clusters)
            .launch(LaunchConfig {
                grid_dim: ((challenge.num_hyperedges as u32 + block_size - 1) / block_size, 1, 1),
                block_dim: (block_size, 1, 1),
                shared_mem_bytes: 0,
            })?;
    }
    stream.synchronize()?;

    unsafe {
        stream.launch_builder(&compute_preferences_kernel)
            .arg(&(challenge.num_nodes as i32))
            .arg(&(challenge.num_parts as i32))
            .arg(&(num_hedge_clusters as i32))
            .arg(&challenge.d_node_hyperedges)
            .arg(&challenge.d_node_offsets)
            .arg(&d_hyperedge_clusters)
            .arg(&challenge.d_hyperedge_offsets)
            .arg(&mut d_pref_nodes)
            .arg(&mut d_pref_parts)
            .arg(&mut d_pref_gains)
            .arg(&mut d_pref_priorities)
            .launch(cfg.clone())?;
    }
    stream.synchronize()?;

    let pref_nodes = stream.memcpy_dtov(&d_pref_nodes)?;
    let pref_parts = stream.memcpy_dtov(&d_pref_parts)?;
    let pref_priorities = stream.memcpy_dtov(&d_pref_priorities)?;

    // Sort node indices by packed priority, descending, on the host.
    let mut indices: Vec<usize> = (0..challenge.num_nodes as usize).collect();
    indices.sort_by(|&a, &b| pref_priorities[b].cmp(&pref_priorities[a]));

    let sorted_nodes: Vec<i32> = indices.iter().map(|&i| pref_nodes[i]).collect();
    let sorted_parts: Vec<i32> = indices.iter().map(|&i| pref_parts[i]).collect();

    let d_sorted_nodes = stream.memcpy_stod(&sorted_nodes)?;
    let d_sorted_parts = stream.memcpy_stod(&sorted_parts)?;

    unsafe {
        stream.launch_builder(&execute_assignments_kernel)
            .arg(&(challenge.num_nodes as i32))
            .arg(&(challenge.num_parts as i32))
            .arg(&(challenge.max_part_size as i32))
            .arg(&d_sorted_nodes)
            .arg(&d_sorted_parts)
            .arg(&mut d_partition)
            .arg(&mut d_nodes_in_part)
            .launch(one_thread_cfg.clone())?;
    }
    stream.synchronize()?;
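From this point the solver alternates device and host work for up to 100 refinement rounds: compute_refinement_moves proposes one move per node on the GPU, the host keeps only the positive-gain proposals and orders them best-first by the packed priority, and execute_refinement_moves applies them serially under the balance constraints. The host-side selection step reduces to a sketch like the following (hypothetical helper over the copied-back arrays, shown for clarity only):

// Filter to positive-gain moves, then order best-first by packed priority.
fn select_moves(nodes: &[i32], parts: &[i32], gains: &[i32], priorities: &[i32]) -> (Vec<i32>, Vec<i32>) {
    let mut picked: Vec<usize> = (0..gains.len()).filter(|&i| gains[i] > 0).collect();
    picked.sort_by(|&a, &b| priorities[b].cmp(&priorities[a]));
    (
        picked.iter().map(|&i| nodes[i]).collect(),
        picked.iter().map(|&i| parts[i]).collect(),
    )
}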
    let mut valid_moves: Vec<(i32, i32, i32)> = Vec::with_capacity(challenge.num_nodes as usize);
    let mut sorted_move_nodes: Vec<i32> = Vec::with_capacity(challenge.num_nodes as usize);
    let mut sorted_move_parts: Vec<i32> = Vec::with_capacity(challenge.num_nodes as usize);

    for round in 0..100 {
        // Fresh counter each round so the early-exit check below sees only
        // this round's proposals rather than an accumulated total.
        let mut d_num_valid_moves = stream.alloc_zeros::<i32>(1)?;

        unsafe {
            stream.launch_builder(&compute_moves_kernel)
                .arg(&(challenge.num_nodes as i32))
                .arg(&(challenge.num_parts as i32))
                .arg(&(challenge.max_part_size as i32))
                .arg(&(challenge.num_hyperedges as i32))
                .arg(&challenge.d_node_hyperedges)
                .arg(&challenge.d_node_offsets)
                .arg(&challenge.d_hyperedge_nodes)
                .arg(&challenge.d_hyperedge_offsets)
                .arg(&d_partition)
                .arg(&d_nodes_in_part)
                .arg(&mut d_move_nodes)
                .arg(&mut d_move_parts)
                .arg(&mut d_move_gains)
                .arg(&mut d_move_priorities)
                .arg(&mut d_num_valid_moves)
                .arg(&round)
                .arg(&mut d_global_edge_flags_low)
                .arg(&mut d_global_edge_flags_high)
                .launch(cfg.clone())?;
        }
        stream.synchronize()?;

        let num_valid_moves = stream.memcpy_dtov(&d_num_valid_moves)?[0];
        if num_valid_moves == 0 {
            break;
        }

        let move_gains = stream.memcpy_dtov(&d_move_gains)?;
        let valid_indices: Vec<usize> = move_gains.iter().enumerate()
            .filter(|(_, &gain)| gain > 0)
            .map(|(i, _)| i)
            .collect();

        if valid_indices.is_empty() {
            break;
        }

        let move_nodes = stream.memcpy_dtov(&d_move_nodes)?;
        let move_parts = stream.memcpy_dtov(&d_move_parts)?;
        let move_priorities = stream.memcpy_dtov(&d_move_priorities)?;

        valid_moves.clear();
        for &i in &valid_indices {
            valid_moves.push((move_nodes[i], move_parts[i], move_priorities[i]));
        }

        // Apply the highest-priority (largest-gain) moves first.
        valid_moves.sort_by(|a, b| b.2.cmp(&a.2));

        sorted_move_nodes.clear();
        sorted_move_parts.clear();
        sorted_move_nodes.extend(valid_moves.iter().map(|&(node, _, _)| node));
        sorted_move_parts.extend(valid_moves.iter().map(|&(_, part, _)| part));

        let d_sorted_move_nodes = stream.memcpy_stod(&sorted_move_nodes)?;
        let d_sorted_move_parts = stream.memcpy_stod(&sorted_move_parts)?;
        let mut d_moves_executed = stream.alloc_zeros::<i32>(1)?;

        unsafe {
            stream.launch_builder(&execute_moves_kernel)
                .arg(&(sorted_move_nodes.len() as i32))
                .arg(&d_sorted_move_nodes)
                .arg(&d_sorted_move_parts)
                .arg(&(challenge.max_part_size as i32))
                .arg(&mut d_partition)
                .arg(&mut d_nodes_in_part)
                .arg(&mut d_moves_executed)
                .launch(one_thread_cfg.clone())?;
        }
        stream.synchronize()?;

        let moves_executed = stream.memcpy_dtov(&d_moves_executed)?[0];
        if moves_executed == 0 {
            break;
        }
    }

    unsafe {
        stream.launch_builder(&balance_kernel)
            .arg(&(challenge.num_nodes as i32))
            .arg(&(challenge.num_parts as i32))
            .arg(&1i32)
            .arg(&(challenge.max_part_size as i32))
            .arg(&mut d_partition)
            .arg(&mut d_nodes_in_part)
            .launch(one_thread_cfg.clone())?;
    }
    stream.synchronize()?;

    let partition = stream.memcpy_dtov(&d_partition)?;
    let partition_u32: Vec<u32> = partition.iter().map(|&x| x as u32).collect();

    save_solution(&Solution { partition: partition_u32 })?;
    Ok(())
}

pub fn help() {
    println!("No help information available.");
}

diff --git a/tig-algorithms/src/hypergraph/mod.rs b/tig-algorithms/src/hypergraph/mod.rs
index 9beec594..e205b449 100644
--- a/tig-algorithms/src/hypergraph/mod.rs
+++ b/tig-algorithms/src/hypergraph/mod.rs
@@ -2,7 +2,8 @@
 
 // c005_a002
 
-// c005_a003
+pub mod hyper_improved;
+pub use hyper_improved as c005_a003;
 
 // c005_a004