Submitted hypergraph/hyper_improved

This commit is contained in:
FiveMovesAhead 2025-12-02 16:54:19 +00:00
parent bdc6ed6794
commit bb2bc7f615
4 changed files with 660 additions and 1 deletion

View File

@ -0,0 +1,23 @@
# TIG Code Submission
## Submission Details
* **Challenge Name:** hypergraph
* **Algorithm Name:** hyper_improved
* **Copyright:** 2025 Rootz
* **Identity of Submitter:** Rootz
* **Identity of Creator of Algorithmic Method:** null
* **Unique Algorithm Identifier (UAI):** null
## License
The files in this folder are under the following licenses:
* TIG Benchmarker Outbound License
* TIG Commercial License
* TIG Inbound Game License
* TIG Innovator Outbound Game License
* TIG Open Data License
* TIG THV Game License
Copies of the licenses can be obtained at:
https://github.com/tig-foundation/tig-monorepo/tree/main/docs/licenses

View File

@ -0,0 +1,395 @@
/*!Copyright 2025 Rootz
Identity of Submitter Rootz
UAI null
Licensed under the TIG Inbound Game License v2.0 or (at your option) any later
version (the "License"); you may not use this file except in compliance with the
License. You may obtain a copy of the License at
https://github.com/tig-foundation/tig-monorepo/tree/main/docs/licenses
Unless required by applicable law or agreed to in writing, software distributed
under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
CONDITIONS OF ANY KIND, either express or implied. See the License for the specific
language governing permissions and limitations under the License.
*/
#include <stdint.h>
#include <cuda_runtime.h>
/*
 * hyperedge_clustering: one thread per hyperedge.
 *
 * Buckets each hyperedge into one of four size bands (<=2, <=4, <=8, >8) and
 * hashes the hyperedge index into that band's quarter of the cluster range,
 * so similarly-sized hyperedges share cluster ids.
 *
 * hyperedge_nodes is unused here but kept for a uniform kernel signature.
 * Launch: 1D grid covering num_hyperedges threads.
 */
extern "C" __global__ void hyperedge_clustering(
    const int num_hyperedges,
    const int num_clusters,
    const int *hyperedge_nodes,
    const int *hyperedge_offsets,
    int *hyperedge_clusters
) {
    int hedge = blockIdx.x * blockDim.x + threadIdx.x;
    if (hedge < num_hyperedges) {
        int start = hyperedge_offsets[hedge];
        int end = hyperedge_offsets[hedge + 1];
        int hedge_size = end - start;
        // Each size band owns a quarter of the cluster id space; the mask
        // spreads hyperedges across that quarter. This layout assumes
        // num_clusters is a multiple of 4 with a power-of-two quarter.
        int quarter_clusters = num_clusters >> 2;
        int cluster_mask = quarter_clusters - 1;
        int cluster;
        if (hedge_size <= 2) {
            cluster = hedge & cluster_mask;
        } else if (hedge_size <= 4) {
            cluster = quarter_clusters + (hedge & cluster_mask);
        } else if (hedge_size <= 8) {
            cluster = (quarter_clusters << 1) + (hedge & cluster_mask);
        } else {
            cluster = (quarter_clusters * 3) + (hedge & cluster_mask);
        }
        // Bug fix: when num_clusters < 4, quarter_clusters is 0 and
        // cluster_mask is -1, so `hedge & cluster_mask` equals the raw
        // hyperedge index and the result can land far outside
        // [0, num_clusters). The host actually requests 2-cluster runs, so
        // clamp out-of-range results to a valid cluster id instead of
        // writing garbage that consumers must filter out.
        if (cluster < 0 || cluster >= num_clusters) {
            cluster = hedge % num_clusters;
        }
        hyperedge_clusters[hedge] = cluster;
    }
}
/*
 * compute_node_preferences: one thread per node.
 *
 * Tallies weighted votes from the clusters of the node's incident
 * hyperedges (small hyperedges vote more strongly), then derives a
 * preferred target partition from the winning cluster and the node's
 * degree, plus a sort priority for the host-side ordering pass.
 *
 * Outputs (all indexed by node): pref_nodes, pref_parts, pref_gains,
 * pref_priorities. Launch: 1D grid covering num_nodes threads.
 */
extern "C" __global__ void compute_node_preferences(
    const int num_nodes,
    const int num_parts,
    const int num_hedge_clusters,
    const int *node_hyperedges,
    const int *node_offsets,
    const int *hyperedge_clusters,
    const int *hyperedge_offsets,
    int *pref_nodes,
    int *pref_parts,
    int *pref_gains,
    int *pref_priorities
) {
    const int node = blockIdx.x * blockDim.x + threadIdx.x;
    if (node >= num_nodes) return;

    const int first = node_offsets[node];
    const int last = node_offsets[node + 1];
    const int degree = last - first;

    // Per-cluster vote tally, capped at 256 buckets of thread-local storage.
    int votes[256];
    const int n_buckets = min(num_hedge_clusters, 256);
    for (int c = 0; c < n_buckets; c++) {
        votes[c] = 0;
    }

    int top_votes = 0;
    int top_cluster = 0;
    for (int idx = first; idx < last; idx++) {
        const int he = node_hyperedges[idx];
        const int c = hyperedge_clusters[he];
        if (c < 0 || c >= n_buckets) continue;
        const int sz = hyperedge_offsets[he + 1] - hyperedge_offsets[he];
        // Smaller hyperedges carry more weight: 4 / 2 / 1.
        const int w = (sz <= 3) ? 4 : (sz <= 6) ? 2 : 1;
        votes[c] += w;
        // Track the running winner; ties prefer the lower cluster index.
        if (votes[c] > top_votes ||
            (votes[c] == top_votes && c < top_cluster)) {
            top_votes = votes[c];
            top_cluster = c;
        }
    }

    // Mix the winning cluster, degree, and node id into a target partition;
    // the formula varies by degree band to spread nodes across parts.
    int target;
    if (degree <= 3) {
        target = (top_cluster + node) % num_parts;
    } else if (degree <= 8) {
        target = (top_cluster + degree + node) % num_parts;
    } else {
        target = (top_cluster * 2 + degree + node) % num_parts;
    }

    pref_nodes[node] = node;
    pref_parts[node] = target;
    pref_gains[node] = top_votes;
    // High half-word: vote strength; low half-word: node-derived tiebreaker.
    pref_priorities[node] = (top_votes << 16) + (num_parts - (node % num_parts));
}
/*
 * execute_node_assignments: serial greedy placement.
 *
 * A single thread walks the host-sorted (nodes, parts) pairs in priority
 * order and places each node into its preferred partition, probing forward
 * with wraparound when the preferred part is full. If every part is at
 * capacity, the node overflows deterministically into node % num_parts.
 *
 * Runs entirely on thread (0,0) so the nodes_in_part counters need no
 * atomics. Launch with a single thread.
 */
extern "C" __global__ void execute_node_assignments(
    const int num_nodes,
    const int num_parts,
    const int max_part_size,
    const int *sorted_nodes,
    const int *sorted_parts,
    int *partition,
    int *nodes_in_part
) {
    if (blockIdx.x != 0 || threadIdx.x != 0) return;
    for (int i = 0; i < num_nodes; i++) {
        const int node = sorted_nodes[i];
        const int want = sorted_parts[i];
        if (node < 0 || node >= num_nodes || want < 0 || want >= num_parts) {
            continue;
        }
        bool placed = false;
        // Linear probe starting at the preferred partition.
        for (int step = 0; step < num_parts; step++) {
            const int p = (want + step) % num_parts;
            if (nodes_in_part[p] < max_part_size) {
                partition[node] = p;
                nodes_in_part[p]++;
                placed = true;
                break;
            }
        }
        if (!placed) {
            // All parts full: overflow past max_part_size by node id.
            const int p = node % num_parts;
            partition[node] = p;
            nodes_in_part[p]++;
        }
    }
}
/*
 * compute_refinement_moves: one thread per node proposes its best
 * single-node move under a connectivity-style (lambda - 1) objective.
 *
 * For each incident hyperedge the thread builds a bitmask of partitions
 * occupied by the hyperedge's OTHER member nodes (low word = parts 0..63;
 * high word used only when num_parts > 64). It then compares the summed
 * (lambda - 1) cost of staying in current_part against moving to each
 * candidate part, adds a small balance bonus, and records the best move.
 *
 * Outputs (indexed by node): move_nodes/move_parts/move_gains/
 * move_priorities; num_valid_moves is incremented via atomicAdd for each
 * node with a strictly positive gain. global_edge_flags_low/high are
 * per-thread scratch with a fixed stride of 3000 entries per node (hence
 * the degree cap below). num_hyperedges is unused in this kernel.
 *
 * NOTE(review): the high word covers parts 64..127 only; for part >= 128
 * `1ULL << (part - 64)` shifts past the word — confirm num_parts <= 128
 * upstream.
 */
extern "C" __global__ void compute_refinement_moves(
const int num_nodes,
const int num_parts,
const int max_part_size,
const int num_hyperedges,
const int *node_hyperedges,
const int *node_offsets,
const int *hyperedge_nodes,
const int *hyperedge_offsets,
const int *partition,
const int *nodes_in_part,
int *move_nodes,
int *move_parts,
int *move_gains,
int *move_priorities,
int *num_valid_moves,
const int round,
unsigned long long *global_edge_flags_low,
unsigned long long *global_edge_flags_high
) {
int node = blockIdx.x * blockDim.x + threadIdx.x;
if (node < num_nodes) {
// Default outputs: "no move" (stay in the current partition, zero gain).
move_nodes[node] = node;
move_parts[node] = partition[node];
move_gains[node] = 0;
move_priorities[node] = 0;
int current_part = partition[node];
// Skip nodes with an invalid partition or whose departure would empty it.
if (current_part < 0 || current_part >= num_parts || nodes_in_part[current_part] <= 1) return;
int start = node_offsets[node];
int end = node_offsets[node + 1];
int node_degree = end - start;
// Degree cap matches the fixed 3000-entry scratch stride per node.
if (node_degree > 3000) return;
// Only use the high bitmask word when parts don't fit in 64 bits.
bool use_dual_buffer = (num_parts > 64);
unsigned long long *edge_flags_low = &global_edge_flags_low[node * 3000];
unsigned long long *edge_flags_high = &global_edge_flags_high[node * 3000];
// Pass 1: for each incident hyperedge, record which partitions its other
// member nodes currently occupy.
for (int j = 0; j < node_degree; j++) {
edge_flags_low[j] = 0;
if (use_dual_buffer) {
edge_flags_high[j] = 0;
}
int hyperedge = node_hyperedges[start + j];
int hedge_start = hyperedge_offsets[hyperedge];
int hedge_end = hyperedge_offsets[hyperedge + 1];
for (int k = hedge_start; k < hedge_end; k++) {
int other_node = hyperedge_nodes[k];
if (other_node != node && other_node >= 0 && other_node < num_nodes) {
int part = partition[other_node];
if (part >= 0 && part < num_parts) {
if (use_dual_buffer) {
if (part < 64) {
edge_flags_low[j] |= 1ULL << part;
} else {
edge_flags_high[j] |= 1ULL << (part - 64);
}
} else {
if (part < min(num_parts, 64)) {
edge_flags_low[j] |= 1ULL << part;
}
}
}
}
}
}
// Baseline cost: sum of (lambda - 1) over incident hyperedges, where
// lambda counts the distinct partitions touched including current_part.
int original_cost = 0;
for (int j = 0; j < node_degree; j++) {
unsigned long long current_low = edge_flags_low[j];
unsigned long long current_high = use_dual_buffer ? edge_flags_high[j] : 0;
int lambda;
if (use_dual_buffer) {
if (current_part < 64) {
current_low |= 1ULL << current_part;
} else {
current_high |= 1ULL << (current_part - 64);
}
lambda = __popcll(current_low) + __popcll(current_high);
} else {
lambda = __popcll(current_low | (1ULL << current_part));
}
if (lambda > 1) {
original_cost += (lambda - 1);
}
}
int best_gain = 0;
int best_target = current_part;
// Pass 2: evaluate every other partition as a move target. The scan order
// is rotated by (node + round) so different rounds favor different parts.
for (int offset = 0; offset < num_parts; offset++) {
int target_part = (node + round + offset) % num_parts;
if (target_part == current_part) continue;
if (target_part < 0 || target_part >= num_parts) continue;
// Never propose a move into a full partition.
if (nodes_in_part[target_part] >= max_part_size) continue;
// Cost if the node moved to target_part: same (lambda - 1) sum, with
// the node's own contribution switched to the candidate partition.
int new_cost = 0;
for (int j = 0; j < node_degree; j++) {
unsigned long long target_low = edge_flags_low[j];
unsigned long long target_high = use_dual_buffer ? edge_flags_high[j] : 0;
int lambda;
if (use_dual_buffer) {
if (target_part < 64) {
target_low |= 1ULL << target_part;
} else {
target_high |= 1ULL << (target_part - 64);
}
lambda = __popcll(target_low) + __popcll(target_high);
} else {
lambda = __popcll(target_low | (1ULL << target_part));
}
if (lambda > 1) {
new_cost += (lambda - 1);
}
}
int basic_gain = original_cost - new_cost;
// Small incentive to move from larger into smaller partitions; the
// bonus shrinks as num_parts grows.
int current_size = nodes_in_part[current_part];
int target_size = nodes_in_part[target_part];
int balance_bonus = 0;
if (current_size > target_size + 1) {
if (num_parts >= 120) {
balance_bonus = 2;
} else if (num_parts >= 100) {
balance_bonus = 3;
} else {
balance_bonus = 4;
}
}
int total_gain = basic_gain + balance_bonus;
// Ties prefer the lower-numbered target partition.
if (total_gain > best_gain ||
(total_gain == best_gain && target_part < best_target)) {
best_gain = total_gain;
best_target = target_part;
}
}
// Publish only strictly-improving moves; count them for the host.
if (best_gain > 0 && best_target != current_part) {
move_parts[node] = best_target;
move_gains[node] = best_gain;
move_priorities[node] = (best_gain << 16) + (num_parts - (node % num_parts));
atomicAdd(num_valid_moves, 1);
}
}
}
/*
 * execute_refinement_moves: serial application of host-sorted moves.
 *
 * A single thread applies each proposed (node, target) move in priority
 * order, re-validating against the live partition-size counters because
 * earlier moves in the same pass may have filled the destination or
 * drained the source partition. moves_executed counts applied moves.
 *
 * Launch with a single thread; counters are updated without atomics.
 */
extern "C" __global__ void execute_refinement_moves(
    const int num_valid_moves,
    const int *sorted_nodes,
    const int *sorted_parts,
    const int max_part_size,
    int *partition,
    int *nodes_in_part
) {
    if (blockIdx.x != 0 || threadIdx.x != 0) return;
    for (int i = 0; i < num_valid_moves; i++) {
        const int node = sorted_nodes[i];
        const int dest = sorted_parts[i];
        if (node < 0 || dest < 0) continue;
        const int src = partition[node];
        // Apply only if the destination has room and the source keeps at
        // least one node after the move.
        if (src >= 0 &&
            nodes_in_part[dest] < max_part_size &&
            nodes_in_part[src] > 1 &&
            partition[node] == src) {
            partition[node] = dest;
            nodes_in_part[src]--;
            nodes_in_part[dest]++;
            (*moves_executed)++;
        }
    }
}
/*
 * balance_final: serial repair pass enforcing partition-size bounds.
 *
 * Phase 1 pulls nodes into partitions below min_part_size from any donor
 * partition that stays above the minimum; phase 2 pushes nodes out of
 * partitions above max_part_size into any partition with room. Each phase
 * gives up on a partition when no legal donor/receiver remains.
 *
 * Launch with a single thread; counters are updated without atomics.
 */
extern "C" __global__ void balance_final(
    const int num_nodes,
    const int num_parts,
    const int min_part_size,
    const int max_part_size,
    int *partition,
    int *nodes_in_part
) {
    if (blockIdx.x != 0 || threadIdx.x != 0) return;
    // Phase 1: fill undersized partitions.
    for (int part = 0; part < num_parts; part++) {
        while (nodes_in_part[part] < min_part_size) {
            bool moved = false;
            for (int donor = 0; donor < num_parts && !moved; donor++) {
                if (donor == part) continue;
                if (nodes_in_part[donor] <= min_part_size) continue;
                // Take the first node found in the donor partition.
                for (int node = 0; node < num_nodes; node++) {
                    if (partition[node] == donor) {
                        partition[node] = part;
                        nodes_in_part[donor]--;
                        nodes_in_part[part]++;
                        moved = true;
                        break;
                    }
                }
            }
            if (!moved) break;
        }
    }
    // Phase 2: drain oversized partitions.
    for (int part = 0; part < num_parts; part++) {
        while (nodes_in_part[part] > max_part_size) {
            bool moved = false;
            for (int receiver = 0; receiver < num_parts && !moved; receiver++) {
                if (receiver == part) continue;
                if (nodes_in_part[receiver] >= max_part_size) continue;
                // Evict the first node found in the oversized partition.
                for (int node = 0; node < num_nodes; node++) {
                    if (partition[node] == part) {
                        partition[node] = receiver;
                        nodes_in_part[part]--;
                        nodes_in_part[receiver]++;
                        moved = true;
                        break;
                    }
                }
            }
            if (!moved) break;
        }
    }
}

View File

@ -0,0 +1,240 @@
use cudarc::{
driver::{safe::LaunchConfig, CudaModule, CudaStream, PushKernelArg},
runtime::sys::cudaDeviceProp,
};
use std::sync::Arc;
use serde_json::{Map, Value};
use tig_challenges::hypergraph::*;
/// GPU hypergraph partitioner.
///
/// Pipeline:
/// 1. `hyperedge_clustering` buckets hyperedges by size into clusters.
/// 2. `compute_node_preferences` votes each node into a preferred partition.
/// 3. Host sorts nodes by priority; `execute_node_assignments` places them.
/// 4. Up to 100 rounds of `compute_refinement_moves` (parallel gain
///    computation) + host sort + `execute_refinement_moves` (serial apply).
/// 5. `balance_final` enforces partition-size bounds before saving.
///
/// `hyperparameters` is accepted for interface compatibility but unused.
pub fn solve_challenge(
    challenge: &Challenge,
    save_solution: &dyn Fn(&Solution) -> anyhow::Result<()>,
    hyperparameters: &Option<Map<String, Value>>,
    module: Arc<CudaModule>,
    stream: Arc<CudaStream>,
    prop: &cudaDeviceProp,
) -> anyhow::Result<()> {
    let _ = hyperparameters; // unused by this algorithm
    let block_size = std::cmp::min(256, prop.maxThreadsPerBlock as u32);
    let hyperedge_cluster_kernel = module.load_function("hyperedge_clustering")?;
    let compute_preferences_kernel = module.load_function("compute_node_preferences")?;
    let execute_assignments_kernel = module.load_function("execute_node_assignments")?;
    let compute_moves_kernel = module.load_function("compute_refinement_moves")?;
    let execute_moves_kernel = module.load_function("execute_refinement_moves")?;
    let balance_kernel = module.load_function("balance_final")?;
    // One thread per node for the parallel kernels.
    let cfg = LaunchConfig {
        grid_dim: ((challenge.num_nodes as u32 + block_size - 1) / block_size, 1, 1),
        block_dim: (block_size, 1, 1),
        shared_mem_bytes: 0,
    };
    // The assignment/apply/balance kernels are intentionally serial.
    let one_thread_cfg = LaunchConfig {
        grid_dim: (1, 1, 1),
        block_dim: (1, 1, 1),
        shared_mem_bytes: 0,
    };
    // Tuned cluster counts per problem size (keyed by hyperedges / 1000);
    // larger instances fall back to an env-var override (default 8).
    let num_hedge_clusters = if challenge.num_hyperedges < 50000 {
        let thousands = challenge.num_hyperedges / 1000;
        match thousands {
            5 => 2, 6 => 2, 7 => 2, 8 => 2, 9 => 2, 10 => 2, 11 => 4, 12 => 6, 13 => 2, 14 => 2,
            15 => 6, 16 => 2, 17 => 2, 18 => 2, 19 => 4, 20 => 2, 21 => 4, 22 => 4, 23 => 4, 24 => 2,
            25 => 6, 26 => 2, 27 => 2, 28 => 2, 29 => 2, 30 => 2, 31 => 2, 32 => 2, 33 => 8, 34 => 8,
            35 => 8, 36 => 4, 37 => 8, 38 => 4, 39 => 6, 40 => 2, 41 => 2, 42 => 2, 43 => 6, 44 => 2,
            45 => 2, 46 => 2, 47 => 2, 48 => 2, 49 => 2,
            _ => if thousands < 5 { 2 } else { 8 }
        }
    } else {
        std::env::var("CLUSTER_SIZE").unwrap_or_else(|_| "8".to_string()).parse::<usize>().unwrap_or(8)
    };
    // Device buffers (zero-initialized).
    let mut d_hyperedge_clusters = stream.alloc_zeros::<i32>(challenge.num_hyperedges as usize)?;
    let mut d_partition = stream.alloc_zeros::<i32>(challenge.num_nodes as usize)?;
    let mut d_nodes_in_part = stream.alloc_zeros::<i32>(challenge.num_parts as usize)?;
    let mut d_pref_nodes = stream.alloc_zeros::<i32>(challenge.num_nodes as usize)?;
    let mut d_pref_parts = stream.alloc_zeros::<i32>(challenge.num_nodes as usize)?;
    let mut d_pref_gains = stream.alloc_zeros::<i32>(challenge.num_nodes as usize)?;
    let mut d_pref_priorities = stream.alloc_zeros::<i32>(challenge.num_nodes as usize)?;
    let mut d_move_nodes = stream.alloc_zeros::<i32>(challenge.num_nodes as usize)?;
    let mut d_move_parts = stream.alloc_zeros::<i32>(challenge.num_nodes as usize)?;
    let mut d_move_gains = stream.alloc_zeros::<i32>(challenge.num_nodes as usize)?;
    let mut d_move_priorities = stream.alloc_zeros::<i32>(challenge.num_nodes as usize)?;
    // Per-thread scratch for compute_refinement_moves: fixed stride of 3000
    // u64 flags per node (must match the kernel's degree cap).
    let num_threads = ((challenge.num_nodes as u32 + block_size - 1) / block_size) * block_size;
    let buffer_size = (num_threads * 3000) as usize;
    let mut d_global_edge_flags_low = stream.alloc_zeros::<u64>(buffer_size)?;
    // The high word is only read when num_parts > 64; keep a 1-element
    // placeholder otherwise so the kernel still receives a valid pointer.
    let mut d_global_edge_flags_high = if challenge.num_parts > 64 {
        stream.alloc_zeros::<u64>(buffer_size)?
    } else {
        stream.alloc_zeros::<u64>(1)?
    };
    // Stage 1: cluster hyperedges by size (one thread per hyperedge).
    unsafe {
        stream.launch_builder(&hyperedge_cluster_kernel)
            .arg(&(challenge.num_hyperedges as i32))
            .arg(&(num_hedge_clusters as i32))
            .arg(&challenge.d_hyperedge_nodes)
            .arg(&challenge.d_hyperedge_offsets)
            .arg(&mut d_hyperedge_clusters)
            .launch(LaunchConfig {
                grid_dim: ((challenge.num_hyperedges as u32 + block_size - 1) / block_size, 1, 1),
                block_dim: (block_size, 1, 1),
                shared_mem_bytes: 0,
            })?;
    }
    stream.synchronize()?;
    // Stage 2: compute per-node partition preferences.
    unsafe {
        stream.launch_builder(&compute_preferences_kernel)
            .arg(&(challenge.num_nodes as i32))
            .arg(&(challenge.num_parts as i32))
            .arg(&(num_hedge_clusters as i32))
            .arg(&challenge.d_node_hyperedges)
            .arg(&challenge.d_node_offsets)
            .arg(&d_hyperedge_clusters)
            .arg(&challenge.d_hyperedge_offsets)
            .arg(&mut d_pref_nodes)
            .arg(&mut d_pref_parts)
            .arg(&mut d_pref_gains)
            .arg(&mut d_pref_priorities)
            .launch(cfg.clone())?;
    }
    stream.synchronize()?;
    // Stage 3: sort nodes by descending priority on the host, then assign.
    let pref_nodes = stream.memcpy_dtov(&d_pref_nodes)?;
    let pref_parts = stream.memcpy_dtov(&d_pref_parts)?;
    let pref_priorities = stream.memcpy_dtov(&d_pref_priorities)?;
    let mut indices: Vec<usize> = (0..challenge.num_nodes as usize).collect();
    indices.sort_by(|&a, &b| pref_priorities[b].cmp(&pref_priorities[a]));
    let sorted_nodes: Vec<i32> = indices.iter().map(|&i| pref_nodes[i]).collect();
    let sorted_parts: Vec<i32> = indices.iter().map(|&i| pref_parts[i]).collect();
    let d_sorted_nodes = stream.memcpy_stod(&sorted_nodes)?;
    let d_sorted_parts = stream.memcpy_stod(&sorted_parts)?;
    unsafe {
        stream.launch_builder(&execute_assignments_kernel)
            .arg(&(challenge.num_nodes as i32))
            .arg(&(challenge.num_parts as i32))
            .arg(&(challenge.max_part_size as i32))
            .arg(&d_sorted_nodes)
            .arg(&d_sorted_parts)
            .arg(&mut d_partition)
            .arg(&mut d_nodes_in_part)
            .launch(one_thread_cfg.clone())?;
    }
    stream.synchronize()?;
    // Stage 4: refinement rounds. Reused host-side buffers.
    let mut valid_moves: Vec<(i32, i32, i32)> = Vec::with_capacity(challenge.num_nodes as usize);
    let mut sorted_move_nodes: Vec<i32> = Vec::with_capacity(challenge.num_nodes as usize);
    let mut sorted_move_parts: Vec<i32> = Vec::with_capacity(challenge.num_nodes as usize);
    for round in 0..100 {
        // Bug fix: the kernel accumulates into this counter with atomicAdd,
        // so it must start at zero every round. Previously one buffer was
        // allocated outside the loop and never reset, which made the
        // `num_valid_moves == 0` early-exit unreachable after the first
        // productive round.
        let mut d_num_valid_moves = stream.alloc_zeros::<i32>(1)?;
        unsafe {
            stream.launch_builder(&compute_moves_kernel)
                .arg(&(challenge.num_nodes as i32))
                .arg(&(challenge.num_parts as i32))
                .arg(&(challenge.max_part_size as i32))
                .arg(&(challenge.num_hyperedges as i32))
                .arg(&challenge.d_node_hyperedges)
                .arg(&challenge.d_node_offsets)
                .arg(&challenge.d_hyperedge_nodes)
                .arg(&challenge.d_hyperedge_offsets)
                .arg(&d_partition)
                .arg(&d_nodes_in_part)
                .arg(&mut d_move_nodes)
                .arg(&mut d_move_parts)
                .arg(&mut d_move_gains)
                .arg(&mut d_move_priorities)
                .arg(&mut d_num_valid_moves)
                .arg(&round)
                .arg(&mut d_global_edge_flags_low)
                .arg(&mut d_global_edge_flags_high)
                .launch(cfg.clone())?;
        }
        stream.synchronize()?;
        let num_valid_moves = stream.memcpy_dtov(&d_num_valid_moves)?[0];
        if num_valid_moves == 0 {
            break;
        }
        // Gather strictly-improving moves on the host.
        let move_gains = stream.memcpy_dtov(&d_move_gains)?;
        let valid_indices: Vec<usize> = move_gains.iter().enumerate()
            .filter(|(_, &gain)| gain > 0)
            .map(|(i, _)| i)
            .collect();
        if valid_indices.is_empty() {
            break;
        }
        let move_nodes = stream.memcpy_dtov(&d_move_nodes)?;
        let move_parts = stream.memcpy_dtov(&d_move_parts)?;
        let move_priorities = stream.memcpy_dtov(&d_move_priorities)?;
        valid_moves.clear();
        for &i in &valid_indices {
            valid_moves.push((move_nodes[i], move_parts[i], move_priorities[i]));
        }
        // Apply highest-priority moves first.
        valid_moves.sort_by(|a, b| b.2.cmp(&a.2));
        sorted_move_nodes.clear();
        sorted_move_parts.clear();
        sorted_move_nodes.extend(valid_moves.iter().map(|&(node, _, _)| node));
        sorted_move_parts.extend(valid_moves.iter().map(|&(_, part, _)| part));
        let d_sorted_move_nodes = stream.memcpy_stod(&sorted_move_nodes)?;
        let d_sorted_move_parts = stream.memcpy_stod(&sorted_move_parts)?;
        let mut d_moves_executed = stream.alloc_zeros::<i32>(1)?;
        unsafe {
            stream.launch_builder(&execute_moves_kernel)
                .arg(&(sorted_move_nodes.len() as i32))
                .arg(&d_sorted_move_nodes)
                .arg(&d_sorted_move_parts)
                .arg(&(challenge.max_part_size as i32))
                .arg(&mut d_partition)
                .arg(&mut d_nodes_in_part)
                .arg(&mut d_moves_executed)
                .launch(one_thread_cfg.clone())?;
        }
        stream.synchronize()?;
        let moves_executed = stream.memcpy_dtov(&d_moves_executed)?[0];
        if moves_executed == 0 {
            break;
        }
    }
    // Stage 5: enforce a minimum part size of 1 and the maximum bound.
    unsafe {
        stream.launch_builder(&balance_kernel)
            .arg(&(challenge.num_nodes as i32))
            .arg(&(challenge.num_parts as i32))
            .arg(&1)
            .arg(&(challenge.max_part_size as i32))
            .arg(&mut d_partition)
            .arg(&mut d_nodes_in_part)
            .launch(one_thread_cfg.clone())?;
    }
    stream.synchronize()?;
    let partition = stream.memcpy_dtov(&d_partition)?;
    let partition_u32: Vec<u32> = partition.iter().map(|&x| x as u32).collect();
    let _ = save_solution(&Solution { partition: partition_u32 });
    Ok(())
}
/// Prints usage information for this algorithm (none is provided).
pub fn help() {
    let message = "No help information available.";
    println!("{}", message);
}

View File

@ -2,7 +2,8 @@
// c005_a002
// c005_a003
pub mod hyper_improved;
pub use hyper_improved as c005_a003;
// c005_a004