"Player 0x1884f0fe1208f38a1433b706790e22be9fed1f7c submitted code hypergraph/sigma_freud"

This commit is contained in:
0x1884f0fe1208f38a1433b706790e22be9fed1f7c 2025-12-12 20:02:37 +00:00
parent bd333f8f72
commit 9a3be5c13d
4 changed files with 693 additions and 1 deletion

View File

@@ -16,7 +16,8 @@
// c005_a009
// c005_a010
pub mod sigma_freud;
pub use sigma_freud as c005_a010;
// c005_a011

View File

@@ -0,0 +1,23 @@
# TIG Code Submission
## Submission Details
* **Challenge Name:** hypergraph
* **Algorithm Name:** sigma_freud
* **Copyright:** 2025 Rootz
* **Identity of Submitter:** Rootz
* **Identity of Creator of Algorithmic Method:** null
* **Unique Algorithm Identifier (UAI):** null
## License
The files in this folder are under the following licenses:
* TIG Benchmarker Outbound License
* TIG Commercial License
* TIG Inbound Game License
* TIG Innovator Outbound Game License
* TIG Open Data License
* TIG THV Game License
Copies of the licenses can be obtained at:
https://github.com/tig-foundation/tig-monorepo/tree/main/docs/licenses

View File

@@ -0,0 +1,345 @@
use cudarc::{
driver::{safe::LaunchConfig, CudaModule, CudaStream, PushKernelArg},
runtime::sys::cudaDeviceProp,
};
use std::sync::Arc;
use serde_json::{Map, Value};
use tig_challenges::hypergraph::*;
pub fn help() {
println!("Hypergraph Partitioning Algorithm");
println!("Adaptive clustering with GPU-accelerated refinement");
println!();
println!("Hyperparameters:");
println!(" refinement - Number of refinement rounds (default: 130, range: 50-300)");
println!(" clusters - Number of hyperedge clusters (default: 64, range: 4-256)");
println!(" 64 provides 1:1 mapping with 64-way partitioning");
println!();
println!("Benchmarker Configuration Examples:");
println!(" \"refinement\": 80 # Fast - 19% faster, -7% quality");
println!(" \"refinement\": 100 # Balanced - 14% faster, -4% quality");
println!(" \"refinement\": 130 # Default - optimal quality/speed (recommended)");
println!(" \"refinement\": 160 # Quality - 5% slower, +3% quality");
println!(" \"refinement\": 300 # Maximum - 21% slower, +6% quality");
println!();
println!(" \"clusters\": 64 # Default (recommended for most cases)");
println!(" \"clusters\": 128 # Alternative (may improve quality for large problems)");
println!();
println!("Usage:");
println!(" Set the 'refinement' parameter in your benchmarker config");
println!(" to balance between solution quality and runtime.");
println!(" 'clusters' can be tuned for specific problem sizes but 64 is competitive overall.");
}
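// Illustrative benchmarker config sketch (assumption: the surrounding schema is
// the benchmarker's own; this algorithm itself only reads the "refinement" and
// "clusters" keys from the hyperparameters map passed to solve_challenge):
//   { "refinement": 130, "clusters": 64 }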
pub fn solve_challenge(
challenge: &Challenge,
save_solution: &dyn Fn(&Solution) -> anyhow::Result<()>,
hyperparameters: &Option<Map<String, Value>>,
module: Arc<CudaModule>,
stream: Arc<CudaStream>,
prop: &cudaDeviceProp,
) -> anyhow::Result<()> {
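// Save a trivial round-robin partition immediately so a valid solution
// exists even if the GPU pipeline below fails or is cut short.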
let dummy_partition: Vec<u32> = (0..challenge.num_nodes as u32)
.map(|i| i % challenge.num_parts as u32)
.collect();
save_solution(&Solution { partition: dummy_partition })?;
let block_size = std::cmp::min(128, prop.maxThreadsPerBlock as u32);
let hyperedge_cluster_kernel = module.load_function("hyperedge_clustering")?;
let compute_preferences_kernel = module.load_function("compute_node_preferences")?;
let execute_assignments_kernel = module.load_function("execute_node_assignments")?;
let compute_moves_kernel = module.load_function("compute_refinement_moves")?;
let execute_moves_kernel = module.load_function("execute_refinement_moves")?;
let balance_kernel = module.load_function("balance_final")?;
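// Two launch shapes: one thread per node (or hyperedge) for the parallel
// kernels, and a single thread for the serial assign/execute/balance kernels.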
let cfg = LaunchConfig {
grid_dim: ((challenge.num_nodes as u32 + block_size - 1) / block_size, 1, 1),
block_dim: (block_size, 1, 1),
shared_mem_bytes: 0,
};
let one_thread_cfg = LaunchConfig {
grid_dim: (1, 1, 1),
block_dim: (1, 1, 1),
shared_mem_bytes: 0,
};
let mut num_hedge_clusters = if let Some(params) = hyperparameters {
params.get("clusters")
.and_then(|v| v.as_i64())
.map(|v| v.clamp(4, 256) as i32)
.unwrap_or(64)
} else {
64
};
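// Round the cluster count up to a multiple of 4: the clustering kernel splits
// clusters into 4 hyperedge-size buckets. Note its bitmask only stripes
// hyperedges uniformly when clusters/4 is a power of two (true for the 64 and
// 128 settings suggested in help()).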
if num_hedge_clusters % 4 != 0 {
num_hedge_clusters += 4 - (num_hedge_clusters % 4);
}
let mut d_hyperedge_clusters = stream.alloc_zeros::<i32>(challenge.num_hyperedges as usize)?;
let mut d_partition = stream.alloc_zeros::<i32>(challenge.num_nodes as usize)?;
let mut d_nodes_in_part = stream.alloc_zeros::<i32>(challenge.num_parts as usize)?;
let mut d_pref_parts = stream.alloc_zeros::<i32>(challenge.num_nodes as usize)?;
let mut d_pref_priorities = stream.alloc_zeros::<i32>(challenge.num_nodes as usize)?;
let mut d_move_parts = stream.alloc_zeros::<i32>(challenge.num_nodes as usize)?;
let mut d_move_priorities = stream.alloc_zeros::<i32>(challenge.num_nodes as usize)?;
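// Scratch buffer: one 64-bit partition-presence mask per incident hyperedge,
// for up to 1024 hyperedges per node (matches the refinement kernel's cap).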
let buffer_size = (challenge.num_nodes as usize) * 1024;
let mut d_global_edge_flags = stream.alloc_zeros::<u64>(buffer_size)?;
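// Smaller instances get more refinement rounds by default; overridable below.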
let default_refinement = if challenge.num_hyperedges < 20_000 {
170usize
} else if challenge.num_hyperedges < 100_000 {
150usize
} else {
130usize
};
let refinement_rounds = if let Some(params) = hyperparameters {
params.get("refinement")
.and_then(|v| v.as_i64())
.map(|v| v.clamp(50, 300) as usize)
.unwrap_or(default_refinement)
} else {
default_refinement
};
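// Stage 1: bucket hyperedges by size and stripe them across clusters.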
unsafe {
stream.launch_builder(&hyperedge_cluster_kernel)
.arg(&(challenge.num_hyperedges as i32))
.arg(&(num_hedge_clusters as i32))
.arg(&challenge.d_hyperedge_offsets)
.arg(&mut d_hyperedge_clusters)
.launch(LaunchConfig {
grid_dim: ((challenge.num_hyperedges as u32 + block_size - 1) / block_size, 1, 1),
block_dim: (block_size, 1, 1),
shared_mem_bytes: 0,
})?;
}
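// Stage 2: each node votes for its dominant hyperedge cluster and derives
// a preferred partition plus an assignment priority.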
unsafe {
stream.launch_builder(&compute_preferences_kernel)
.arg(&(challenge.num_nodes as i32))
.arg(&(challenge.num_parts as i32))
.arg(&(num_hedge_clusters as i32))
.arg(&challenge.d_node_hyperedges)
.arg(&challenge.d_node_offsets)
.arg(&d_hyperedge_clusters)
.arg(&challenge.d_hyperedge_offsets)
.arg(&mut d_pref_parts)
.arg(&mut d_pref_priorities)
.launch(cfg.clone())?;
}
stream.synchronize()?;
let pref_parts = stream.memcpy_dtov(&d_pref_parts)?;
let pref_priorities = stream.memcpy_dtov(&d_pref_priorities)?;
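// Sort nodes by descending priority on the host; high-priority nodes get
// first pick of a partition.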
let mut indices: Vec<usize> = (0..challenge.num_nodes as usize).collect();
indices.sort_unstable_by(|&a, &b| pref_priorities[b].cmp(&pref_priorities[a]));
let sorted_nodes: Vec<i32> = indices.iter().map(|&i| i as i32).collect();
let sorted_parts: Vec<i32> = indices.iter().map(|&i| pref_parts[i]).collect();
let d_sorted_nodes = stream.memcpy_stod(&sorted_nodes)?;
let d_sorted_parts = stream.memcpy_stod(&sorted_parts)?;
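// Stage 3: serial greedy assignment honoring max_part_size, probing
// subsequent partitions (then a modulo fallback) when the preferred one is full.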
unsafe {
stream.launch_builder(&execute_assignments_kernel)
.arg(&(challenge.num_nodes as i32))
.arg(&(challenge.num_parts as i32))
.arg(&(challenge.max_part_size as i32))
.arg(&d_sorted_nodes)
.arg(&d_sorted_parts)
.arg(&mut d_partition)
.arg(&mut d_nodes_in_part)
.launch(one_thread_cfg.clone())?;
}
stream.synchronize()?;
let mut sorted_move_nodes: Vec<i32> = Vec::with_capacity(challenge.num_nodes as usize);
let mut sorted_move_parts: Vec<i32> = Vec::with_capacity(challenge.num_nodes as usize);
let mut valid_indices: Vec<usize> = Vec::with_capacity(challenge.num_nodes as usize);
let mut stagnant_rounds = 0;
let early_exit_round = if challenge.num_hyperedges < 20_000 { 90 } else { 70 };
let max_stagnant_rounds = if challenge.num_hyperedges < 20_000 { 30 } else { 20 };
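// Main refinement loop: propose gain-positive moves in parallel, apply them
// serially in priority order. Zero proposed or executed moves exits at once;
// past the early-exit round, rounds that execute only a single move count
// toward a stagnation limit.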
for round in 0..refinement_rounds {
let zero = vec![0i32];
let mut d_num_valid_moves = stream.memcpy_stod(&zero)?;
unsafe {
stream.launch_builder(&compute_moves_kernel)
.arg(&(challenge.num_nodes as i32))
.arg(&(challenge.num_parts as i32))
.arg(&(challenge.max_part_size as i32))
.arg(&challenge.d_node_hyperedges)
.arg(&challenge.d_node_offsets)
.arg(&challenge.d_hyperedge_nodes)
.arg(&challenge.d_hyperedge_offsets)
.arg(&d_partition)
.arg(&d_nodes_in_part)
.arg(&mut d_move_parts)
.arg(&mut d_move_priorities)
.arg(&mut d_num_valid_moves)
.arg(&mut d_global_edge_flags)
.launch(cfg.clone())?;
}
stream.synchronize()?;
let num_valid_moves = stream.memcpy_dtov(&d_num_valid_moves)?[0];
if num_valid_moves == 0 {
break;
}
let move_priorities = stream.memcpy_dtov(&d_move_priorities)?;
valid_indices.clear();
valid_indices.extend(
move_priorities
.iter()
.enumerate()
.filter(|(_, &priority)| priority > 0)
.map(|(i, _)| i),
);
if valid_indices.is_empty() {
break;
}
let move_parts = stream.memcpy_dtov(&d_move_parts)?;
valid_indices.sort_unstable_by(|&a, &b| move_priorities[b].cmp(&move_priorities[a]));
sorted_move_nodes.clear();
sorted_move_parts.clear();
sorted_move_nodes.extend(valid_indices.iter().map(|&i| i as i32));
sorted_move_parts.extend(valid_indices.iter().map(|&i| move_parts[i]));
let d_sorted_move_nodes = stream.memcpy_stod(&sorted_move_nodes)?;
let d_sorted_move_parts = stream.memcpy_stod(&sorted_move_parts)?;
let mut d_moves_executed = stream.alloc_zeros::<i32>(1)?;
unsafe {
stream.launch_builder(&execute_moves_kernel)
.arg(&(sorted_move_nodes.len() as i32))
.arg(&d_sorted_move_nodes)
.arg(&d_sorted_move_parts)
.arg(&(challenge.max_part_size as i32))
.arg(&mut d_partition)
.arg(&mut d_nodes_in_part)
.arg(&mut d_moves_executed)
.launch(one_thread_cfg.clone())?;
}
stream.synchronize()?;
let moves_executed = stream.memcpy_dtov(&d_moves_executed)?[0];
if moves_executed == 0 {
break;
}
if moves_executed == 1 && round > early_exit_round {
stagnant_rounds += 1;
if stagnant_rounds > max_stagnant_rounds {
break;
}
} else {
stagnant_rounds = 0;
}
}
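// Force all partition sizes back within the allowed bounds after refinement.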
unsafe {
stream.launch_builder(&balance_kernel)
.arg(&(challenge.num_nodes as i32))
.arg(&(challenge.num_parts as i32))
.arg(&1)
.arg(&(challenge.max_part_size as i32))
.arg(&mut d_partition)
.arg(&mut d_nodes_in_part)
.launch(one_thread_cfg.clone())?;
}
stream.synchronize()?;
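// Up to 24 extra refinement rounds to recover quality lost to forced balancing.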
for _ in 0..24 {
let zero = vec![0i32];
let mut d_num_valid_moves = stream.memcpy_stod(&zero)?;
unsafe {
stream.launch_builder(&compute_moves_kernel)
.arg(&(challenge.num_nodes as i32))
.arg(&(challenge.num_parts as i32))
.arg(&(challenge.max_part_size as i32))
.arg(&challenge.d_node_hyperedges)
.arg(&challenge.d_node_offsets)
.arg(&challenge.d_hyperedge_nodes)
.arg(&challenge.d_hyperedge_offsets)
.arg(&d_partition)
.arg(&d_nodes_in_part)
.arg(&mut d_move_parts)
.arg(&mut d_move_priorities)
.arg(&mut d_num_valid_moves)
.arg(&mut d_global_edge_flags)
.launch(cfg.clone())?;
}
stream.synchronize()?;
let num_valid_moves = stream.memcpy_dtov(&d_num_valid_moves)?[0];
if num_valid_moves == 0 {
break;
}
let move_priorities = stream.memcpy_dtov(&d_move_priorities)?;
valid_indices.clear();
valid_indices.extend(
move_priorities
.iter()
.enumerate()
.filter(|(_, &priority)| priority > 0)
.map(|(i, _)| i),
);
if valid_indices.is_empty() {
break;
}
let move_parts = stream.memcpy_dtov(&d_move_parts)?;
valid_indices.sort_unstable_by(|&a, &b| move_priorities[b].cmp(&move_priorities[a]));
sorted_move_nodes.clear();
sorted_move_parts.clear();
sorted_move_nodes.extend(valid_indices.iter().map(|&i| i as i32));
sorted_move_parts.extend(valid_indices.iter().map(|&i| move_parts[i]));
let d_sorted_move_nodes = stream.memcpy_stod(&sorted_move_nodes)?;
let d_sorted_move_parts = stream.memcpy_stod(&sorted_move_parts)?;
let mut d_moves_executed = stream.alloc_zeros::<i32>(1)?;
unsafe {
stream.launch_builder(&execute_moves_kernel)
.arg(&(sorted_move_nodes.len() as i32))
.arg(&d_sorted_move_nodes)
.arg(&d_sorted_move_parts)
.arg(&(challenge.max_part_size as i32))
.arg(&mut d_partition)
.arg(&mut d_nodes_in_part)
.arg(&mut d_moves_executed)
.launch(one_thread_cfg.clone())?;
}
stream.synchronize()?;
let moves_executed = stream.memcpy_dtov(&d_moves_executed)?[0];
if moves_executed == 0 {
break;
}
}
let partition = stream.memcpy_dtov(&d_partition)?;
let partition_u32: Vec<u32> = partition.iter().map(|&x| x as u32).collect();
save_solution(&Solution { partition: partition_u32 })?;
Ok(())
}

View File

@@ -0,0 +1,323 @@
#include <stdint.h>
#include <cuda_runtime.h>
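// Assigns each hyperedge to a cluster: 4 size buckets (<=2, 3-4, 5-8, >8 pins),
// striped within a bucket by hyperedge index. The bitmask stripe assumes
// num_clusters is a multiple of 4 with num_clusters/4 a power of two.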
extern "C" __global__ void hyperedge_clustering(
const int num_hyperedges,
const int num_clusters,
const int *hyperedge_offsets,
int *hyperedge_clusters
) {
int hedge = blockIdx.x * blockDim.x + threadIdx.x;
if (hedge < num_hyperedges) {
int start = hyperedge_offsets[hedge];
int end = hyperedge_offsets[hedge + 1];
int hedge_size = end - start;
int quarter_clusters = num_clusters >> 2;
int cluster_mask = quarter_clusters - 1;
int bucket = (hedge_size > 8) ? 3 :
(hedge_size > 4) ? 2 :
(hedge_size > 2) ? 1 : 0;
int cluster = bucket * quarter_clusters + (hedge & cluster_mask);
hyperedge_clusters[hedge] = cluster;
}
}
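// One thread per node: tallies weighted votes over the node's incident
// hyperedge clusters (smaller hyperedges weigh more) and emits a preferred
// partition plus a priority packing votes, capped degree, and a tiebreak.
// Only the first 64 clusters are tallied (fixed-size local array), so
// cluster counts above 64 are ignored in the voting.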
extern "C" __global__ void compute_node_preferences(
const int num_nodes,
const int num_parts,
const int num_hedge_clusters,
const int *node_hyperedges,
const int *node_offsets,
const int *hyperedge_clusters,
const int *hyperedge_offsets,
int *pref_parts,
int *pref_priorities
) {
int node = blockIdx.x * blockDim.x + threadIdx.x;
if (node < num_nodes) {
int start = node_offsets[node];
int end = node_offsets[node + 1];
int node_degree = end - start;
int cluster_votes[64];
int max_clusters = min(num_hedge_clusters, 64);
for (int i = 0; i < max_clusters; i++) {
cluster_votes[i] = 0;
}
int max_votes = 0;
int best_cluster = 0;
for (int j = start; j < end; j++) {
int hyperedge = node_hyperedges[j];
int cluster = hyperedge_clusters[hyperedge];
if (cluster >= 0 && cluster < max_clusters) {
int hedge_start = hyperedge_offsets[hyperedge];
int hedge_end = hyperedge_offsets[hyperedge + 1];
int hedge_size = hedge_end - hedge_start;
int weight = (hedge_size <= 2) ? 6 :
(hedge_size <= 4) ? 4 :
(hedge_size <= 8) ? 2 : 1;
cluster_votes[cluster] += weight;
if (cluster_votes[cluster] > max_votes ||
(cluster_votes[cluster] == max_votes && cluster < best_cluster)) {
max_votes = cluster_votes[cluster];
best_cluster = cluster;
}
}
}
int base_part = (num_parts > 0) ? (best_cluster % num_parts) : 0;
int target_partition = base_part;
pref_parts[node] = target_partition;
int degree_weight = node_degree > 255 ? 255 : node_degree;
pref_priorities[node] = (max_votes << 16) + (degree_weight << 8) + (num_parts - (node % num_parts));
}
}
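// Single-threaded greedy executor: walks nodes in priority order, placing each
// in its preferred partition or the next one with free capacity; falls back to
// node % num_parts if every partition is full.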
extern "C" __global__ void execute_node_assignments(
const int num_nodes,
const int num_parts,
const int max_part_size,
const int *sorted_nodes,
const int *sorted_parts,
int *partition,
int *nodes_in_part
) {
if (blockIdx.x == 0 && threadIdx.x == 0) {
for (int i = 0; i < num_nodes; i++) {
int node = sorted_nodes[i];
int preferred_part = sorted_parts[i];
if (node >= 0 && node < num_nodes && preferred_part >= 0 && preferred_part < num_parts) {
bool assigned = false;
for (int attempt = 0; attempt < num_parts; attempt++) {
int try_part = (preferred_part + attempt) % num_parts;
if (nodes_in_part[try_part] < max_part_size) {
partition[node] = try_part;
nodes_in_part[try_part]++;
assigned = true;
break;
}
}
if (!assigned) {
int fallback_part = node % num_parts;
partition[node] = fallback_part;
nodes_in_part[fallback_part]++;
}
}
}
}
}
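// One thread per node: builds a 64-bit mask of neighboring partitions per
// incident hyperedge (capped at 1024 hyperedges, sampled evenly beyond that),
// then scores candidate target partitions by connectivity gain plus a small
// balance bonus. The u64 masks support at most 64 partitions.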
extern "C" __global__ void compute_refinement_moves(
const int num_nodes,
const int num_parts,
const int max_part_size,
const int *node_hyperedges,
const int *node_offsets,
const int *hyperedge_nodes,
const int *hyperedge_offsets,
const int *partition,
const int *nodes_in_part,
int *move_parts,
int *move_priorities,
int *num_valid_moves,
unsigned long long *global_edge_flags
) {
int node = blockIdx.x * blockDim.x + threadIdx.x;
if (node < num_nodes) {
move_parts[node] = partition[node];
move_priorities[node] = 0;
int current_part = partition[node];
if (current_part < 0 || current_part >= num_parts || nodes_in_part[current_part] <= 1) return;
int start = node_offsets[node];
int end = node_offsets[node + 1];
int node_degree = end - start;
int degree_weight = node_degree > 255 ? 255 : node_degree;
int used_degree = node_degree > 1024 ? 1024 : node_degree;
unsigned long long *edge_flags = &global_edge_flags[node * 1024];
for (int j = 0; j < used_degree; j++) {
edge_flags[j] = 0;
int rel = (int)(((long long)j * node_degree) / used_degree);
int hyperedge = node_hyperedges[start + rel];
int hedge_start = hyperedge_offsets[hyperedge];
int hedge_end = hyperedge_offsets[hyperedge + 1];
for (int k = hedge_start; k < hedge_end; k++) {
int other_node = hyperedge_nodes[k];
if (other_node != node && other_node >= 0 && other_node < num_nodes) {
int part = partition[other_node];
if (part >= 0 && part < 64) {
edge_flags[j] |= 1ULL << part;
}
}
}
}
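// Connectivity cost of keeping the node in place: for each sampled hyperedge,
// (lambda - 1), where lambda counts the distinct partitions the edge touches.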
int original_cost = 0;
for (int j = 0; j < used_degree; j++) {
int lambda = __popcll(edge_flags[j] | (1ULL << current_part));
if (lambda > 1) {
original_cost += (lambda - 1);
}
}
int candidates[64];
int num_candidates = 0;
bool seen[64] = {false};
for (int j = 0; j < used_degree; j++) {
unsigned long long flags = edge_flags[j];
while (flags) {
int bit = __ffsll(flags) - 1;
flags &= ~(1ULL << bit);
if (bit != current_part && !seen[bit] && num_candidates < 64) {
candidates[num_candidates++] = bit;
seen[bit] = true;
}
}
}
int best_gain = 0;
int best_target = current_part;
for (int i = 0; i < num_candidates; i++) {
int target_part = candidates[i];
if (target_part < 0 || target_part >= num_parts) continue;
if (nodes_in_part[target_part] >= max_part_size) continue;
int new_cost = 0;
for (int j = 0; j < used_degree; j++) {
int lambda = __popcll(edge_flags[j] | (1ULL << target_part));
if (lambda > 1) {
new_cost += (lambda - 1);
}
}
int basic_gain = original_cost - new_cost;
int current_size = nodes_in_part[current_part];
int target_size = nodes_in_part[target_part];
int balance_bonus = 0;
if (current_size > target_size + 1) {
balance_bonus = 4;
}
int total_gain = basic_gain + balance_bonus;
if (total_gain > best_gain ||
(total_gain == best_gain && target_part < best_target)) {
best_gain = total_gain;
best_target = target_part;
}
}
if (best_gain > 0 && best_target != current_part) {
move_parts[node] = best_target;
move_priorities[node] = (best_gain << 16) + (degree_weight << 8) + (num_parts - (node % num_parts));
atomicAdd(num_valid_moves, 1);
}
}
}
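// Single-threaded executor: applies proposed moves in priority order,
// re-checking capacity and source-size constraints as the counts change.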
extern "C" __global__ void execute_refinement_moves(
const int num_valid_moves,
const int *sorted_nodes,
const int *sorted_parts,
const int max_part_size,
int *partition,
int *nodes_in_part,
int *moves_executed
) {
if (blockIdx.x == 0 && threadIdx.x == 0) {
for (int i = 0; i < num_valid_moves; i++) {
int node = sorted_nodes[i];
int target_part = sorted_parts[i];
if (node >= 0 && target_part >= 0) {
int current_part = partition[node];
if (current_part >= 0 &&
nodes_in_part[target_part] < max_part_size &&
nodes_in_part[current_part] > 1 &&
partition[node] == current_part) {
partition[node] = target_part;
nodes_in_part[current_part]--;
nodes_in_part[target_part]++;
(*moves_executed)++;
}
}
}
}
}
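// Single-threaded repair pass: pulls nodes into undersized partitions and
// pushes nodes out of oversized ones until every partition size fits
// [min_part_size, max_part_size].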
extern "C" __global__ void balance_final(
const int num_nodes,
const int num_parts,
const int min_part_size,
const int max_part_size,
int *partition,
int *nodes_in_part
) {
if (blockIdx.x == 0 && threadIdx.x == 0) {
for (int part = 0; part < num_parts; part++) {
while (nodes_in_part[part] < min_part_size) {
bool moved = false;
for (int other_part = 0; other_part < num_parts && !moved; other_part++) {
if (other_part != part && nodes_in_part[other_part] > min_part_size) {
for (int node = 0; node < num_nodes; node++) {
if (partition[node] == other_part) {
partition[node] = part;
nodes_in_part[other_part]--;
nodes_in_part[part]++;
moved = true;
break;
}
}
}
}
if (!moved) break;
}
}
for (int part = 0; part < num_parts; part++) {
while (nodes_in_part[part] > max_part_size) {
bool moved = false;
for (int other_part = 0; other_part < num_parts && !moved; other_part++) {
if (other_part != part && nodes_in_part[other_part] < max_part_size) {
for (int node = 0; node < num_nodes; node++) {
if (partition[node] == part) {
partition[node] = other_part;
nodes_in_part[part]--;
nodes_in_part[other_part]++;
moved = true;
break;
}
}
}
}
if (!moved) break;
}
}
}
}