From bb2bc7f6158044ee9d2efea73c7eaf5f8c612e7b Mon Sep 17 00:00:00 2001
From: FiveMovesAhead
Date: Tue, 2 Dec 2025 16:54:19 +0000
Subject: [PATCH] Submitted hypergraph/hyper_improved

---
 .../src/hypergraph/hyper_improved/README.md  |  23 +
 .../src/hypergraph/hyper_improved/kernels.cu | 395 ++++++++++++++++++
 .../src/hypergraph/hyper_improved/mod.rs     | 240 +++++++++++
 tig-algorithms/src/hypergraph/mod.rs         |   3 +-
 4 files changed, 660 insertions(+), 1 deletion(-)
 create mode 100644 tig-algorithms/src/hypergraph/hyper_improved/README.md
 create mode 100644 tig-algorithms/src/hypergraph/hyper_improved/kernels.cu
 create mode 100644 tig-algorithms/src/hypergraph/hyper_improved/mod.rs

diff --git a/tig-algorithms/src/hypergraph/hyper_improved/README.md b/tig-algorithms/src/hypergraph/hyper_improved/README.md
new file mode 100644
index 00000000..e530ff36
--- /dev/null
+++ b/tig-algorithms/src/hypergraph/hyper_improved/README.md
@@ -0,0 +1,23 @@
# TIG Code Submission

## Submission Details

* **Challenge Name:** hypergraph
* **Algorithm Name:** hyper_improved
* **Copyright:** 2025 Rootz
* **Identity of Submitter:** Rootz
* **Identity of Creator of Algorithmic Method:** null
* **Unique Algorithm Identifier (UAI):** null

## License

The files in this folder are under the following licenses:
* TIG Benchmarker Outbound License
* TIG Commercial License
* TIG Inbound Game License
* TIG Innovator Outbound Game License
* TIG Open Data License
* TIG THV Game License

Copies of the licenses can be obtained at:
https://github.com/tig-foundation/tig-monorepo/tree/main/docs/licenses
\ No newline at end of file

diff --git a/tig-algorithms/src/hypergraph/hyper_improved/kernels.cu b/tig-algorithms/src/hypergraph/hyper_improved/kernels.cu
new file mode 100644
index 00000000..50426474
--- /dev/null
+++ b/tig-algorithms/src/hypergraph/hyper_improved/kernels.cu
@@ -0,0 +1,395 @@
/*!Copyright 2025 Rootz

Identity of Submitter Rootz

UAI null

Licensed under the TIG Inbound Game License v2.0 or (at your option) any later
version (the "License"); you may not use this file except in compliance with the
License. You may obtain a copy of the License at

https://github.com/tig-foundation/tig-monorepo/tree/main/docs/licenses

Unless required by applicable law or agreed to in writing, software distributed
under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
CONDITIONS OF ANY KIND, either express or implied. See the License for the specific
language governing permissions and limitations under the License.
+*/ +#include +#include + +extern "C" __global__ void hyperedge_clustering( + const int num_hyperedges, + const int num_clusters, + const int *hyperedge_nodes, + const int *hyperedge_offsets, + int *hyperedge_clusters +) { + int hedge = blockIdx.x * blockDim.x + threadIdx.x; + + if (hedge < num_hyperedges) { + int start = hyperedge_offsets[hedge]; + int end = hyperedge_offsets[hedge + 1]; + int hedge_size = end - start; + + int quarter_clusters = num_clusters >> 2; + int cluster_mask = quarter_clusters - 1; + + int cluster; + if (hedge_size <= 2) { + cluster = hedge & cluster_mask; + } else if (hedge_size <= 4) { + cluster = quarter_clusters + (hedge & cluster_mask); + } else if (hedge_size <= 8) { + cluster = (quarter_clusters << 1) + (hedge & cluster_mask); + } else { + cluster = (quarter_clusters * 3) + (hedge & cluster_mask); + } + + hyperedge_clusters[hedge] = cluster; + } +} + +extern "C" __global__ void compute_node_preferences( + const int num_nodes, + const int num_parts, + const int num_hedge_clusters, + const int *node_hyperedges, + const int *node_offsets, + const int *hyperedge_clusters, + const int *hyperedge_offsets, + int *pref_nodes, + int *pref_parts, + int *pref_gains, + int *pref_priorities +) { + int node = blockIdx.x * blockDim.x + threadIdx.x; + + if (node < num_nodes) { + int start = node_offsets[node]; + int end = node_offsets[node + 1]; + int node_degree = end - start; + + int cluster_votes[256]; + int max_clusters = min(num_hedge_clusters, 256); + for (int i = 0; i < max_clusters; i++) { + cluster_votes[i] = 0; + } + + int max_votes = 0; + int best_cluster = 0; + + for (int j = start; j < end; j++) { + int hyperedge = node_hyperedges[j]; + int cluster = hyperedge_clusters[hyperedge]; + + if (cluster >= 0 && cluster < max_clusters) { + int hedge_start = hyperedge_offsets[hyperedge]; + int hedge_end = hyperedge_offsets[hyperedge + 1]; + int hedge_size = hedge_end - hedge_start; + int weight = (hedge_size <= 3) ? 4 : (hedge_size <= 6) ? 
// Serial, deterministic placement: walk nodes in priority order, put each in
// its preferred partition, and probe forward when a partition is full.
extern "C" __global__ void execute_node_assignments(
    const int num_nodes,
    const int num_parts,
    const int max_part_size,
    const int *sorted_nodes,
    const int *sorted_parts,
    int *partition,
    int *nodes_in_part
) {
    if (blockIdx.x == 0 && threadIdx.x == 0) {
        for (int i = 0; i < num_nodes; i++) {
            int node = sorted_nodes[i];
            int preferred_part = sorted_parts[i];

            if (node >= 0 && node < num_nodes && preferred_part >= 0 && preferred_part < num_parts) {
                bool assigned = false;
                for (int attempt = 0; attempt < num_parts; attempt++) {
                    int try_part = (preferred_part + attempt) % num_parts;
                    if (nodes_in_part[try_part] < max_part_size) {
                        partition[node] = try_part;
                        nodes_in_part[try_part]++;
                        assigned = true;
                        break;
                    }
                }

                if (!assigned) {
                    // All partitions are full: overfill deterministically and
                    // let balance_final repair the violation.
                    int fallback_part = node % num_parts;
                    partition[node] = fallback_part;
                    nodes_in_part[fallback_part]++;
                }
            }
        }
    }
}
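execute_node_assignments is deliberately launched on a single thread: placement is order-dependent, and a serial scan over the pre-sorted preference list keeps the result deterministic. The policy is plain first-fit with wraparound probing, sketched below as a hypothetical CPU helper (the name and signature are illustrative, not part of the patch).

// First-fit placement with wraparound probing, mirroring the kernel's policy.
fn assign(node: usize, preferred: usize, num_parts: usize, max_part_size: i32, counts: &mut [i32]) -> usize {
    for attempt in 0..num_parts {
        let p = (preferred + attempt) % num_parts;
        if counts[p] < max_part_size {
            counts[p] += 1;
            return p;
        }
    }
    // Every partition is full: overfill deterministically; the final
    // balancing pass is expected to fix the violation.
    let p = node % num_parts;
    counts[p] += 1;
    p
}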
// One FM-style refinement pass: for every node, score moving it to each other
// partition by the change in (lambda - 1) connectivity cost, using one
// partition bitset per incident hyperedge held in per-node scratch buffers.
extern "C" __global__ void compute_refinement_moves(
    const int num_nodes,
    const int num_parts,
    const int max_part_size,
    const int num_hyperedges,
    const int *node_hyperedges,
    const int *node_offsets,
    const int *hyperedge_nodes,
    const int *hyperedge_offsets,
    const int *partition,
    const int *nodes_in_part,
    int *move_nodes,
    int *move_parts,
    int *move_gains,
    int *move_priorities,
    int *num_valid_moves,
    const int round,
    unsigned long long *global_edge_flags_low,
    unsigned long long *global_edge_flags_high
) {
    int node = blockIdx.x * blockDim.x + threadIdx.x;

    if (node < num_nodes) {
        move_nodes[node] = node;
        move_parts[node] = partition[node];
        move_gains[node] = 0;
        move_priorities[node] = 0;

        int current_part = partition[node];
        if (current_part < 0 || current_part >= num_parts || nodes_in_part[current_part] <= 1) return;

        int start = node_offsets[node];
        int end = node_offsets[node + 1];
        int node_degree = end - start;

        // Scratch rows hold one bitset per incident hyperedge; skip very
        // high-degree nodes that would overflow the 3000-slot budget.
        if (node_degree > 3000) return;

        // One 64-bit word covers up to 64 partitions; beyond that a second
        // word tracks partitions 64..127.
        bool use_dual_buffer = (num_parts > 64);

        unsigned long long *edge_flags_low = &global_edge_flags_low[node * 3000];
        unsigned long long *edge_flags_high = &global_edge_flags_high[node * 3000];

        for (int j = 0; j < node_degree; j++) {
            edge_flags_low[j] = 0;
            if (use_dual_buffer) {
                edge_flags_high[j] = 0;
            }

            int hyperedge = node_hyperedges[start + j];
            int hedge_start = hyperedge_offsets[hyperedge];
            int hedge_end = hyperedge_offsets[hyperedge + 1];

            // Record which partitions the *other* pins of this hyperedge occupy.
            for (int k = hedge_start; k < hedge_end; k++) {
                int other_node = hyperedge_nodes[k];
                if (other_node != node && other_node >= 0 && other_node < num_nodes) {
                    int part = partition[other_node];
                    if (part >= 0 && part < num_parts) {
                        if (use_dual_buffer) {
                            if (part < 64) {
                                edge_flags_low[j] |= 1ULL << part;
                            } else {
                                edge_flags_high[j] |= 1ULL << (part - 64);
                            }
                        } else {
                            if (part < min(num_parts, 64)) {
                                edge_flags_low[j] |= 1ULL << part;
                            }
                        }
                    }
                }
            }
        }

        // Cost of the current placement: sum over incident hyperedges of
        // (number of partitions spanned - 1).
        int original_cost = 0;
        for (int j = 0; j < node_degree; j++) {
            unsigned long long current_low = edge_flags_low[j];
            unsigned long long current_high = use_dual_buffer ? edge_flags_high[j] : 0;

            int lambda;
            if (use_dual_buffer) {
                if (current_part < 64) {
                    current_low |= 1ULL << current_part;
                } else {
                    current_high |= 1ULL << (current_part - 64);
                }
                lambda = __popcll(current_low) + __popcll(current_high);
            } else {
                lambda = __popcll(current_low | (1ULL << current_part));
            }

            if (lambda > 1) {
                original_cost += (lambda - 1);
            }
        }

        int best_gain = 0;
        int best_target = current_part;

        // Rotate the scan origin with the round number so successive rounds
        // do not always favor the same low-numbered partitions.
        for (int offset = 0; offset < num_parts; offset++) {
            int target_part = (node + round + offset) % num_parts;
            if (target_part == current_part) continue;
            if (target_part < 0 || target_part >= num_parts) continue;
            if (nodes_in_part[target_part] >= max_part_size) continue;

            int new_cost = 0;
            for (int j = 0; j < node_degree; j++) {
                unsigned long long target_low = edge_flags_low[j];
                unsigned long long target_high = use_dual_buffer ? edge_flags_high[j] : 0;

                int lambda;
                if (use_dual_buffer) {
                    if (target_part < 64) {
                        target_low |= 1ULL << target_part;
                    } else {
                        target_high |= 1ULL << (target_part - 64);
                    }
                    lambda = __popcll(target_low) + __popcll(target_high);
                } else {
                    lambda = __popcll(target_low | (1ULL << target_part));
                }

                if (lambda > 1) {
                    new_cost += (lambda - 1);
                }
            }

            int basic_gain = original_cost - new_cost;

            // Nudge moves that also improve balance, more aggressively when
            // there are fewer partitions.
            int current_size = nodes_in_part[current_part];
            int target_size = nodes_in_part[target_part];
            int balance_bonus = 0;

            if (current_size > target_size + 1) {
                if (num_parts >= 120) {
                    balance_bonus = 2;
                } else if (num_parts >= 100) {
                    balance_bonus = 3;
                } else {
                    balance_bonus = 4;
                }
            }

            int total_gain = basic_gain + balance_bonus;

            if (total_gain > best_gain ||
                (total_gain == best_gain && target_part < best_target)) {
                best_gain = total_gain;
                best_target = target_part;
            }
        }

        if (best_gain > 0 && best_target != current_part) {
            move_parts[node] = best_target;
            move_gains[node] = best_gain;
            move_priorities[node] = (best_gain << 16) + (num_parts - (node % num_parts));
            atomicAdd(num_valid_moves, 1);
        }
    }
}
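The scorer above prices a placement with the (λ − 1) connectivity metric: each incident hyperedge pays one unit for every partition it spans beyond the first, and a move's gain is the current cost minus the candidate cost. Because the kernel's two-word bitsets cover at most 128 partitions, λ reduces to two popcounts. A minimal Rust sketch under the same encoding follows; the helper names and the pre-collected (low, high) pairs of other-pin partitions are assumptions for illustration.

// (lambda - 1) cost of putting the node in `part`, given bitsets of the
// partitions occupied by the other pins of each incident hyperedge.
fn lambda_minus_one(part: u32, edge_flags: &[(u64, u64)]) -> i32 {
    let mut cost = 0i32;
    for &(mut low, mut high) in edge_flags {
        if part < 64 {
            low |= 1u64 << part;
        } else {
            high |= 1u64 << (part - 64);
        }
        // lambda = number of distinct partitions the hyperedge now spans.
        let lambda = (low.count_ones() + high.count_ones()) as i32;
        cost += lambda - 1;
    }
    cost
}

// Positive gain means the move shrinks total connectivity.
fn move_gain(from: u32, to: u32, edge_flags: &[(u64, u64)]) -> i32 {
    lambda_minus_one(from, edge_flags) - lambda_minus_one(to, edge_flags)
}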
// Apply moves serially in priority order, re-checking balance constraints so
// earlier moves cannot invalidate later ones.
extern "C" __global__ void execute_refinement_moves(
    const int num_valid_moves,
    const int *sorted_nodes,
    const int *sorted_parts,
    const int max_part_size,
    int *partition,
    int *nodes_in_part,
    int *moves_executed
) {
    if (blockIdx.x == 0 && threadIdx.x == 0) {
        for (int i = 0; i < num_valid_moves; i++) {
            int node = sorted_nodes[i];
            int target_part = sorted_parts[i];

            if (node >= 0 && target_part >= 0) {
                int current_part = partition[node];

                if (current_part >= 0 &&
                    nodes_in_part[target_part] < max_part_size &&
                    nodes_in_part[current_part] > 1 &&
                    partition[node] == current_part) {

                    partition[node] = target_part;
                    nodes_in_part[current_part]--;
                    nodes_in_part[target_part]++;
                    (*moves_executed)++;
                }
            }
        }
    }
}

// Final repair pass: top up partitions below min_part_size, then drain any
// above max_part_size, moving arbitrary nodes between partitions.
extern "C" __global__ void balance_final(
    const int num_nodes,
    const int num_parts,
    const int min_part_size,
    const int max_part_size,
    int *partition,
    int *nodes_in_part
) {
    if (blockIdx.x == 0 && threadIdx.x == 0) {
        for (int part = 0; part < num_parts; part++) {
            while (nodes_in_part[part] < min_part_size) {
                bool moved = false;
                for (int other_part = 0; other_part < num_parts && !moved; other_part++) {
                    if (other_part != part && nodes_in_part[other_part] > min_part_size) {
                        for (int node = 0; node < num_nodes; node++) {
                            if (partition[node] == other_part) {
                                partition[node] = part;
                                nodes_in_part[other_part]--;
                                nodes_in_part[part]++;
                                moved = true;
                                break;
                            }
                        }
                    }
                }
                if (!moved) break;
            }
        }

        for (int part = 0; part < num_parts; part++) {
            while (nodes_in_part[part] > max_part_size) {
                bool moved = false;
                for (int other_part = 0; other_part < num_parts && !moved; other_part++) {
                    if (other_part != part && nodes_in_part[other_part] < max_part_size) {
                        for (int node = 0; node < num_nodes; node++) {
                            if (partition[node] == part) {
                                partition[node] = other_part;
                                nodes_in_part[part]--;
                                nodes_in_part[other_part]++;
                                moved = true;
                                break;
                            }
                        }
                    }
                }
                if (!moved) break;
            }
        }
    }
}
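balance_final is a last-resort repair pass: it moves arbitrary nodes with no regard for connectivity, so refinement should ideally leave it nothing to do. The invariant it exists to establish is simply that every partition size lands inside [min_part_size, max_part_size], which is cheap to sanity-check host-side with a hypothetical helper like this:

// True when every partition size respects the balance bounds.
fn is_balanced(counts: &[i32], min_part_size: i32, max_part_size: i32) -> bool {
    counts.iter().all(|&c| (min_part_size..=max_part_size).contains(&c))
}

For example, `is_balanced(&[4, 5, 3], 1, 5)` holds, while `is_balanced(&[6, 5, 1], 1, 5)` does not.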
diff --git a/tig-algorithms/src/hypergraph/hyper_improved/mod.rs b/tig-algorithms/src/hypergraph/hyper_improved/mod.rs
new file mode 100644
index 00000000..d782f84a
--- /dev/null
+++ b/tig-algorithms/src/hypergraph/hyper_improved/mod.rs
@@ -0,0 +1,240 @@
use cudarc::{
    driver::{safe::LaunchConfig, CudaModule, CudaStream, PushKernelArg},
    runtime::sys::cudaDeviceProp,
};
use serde_json::{Map, Value};
use std::sync::Arc;
use tig_challenges::hypergraph::*;


pub fn solve_challenge(
    challenge: &Challenge,
    save_solution: &dyn Fn(&Solution) -> anyhow::Result<()>,
    hyperparameters: &Option<Map<String, Value>>,
    module: Arc<CudaModule>,
    stream: Arc<CudaStream>,
    prop: &cudaDeviceProp,
) -> anyhow::Result<()> {
    let block_size = std::cmp::min(256, prop.maxThreadsPerBlock as u32);

    let hyperedge_cluster_kernel = module.load_function("hyperedge_clustering")?;
    let compute_preferences_kernel = module.load_function("compute_node_preferences")?;
    let execute_assignments_kernel = module.load_function("execute_node_assignments")?;
    let compute_moves_kernel = module.load_function("compute_refinement_moves")?;
    let execute_moves_kernel = module.load_function("execute_refinement_moves")?;
    let balance_kernel = module.load_function("balance_final")?;

    let cfg = LaunchConfig {
        grid_dim: ((challenge.num_nodes as u32 + block_size - 1) / block_size, 1, 1),
        block_dim: (block_size, 1, 1),
        shared_mem_bytes: 0,
    };

    let one_thread_cfg = LaunchConfig {
        grid_dim: (1, 1, 1),
        block_dim: (1, 1, 1),
        shared_mem_bytes: 0,
    };

    // Empirically tuned hyperedge-cluster counts, keyed by instance size in
    // thousands of hyperedges; larger instances fall back to an env override.
    let num_hedge_clusters = if challenge.num_hyperedges < 50000 {
        let thousands = challenge.num_hyperedges / 1000;
        match thousands {
            5 => 2, 6 => 2, 7 => 2, 8 => 2, 9 => 2, 10 => 2, 11 => 4, 12 => 6, 13 => 2, 14 => 2,
            15 => 6, 16 => 2, 17 => 2, 18 => 2, 19 => 4, 20 => 2, 21 => 4, 22 => 4, 23 => 4, 24 => 2,
            25 => 6, 26 => 2, 27 => 2, 28 => 2, 29 => 2, 30 => 2, 31 => 2, 32 => 2, 33 => 8, 34 => 8,
            35 => 8, 36 => 4, 37 => 8, 38 => 4, 39 => 6, 40 => 2, 41 => 2, 42 => 2, 43 => 6, 44 => 2,
            45 => 2, 46 => 2, 47 => 2, 48 => 2, 49 => 2,
            _ => if thousands < 5 { 2 } else { 8 }
        }
    } else {
        std::env::var("CLUSTER_SIZE")
            .unwrap_or_else(|_| "8".to_string())
            .parse::<i32>()
            .unwrap_or(8)
    };

    let mut d_hyperedge_clusters = stream.alloc_zeros::<i32>(challenge.num_hyperedges as usize)?;
    let mut d_partition = stream.alloc_zeros::<i32>(challenge.num_nodes as usize)?;
    let mut d_nodes_in_part = stream.alloc_zeros::<i32>(challenge.num_parts as usize)?;

    let mut d_pref_nodes = stream.alloc_zeros::<i32>(challenge.num_nodes as usize)?;
    let mut d_pref_parts = stream.alloc_zeros::<i32>(challenge.num_nodes as usize)?;
    let mut d_pref_gains = stream.alloc_zeros::<i32>(challenge.num_nodes as usize)?;
    let mut d_pref_priorities = stream.alloc_zeros::<i32>(challenge.num_nodes as usize)?;

    let mut d_move_nodes = stream.alloc_zeros::<i32>(challenge.num_nodes as usize)?;
    let mut d_move_parts = stream.alloc_zeros::<i32>(challenge.num_nodes as usize)?;
    let mut d_move_gains = stream.alloc_zeros::<i32>(challenge.num_nodes as usize)?;
    let mut d_move_priorities = stream.alloc_zeros::<i32>(challenge.num_nodes as usize)?;

    // One 3000-slot scratch row per potential thread, matching the kernel's
    // fixed per-node degree budget; usize math avoids u32 overflow on big inputs.
    let num_threads = ((challenge.num_nodes as u32 + block_size - 1) / block_size) * block_size;
    let buffer_size = num_threads as usize * 3000;
    let mut d_global_edge_flags_low = stream.alloc_zeros::<u64>(buffer_size)?;

    let mut d_global_edge_flags_high = if challenge.num_parts > 64 {
        stream.alloc_zeros::<u64>(buffer_size)?
    } else {
        // Dummy allocation: the kernel never touches the high words here.
        stream.alloc_zeros::<u64>(1)?
    };

    unsafe {
        stream.launch_builder(&hyperedge_cluster_kernel)
            .arg(&(challenge.num_hyperedges as i32))
            .arg(&(num_hedge_clusters as i32))
            .arg(&challenge.d_hyperedge_nodes)
            .arg(&challenge.d_hyperedge_offsets)
            .arg(&mut d_hyperedge_clusters)
            .launch(LaunchConfig {
                grid_dim: ((challenge.num_hyperedges as u32 + block_size - 1) / block_size, 1, 1),
                block_dim: (block_size, 1, 1),
                shared_mem_bytes: 0,
            })?;
    }
    stream.synchronize()?;

    unsafe {
        stream.launch_builder(&compute_preferences_kernel)
            .arg(&(challenge.num_nodes as i32))
            .arg(&(challenge.num_parts as i32))
            .arg(&(num_hedge_clusters as i32))
            .arg(&challenge.d_node_hyperedges)
            .arg(&challenge.d_node_offsets)
            .arg(&d_hyperedge_clusters)
            .arg(&challenge.d_hyperedge_offsets)
            .arg(&mut d_pref_nodes)
            .arg(&mut d_pref_parts)
            .arg(&mut d_pref_gains)
            .arg(&mut d_pref_priorities)
            .launch(cfg.clone())?;
    }
    stream.synchronize()?;

    let pref_nodes = stream.memcpy_dtov(&d_pref_nodes)?;
    let pref_parts = stream.memcpy_dtov(&d_pref_parts)?;
    let pref_priorities = stream.memcpy_dtov(&d_pref_priorities)?;

    // Sort node indices by packed priority, descending, on the host.
    let mut indices: Vec<usize> = (0..challenge.num_nodes as usize).collect();
    indices.sort_by(|&a, &b| pref_priorities[b].cmp(&pref_priorities[a]));

    let sorted_nodes: Vec<i32> = indices.iter().map(|&i| pref_nodes[i]).collect();
    let sorted_parts: Vec<i32> = indices.iter().map(|&i| pref_parts[i]).collect();

    let d_sorted_nodes = stream.memcpy_stod(&sorted_nodes)?;
    let d_sorted_parts = stream.memcpy_stod(&sorted_parts)?;

    unsafe {
        stream.launch_builder(&execute_assignments_kernel)
            .arg(&(challenge.num_nodes as i32))
            .arg(&(challenge.num_parts as i32))
            .arg(&(challenge.max_part_size as i32))
            .arg(&d_sorted_nodes)
            .arg(&d_sorted_parts)
            .arg(&mut d_partition)
            .arg(&mut d_nodes_in_part)
            .launch(one_thread_cfg.clone())?;
    }
    stream.synchronize()?;
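From this point the solver alternates device and host work for up to 100 refinement rounds: compute_refinement_moves proposes one move per node on the GPU, the host keeps only the positive-gain proposals and orders them best-first by the packed priority, and execute_refinement_moves applies them serially under the balance constraints. The host-side selection step reduces to a sketch like the following (hypothetical helper over the copied-back arrays, shown for clarity only):

// Filter to positive-gain moves, then order best-first by packed priority.
fn select_moves(nodes: &[i32], parts: &[i32], gains: &[i32], priorities: &[i32]) -> (Vec<i32>, Vec<i32>) {
    let mut picked: Vec<usize> = (0..gains.len()).filter(|&i| gains[i] > 0).collect();
    picked.sort_by(|&a, &b| priorities[b].cmp(&priorities[a]));
    (
        picked.iter().map(|&i| nodes[i]).collect(),
        picked.iter().map(|&i| parts[i]).collect(),
    )
}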
    let mut valid_moves: Vec<(i32, i32, i32)> = Vec::with_capacity(challenge.num_nodes as usize);
    let mut sorted_move_nodes: Vec<i32> = Vec::with_capacity(challenge.num_nodes as usize);
    let mut sorted_move_parts: Vec<i32> = Vec::with_capacity(challenge.num_nodes as usize);

    for round in 0..100 {
        // Fresh counter each round so the early-exit check below sees only
        // this round's proposals rather than an accumulated total.
        let mut d_num_valid_moves = stream.alloc_zeros::<i32>(1)?;

        unsafe {
            stream.launch_builder(&compute_moves_kernel)
                .arg(&(challenge.num_nodes as i32))
                .arg(&(challenge.num_parts as i32))
                .arg(&(challenge.max_part_size as i32))
                .arg(&(challenge.num_hyperedges as i32))
                .arg(&challenge.d_node_hyperedges)
                .arg(&challenge.d_node_offsets)
                .arg(&challenge.d_hyperedge_nodes)
                .arg(&challenge.d_hyperedge_offsets)
                .arg(&d_partition)
                .arg(&d_nodes_in_part)
                .arg(&mut d_move_nodes)
                .arg(&mut d_move_parts)
                .arg(&mut d_move_gains)
                .arg(&mut d_move_priorities)
                .arg(&mut d_num_valid_moves)
                .arg(&round)
                .arg(&mut d_global_edge_flags_low)
                .arg(&mut d_global_edge_flags_high)
                .launch(cfg.clone())?;
        }
        stream.synchronize()?;

        let num_valid_moves = stream.memcpy_dtov(&d_num_valid_moves)?[0];
        if num_valid_moves == 0 {
            break;
        }

        let move_gains = stream.memcpy_dtov(&d_move_gains)?;
        let valid_indices: Vec<usize> = move_gains.iter().enumerate()
            .filter(|(_, &gain)| gain > 0)
            .map(|(i, _)| i)
            .collect();

        if valid_indices.is_empty() {
            break;
        }

        let move_nodes = stream.memcpy_dtov(&d_move_nodes)?;
        let move_parts = stream.memcpy_dtov(&d_move_parts)?;
        let move_priorities = stream.memcpy_dtov(&d_move_priorities)?;

        valid_moves.clear();
        for &i in &valid_indices {
            valid_moves.push((move_nodes[i], move_parts[i], move_priorities[i]));
        }

        // Apply the highest-priority (largest-gain) moves first.
        valid_moves.sort_by(|a, b| b.2.cmp(&a.2));

        sorted_move_nodes.clear();
        sorted_move_parts.clear();
        sorted_move_nodes.extend(valid_moves.iter().map(|&(node, _, _)| node));
        sorted_move_parts.extend(valid_moves.iter().map(|&(_, part, _)| part));

        let d_sorted_move_nodes = stream.memcpy_stod(&sorted_move_nodes)?;
        let d_sorted_move_parts = stream.memcpy_stod(&sorted_move_parts)?;
        let mut d_moves_executed = stream.alloc_zeros::<i32>(1)?;

        unsafe {
            stream.launch_builder(&execute_moves_kernel)
                .arg(&(sorted_move_nodes.len() as i32))
                .arg(&d_sorted_move_nodes)
                .arg(&d_sorted_move_parts)
                .arg(&(challenge.max_part_size as i32))
                .arg(&mut d_partition)
                .arg(&mut d_nodes_in_part)
                .arg(&mut d_moves_executed)
                .launch(one_thread_cfg.clone())?;
        }
        stream.synchronize()?;

        let moves_executed = stream.memcpy_dtov(&d_moves_executed)?[0];
        if moves_executed == 0 {
            break;
        }
    }

    unsafe {
        stream.launch_builder(&balance_kernel)
            .arg(&(challenge.num_nodes as i32))
            .arg(&(challenge.num_parts as i32))
            .arg(&1i32)
            .arg(&(challenge.max_part_size as i32))
            .arg(&mut d_partition)
            .arg(&mut d_nodes_in_part)
            .launch(one_thread_cfg.clone())?;
    }
    stream.synchronize()?;

    let partition = stream.memcpy_dtov(&d_partition)?;
    let partition_u32: Vec<u32> = partition.iter().map(|&x| x as u32).collect();

    save_solution(&Solution { partition: partition_u32 })?;
    Ok(())
}

pub fn help() {
    println!("No help information available.");
}

diff --git a/tig-algorithms/src/hypergraph/mod.rs b/tig-algorithms/src/hypergraph/mod.rs
index 9beec594..e205b449 100644
--- a/tig-algorithms/src/hypergraph/mod.rs
+++ b/tig-algorithms/src/hypergraph/mod.rs
@@ -2,7 +2,8 @@
 
 // c005_a002
 
-// c005_a003
+pub mod hyper_improved;
+pub use hyper_improved as c005_a003;
 
 // c005_a004