From e306f8f97ebaff741828e42ede0568898ff061a4 Mon Sep 17 00:00:00 2001
From: FiveMovesAhead
Date: Tue, 6 May 2025 21:11:02 +0100
Subject: [PATCH] Add hypergraph partitioning challenge.

---
 docs/challenges/hypergraph.md    | 144 +++++++
 tig-breakthroughs/evidence.md    |   2 +-
 tig-challenges/README.md         |   1 +
 tig-challenges/src/hypergraph.cu | 699 +++++++++++++++++++++++++++++++
 tig-challenges/src/hypergraph.rs | 514 +++++++++++++++++++++++
 5 files changed, 1359 insertions(+), 1 deletion(-)
 create mode 100644 docs/challenges/hypergraph.md
 create mode 100644 tig-challenges/src/hypergraph.cu
 create mode 100644 tig-challenges/src/hypergraph.rs

diff --git a/docs/challenges/hypergraph.md b/docs/challenges/hypergraph.md
new file mode 100644
index 0000000..65351f9
--- /dev/null
+++ b/docs/challenges/hypergraph.md
@@ -0,0 +1,144 @@
+## Overview
+[A hypergraph is a generalization of a graph where edges can connect more than just two vertices. Hypergraph partitioning is a technique used to divide the vertices of a hypergraph into separate groups (partitions) with the goal of minimizing the number of connections (hyperedges) linking these partitions. This is important for various applications, including parallel computing, VLSI design, and data analysis.](https://en.wikipedia.org/wiki/Hypergraph)
+
+## Challenge Overview
+
+For our challenge, we use a version of the hypergraph partitioning problem with configurable difficulty, where the following two parameters can be adjusted to vary the difficulty of the challenge:
+
+- Parameter 1: `num_hyperedges` is the number of hyperedges in the hypergraph.
+- Parameter 2: `better_than_baseline` ≥ 1 (see Our Challenge)
+
+A hypergraph is a structure made up of:
+* Nodes, each belonging to one or more hyperedges.
+* Hyperedges, each containing one or more nodes.
+
+TIG's generation method is such that:
+* The weight/cost of nodes and hyperedges is fixed at 1 (in some variants costs can differ)
+* The number of nodes is around 92% of the number of hyperedges (i.e. if there are 100 hyperedges, there are around 92 nodes).
+* The number of hyperedges that a node belongs to follows a [power law distribution](https://en.wikipedia.org/wiki/Power_law)
+* The number of nodes contained by a hyperedge follows a [power law distribution](https://en.wikipedia.org/wiki/Power_law)
+
+**Objective:**
+
+The goal is deceptively simple: each node must be assigned to one of 64 parts (i.e. a 64-way partition).
+
+A partition is scored by the connectivity metric, where the connectivity of each hyperedge is the number of parts it intersects:
+
+```
+connectivity_metric = 0
+for each hyperedge:
+    intersected = set(
+        partition[node]  # contains the id of the part a node is assigned to
+        for node in hyperedge
+    )
+    connectivity = len(intersected)
+    connectivity_metric += connectivity - 1
+```
+
+The lower the connectivity metric, the better the partition.
+
+**Constraints:**
+1. Each node must be assigned to one part.
+2. Every part must contain at least one node.
+3. The number of nodes assigned to each part cannot be larger than 1.03x the average part size:
+```
+average_size = num_nodes / num_parts  # num_parts = 64
+max_size = ceil(average_size * 1.03)
+for part in partition:
+    len(part) <= max_size
+```
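+
+To make the objective and constraints concrete, here is a minimal Python sketch that scores and validates a candidate partition. It is purely illustrative: the `hyperedges` input is assumed to be a list of node-id lists, which is not the challenge's actual data format.
+
+```
+from math import ceil
+
+def connectivity_metric(hyperedges, partition):
+    # Sum over hyperedges of (number of distinct parts intersected - 1).
+    return sum(len({partition[node] for node in edge}) - 1 for edge in hyperedges)
+
+def is_valid(partition, num_nodes, num_parts=64):
+    # Constraint 1: every node is assigned a part id in [0, num_parts).
+    if len(partition) != num_nodes or any(not 0 <= p < num_parts for p in partition):
+        return False
+    sizes = [partition.count(p) for p in range(num_parts)]
+    # Constraints 2 and 3: no empty part, no part above 1.03x the average size.
+    return all(1 <= size <= ceil(num_nodes / num_parts * 1.03) for size in sizes)
+```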
+
+## Example
+
+Consider an example instance with `num_hyperedges = 16` and `num_nodes = 14`:
+
+```
+Edge ID:  SIZE:  NODES:
+   0        2    8, 11
+   1       12    0, 1, 2, 4, 5, 7, 8, 9, 10, 11, 12, 13
+   2        2    8, 9
+   3        8    0, 1, 2, 3, 4, 7, 8, 11
+   4        4    8, 9, 10, 11
+   5        1    13
+   6        4    4, 5, 6, 7
+   7        1    12
+   8        9    1, 2, 4, 6, 7, 8, 9, 10, 11
+   9        2    12, 13
+  10        2    12, 13
+  11        2    1, 2
+  12        3    8, 12, 13
+  13       10    0, 1, 2, 3, 4, 7, 8, 9, 10, 11
+  14        4    0, 1, 2, 3
+  15        3    8, 9, 10
+
+baseline_connectivity_metric = 26
+```
+
+Now consider the following partition:
+```
+partition = [1, 3, 3, 0, 2, 0, 0, 2, 3, 2, 1, 2, 1, 1]
+# nodes in part 0: [3, 5, 6]
+# nodes in part 1: [0, 10, 12, 13]
+# nodes in part 2: [4, 7, 9, 11]
+# nodes in part 3: [1, 2, 8]
+```
+
+Evaluating the connectivity of each hyperedge:
+```
+Hyperedge ID:       0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
+Connectivity - 1:   1  3  1  3  2  0  1  0  3  0  2  0  1  3  2  2
+
+connectivity_metric = 24
+
+# explanation:
+hyperedge 4 contains nodes [8, 9, 10, 11], overlapping with 3 parts (connectivity = 3)
+    node 8 is in part 3
+    nodes 9 and 11 are in part 2
+    node 10 is in part 1
+```
+
+This partition is ~7.7% better than the baseline:
+```
+better_than_baseline = 1 - connectivity_metric / baseline_connectivity_metric
+                     = 1 - 24 / 26
+                     ≈ 0.077
+```
+
+## Our Challenge
+
+At TIG, the baseline connectivity is determined using a greedy bipartition approach. The nodes are ordered by degree; then, at each bipartition, nodes are assigned to the left or right part based on the number of hyperedges they have in common with the nodes already in each part. This process is repeated until the desired number of parts is reached (e.g. 64).
+
+Each instance of TIG's hypergraph partitioning problem contains 16 random sub-instances, each with its own baseline connectivity metric. For each sub-instance, we calculate how much better your connectivity metric is than the baseline connectivity metric, expressed as a percentage improvement. This improvement percentage is called `better_than_baseline`. Your overall performance is measured by taking the root mean square of these 16 `better_than_baseline` percentages. To pass a difficulty level, this overall score must meet or exceed the specified difficulty target; a scoring sketch follows below.
+
+For precision, `better_than_baseline` is stored as an integer where each unit represents 0.1%. For example, a `better_than_baseline` value of 22 corresponds to 22/1000 = 2.2%.
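+
+A minimal sketch of this scoring rule, assuming the 16 (connectivity_metric, baseline_connectivity_metric) pairs have already been computed; the function and variable names are illustrative, not part of the TIG API:
+
+```
+def passes_difficulty(results, target_better_than_baseline):
+    # results: 16 (connectivity_metric, baseline_connectivity_metric) pairs
+    improvements = [1 - metric / baseline for metric, baseline in results]
+    # root mean square of the 16 better_than_baseline values
+    rms = (sum(x * x for x in improvements) / len(improvements)) ** 0.5
+    # the difficulty target is stored in units of 0.1%
+    return rms >= target_better_than_baseline / 1000
+
+# a uniform 2.2% improvement meets a target of 22 (= 2.2%):
+assert passes_difficulty([(978, 1000)] * 16, 22)
+```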
+
+## Applications
+
+Hypergraphs are a powerful tool for representing complex networks in which relationships may involve more than two elements simultaneously. Hypergraph partitioning refers to dividing such a network into a specified number of groups that are roughly equal in size while keeping as many related items together as possible. Although the problem is computationally challenging (NP-hard), it has broad applications across numerous fields:
+
+* **Parallel Computing & Load Balancing:** By intelligently distributing tasks across processors, hypergraph partitioning minimizes communication overhead and enhances overall computational efficiency [^1][^2][^3][^4][^5].
+* **Distributed Neural Network Training:** It enables the partitioning of compute graphs across multiple GPUs or servers, significantly accelerating the training of deep learning models [^6][^7].
+* **VLSI & Circuit Design:** By effectively grouping circuit components, it optimizes chip layouts and reduces interconnect complexity, leading to faster and more efficient designs [^8][^9].
+* **Social Networks & Community Detection:** Capturing multi-way relationships, hypergraph partitioning reveals hidden community structures and provides deeper insights into group dynamics [^10].
+* **Bioinformatics & Computational Biology:** It facilitates the clustering of proteins, genes, and genomic regions to identify functional modules, thereby aiding discovery in biological research [^11].
+* **Machine Learning & Data Mining:** By effectively modeling higher-order interactions, it improves data clustering and feature selection, enhancing analytical outcomes [^12].
+* **Other Applications:** From optimizing database sharding and segmenting GIS regions to modularizing software systems, hypergraph partitioning transforms large-scale challenges into more tractable problems [^1][^7][^4].
+
+In the rapidly evolving field of Decentralized Physical Infrastructure Networks (DePIN), which leverage blockchain technology and distributed nodes to manage physical assets, hypergraph partitioning plays an especially important role. By accurately modeling complex interactions, it can effectively group related tasks and resources across scenarios such as decentralized compute/storage, blockchain data sharding, IoT networks, or supply chain logistics [^16]. This grouping helps minimize cross-node communication and balances workloads, ultimately enhancing the scalability and performance of these decentralized systems [^15].
+
+[^1]: Devine, K.D., Boman, E.G., Heaphy, R.T., Bisseling, R.H., & Catalyurek, U.V. (2006). *Parallel hypergraph partitioning for scientific computing*. Proceedings of the 20th IEEE International Parallel & Distributed Processing Symposium.
+[^2]: Aykanat, C., Cambazoglu, B., & Uçar, B. (2008). *Multi-level direct K-way hypergraph partitioning with multiple constraints and fixed vertices*. Journal of Parallel and Distributed Computing, 68, 609–625.
+[^3]: Trifunovic, A., & Knottenbelt, W. (2008). *Parallel multilevel algorithms for hypergraph partitioning*. Journal of Parallel and Distributed Computing, 68, 563–581.
+[^4]: Gottesbüren, L., & Hamann, M. (2022). *Deterministic Parallel Hypergraph Partitioning*. In Euro-Par 2022: Parallel Processing (pp. 301–316). Springer International Publishing.
+[^5]: Schlag, S., Heuer, T., Gottesbüren, L., Akhremtsev, Y., Schulz, C., & Sanders, P. (2023). *High-Quality Hypergraph Partitioning*. ACM Journal of Experimental Algorithmics, 27(1.9), 39.
+[^6]: Zheng, D., Song, X., Yang, C., LaSalle, D., & Karypis, G. (2022). *Distributed Hybrid CPU and GPU Training for Graph Neural Networks on Billion-Scale Heterogeneous Graphs*. In Proceedings (pp. 4582–4591).
+[^7]: Catalyurek, U., Devine, K., Fonseca Faraj, M., Gottesbüren, L., Heuer, T., Meyerhenke, H., Sanders, P., Schlag, S., Schulz, C., & Seemaier, D. (2022). *More Recent Advances in (Hyper)Graph Partitioning*.
+[^8]: Papa, D., & Markov, I. (2007). *Hypergraph Partitioning and Clustering*. In Handbook of Approximation Algorithms and Metaheuristics.
+[^9]: Karypis, G., Aggarwal, R., Kumar, V., & Shekhar, S. (1999). *Multilevel hypergraph partitioning: applications in VLSI domain*. IEEE Transactions on Very Large Scale Integration (VLSI) Systems, 7(1), 69–79.
+[^10]: Zhang, C., Cheng, W., Li, F., & Wang, X. (2024). *Hypergraph-Based Influence Maximization in Online Social Networks*. Mathematics, 12(17), 2769.
+[^11]: Wang, S., Cui, H., Qu, Y., & Zhang, Y. (2025). *Multi-source biological knowledge-guided hypergraph spatiotemporal subnetwork embedding for protein complex identification*. Briefings in Bioinformatics, 26.
+[^12]: Zhou, D., Huang, J., & Schölkopf, B. (2006). *Learning with Hypergraphs: Clustering, Classification, and Embedding*. In Advances in Neural Information Processing Systems 19, 1601–1608.
+[^13]: Chodrow, P.S., Veldt, N., & Benson, A.R. (2021). *Generative hypergraph clustering: From blockmodels to modularity*. Science Advances, 7.
+[^14]: Kolodziej, S., Mahmoudi Aznaveh, M., Bullock, M., David, J., Davis, T., Henderson, M., Hu, Y., & Sandstrom, R. (2019). *The SuiteSparse Matrix Collection Website Interface*. Journal of Open Source Software, 4, 1244.
+[^15]: Kumar, K.A., et al. (2014). *SWORD: Workload-aware data placement and replica selection for cloud data management systems*. The VLDB Journal, 23, 845–870. doi:10.1007/s00778-014-0362-1.
+[^16]: Qu, C., Tao, M., & Yuan, R. (2018). *A Hypergraph-Based Blockchain Model and Application in Internet of Things-Enabled Smart Homes*. Sensors, 18(9), 2784. doi:10.3390/s18092784.
\ No newline at end of file
diff --git a/tig-breakthroughs/evidence.md b/tig-breakthroughs/evidence.md
index 3218dd9..4111d09 100644
--- a/tig-breakthroughs/evidence.md
+++ b/tig-breakthroughs/evidence.md
@@ -33,7 +33,7 @@ IT IS IMPORTANT THAT THIS SECTION IS COMPLETED IN SUFFICIENT DETAIL TO FULLY DES
 
 PLEASE IDENTIFY WHICH TIG CHALLENGE THE ALGORITHMIC METHOD ADDRESSES.
 
-> YOUR RESPONSE HERE (options are satisfiability, vehicle_routing, knapsack, or vector_search)
+> YOUR RESPONSE HERE (options are satisfiability, vehicle_routing, knapsack, vector_search, or hypergraph)
 
 PLEASE DESCRIBE THE ALGORITHMIC METHOD AND THE PROBLEM THAT IT SOLVES.
 
diff --git a/tig-challenges/README.md b/tig-challenges/README.md
index 79f69b3..46f9af7 100644
--- a/tig-challenges/README.md
+++ b/tig-challenges/README.md
@@ -7,6 +7,7 @@ Presently, TIG features four challenges:
 2. [Capacitated vehicle routing](../docs/challenges/vehicle_routing.md)
 3. [Knapsack problem](../docs/challenges/knapsack.md)
 4. [Vector range search](../docs/challenges/vector_search.md)
+5. [Hypergraph partitioning](../docs/challenges/hypergraph.md)
 
 Over the coming year, an additional seven challenges from domains including artificial intelligence, biology, medicine, and climate science will be phased in.
diff --git a/tig-challenges/src/hypergraph.cu b/tig-challenges/src/hypergraph.cu
new file mode 100644
index 0000000..4911d6e
--- /dev/null
+++ b/tig-challenges/src/hypergraph.cu
@@ -0,0 +1,699 @@
+#include <cuda_runtime.h>
+#include <curand_kernel.h>
+#include <stdint.h>
+
+
+// Roulette-wheel selection: returns a level with probability proportional
+// to its weight.
+__device__ int select_level_based_on_weights(
+    const int num_levels,
+    const float* level_weights,
+    curandState* state
+)
+{
+    float total_weight = 0.0f;
+    for (int idx = 0; idx < num_levels; idx++)
+    {
+        total_weight += level_weights[idx];
+    }
+
+    float random_value = curand_uniform(state) * total_weight;
+    float cumulative = 0.0f;
+
+    for (int idx = 0; idx < num_levels; idx++)
+    {
+        cumulative += level_weights[idx];
+        if (random_value <= cumulative)
+        {
+            return idx;
+        }
+    }
+
+    return num_levels - 1;
+}
+
+// Pick one of the 2^level groups of nodes at this level.
+__device__ void select_group(
+    const int level,
+    const int num_nodes,
+    int *group,
+    int *num_groups,
+    curandState* state
+)
+{
+    *num_groups = 1 << level; // 2^level
+    *group = curand(state) % *num_groups;
+}
+
+// Bounds of a group when [0, num_nodes) is split into num_groups contiguous
+// ranges whose sizes differ by at most one.
+__device__ void get_group_bounds(
+    const int num_nodes,
+    const int num_groups,
+    const int group,
+    int *start_idx,
+    int *end_idx
+)
+{
+    int s = num_nodes / num_groups;
+    int r = num_nodes % num_groups;
+    if (group < r)
+    {
+        *start_idx = (s + 1) * group;
+        *end_idx = (s + 1) * (group + 1);
+    }
+    else
+    {
+        *start_idx = (s + 1) * r + s * (group - r);
+        *end_idx = (s + 1) * r + s * (group + 1 - r);
+    }
+}
+
+// Inverse-CDF sampling of hyperedge sizes from a power law truncated to
+// [min_hyperedge_size, max_hyperedge_size].
+extern "C" __global__ void generate_hyperedge_sizes(
+    const uint8_t *seed,
+    const int num_hyperedges,
+    const uint min_hyperedge_size,
+    const uint max_hyperedge_size,
+    const float alpha,
+    int *hyperedge_sizes
+)
+{
+    for (int hyperedge_idx = threadIdx.x + blockIdx.x * blockDim.x; hyperedge_idx < num_hyperedges; hyperedge_idx += blockDim.x * gridDim.x)
+    {
+        curandState state;
+        curand_init(((uint64_t *)(seed))[0], hyperedge_idx, 0, &state);
+
+        float y = curand_uniform(&state);
+
+        float c1 = powf((float)min_hyperedge_size, alpha);
+        float c2 = powf((float)max_hyperedge_size, alpha) - c1;
+        float x = powf(c2 * y + c1, 1.0f / alpha);
+
+        uint sample = (uint)floorf(x);
+        if (sample < min_hyperedge_size)
+        {
+            hyperedge_sizes[hyperedge_idx] = min_hyperedge_size;
+        }
+        else if (sample > max_hyperedge_size)
+        {
+            hyperedge_sizes[hyperedge_idx] = max_hyperedge_size;
+        }
+        else
+        {
+            hyperedge_sizes[hyperedge_idx] = sample;
+        }
+    }
+}
+
+// Inverse-CDF sampling of node weights from a power law truncated to
+// [min_node_weight, max_node_weight].
+extern "C" __global__ void generate_node_weights(
+    const uint8_t *seed,
+    const int num_nodes,
+    const float min_node_weight,
+    const float max_node_weight,
+    const float alpha,
+    float *node_weights
+)
+{
+    for (int node_idx = threadIdx.x + blockIdx.x * blockDim.x; node_idx < num_nodes; node_idx += blockDim.x * gridDim.x)
+    {
+        curandState state;
+        curand_init(((uint64_t *)(seed))[1], node_idx, 0, &state);
+
+        float y = curand_uniform(&state);
+
+        float c1 = powf(min_node_weight, alpha);
+        float c2 = powf(max_node_weight, alpha) - c1;
+        float x = powf(c2 * y + c1, 1.0f / alpha);
+
+        float sample = floorf(x);
+        if (sample < min_node_weight)
+        {
+            node_weights[node_idx] = min_node_weight;
+        }
+        else if (sample > max_node_weight)
+        {
+            node_weights[node_idx] = max_node_weight;
+        }
+        else
+        {
+            node_weights[node_idx] = sample;
+        }
+    }
+}
+
+// Single-threaded prefix sum turning hyperedge sizes into CSR offsets.
+extern "C" __global__ void finalize_hyperedge_sizes(
+    const int num_hyperedges,
+    const int *hyperedge_sizes,
+    int *hyperedge_offsets,
+    uint *total_connections
+)
+{
+    hyperedge_offsets[0] = 0;
+    for (int idx = 0; idx < num_hyperedges; idx++)
+    {
+        hyperedge_offsets[idx+1] = hyperedge_offsets[idx] + hyperedge_sizes[idx];
+    }
+    *total_connections = hyperedge_offsets[num_hyperedges];
+}
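+
+// The hyperedge construction below uses weighted reservoir sampling (the
+// A-Res scheme of Efraimidis & Spirakis): each candidate node draws the key
+// u^(1/w) for uniform u and node weight w, and the hyperedge keeps the nodes
+// with the largest keys. TrackedNode pairs a node with its key; the min-heap
+// evicts the smallest key whenever a better candidate appears.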
+
+typedef struct
+{
+    int node_idx;
+    float key;
+} TrackedNode;
+
+__device__ void swap_nodes(TrackedNode *a, TrackedNode *b)
+{
+    TrackedNode temp = *a;
+    *a = *b;
+    *b = temp;
+}
+
+__device__ void swap_nodes(int *a, int *b)
+{
+    int temp = *a;
+    *a = *b;
+    *b = temp;
+}
+
+// Min-heap by key: used to maintain the current reservoir sample.
+__device__ void heapify(TrackedNode *arr, const int idx, const int size)
+{
+    int smallest = idx;
+    int left = 2 * idx + 1;
+    int right = 2 * idx + 2;
+
+    if (left < size && arr[left].key < arr[smallest].key)
+    {
+        smallest = left;
+    }
+
+    if (right < size && arr[right].key < arr[smallest].key)
+    {
+        smallest = right;
+    }
+
+    if (smallest != idx)
+    {
+        swap_nodes(&arr[idx], &arr[smallest]);
+        heapify(arr, smallest, size);
+    }
+}
+
+// Max-heap by value: used to heapsort each hyperedge's node list.
+__device__ void heapify(int *arr, const int idx, const int size)
+{
+    int largest = idx;
+    int left = 2 * idx + 1;
+    int right = 2 * idx + 2;
+
+    if (left < size && arr[left] > arr[largest])
+    {
+        largest = left;
+    }
+
+    if (right < size && arr[right] > arr[largest])
+    {
+        largest = right;
+    }
+
+    if (largest != idx)
+    {
+        swap_nodes(&arr[idx], &arr[largest]);
+        heapify(arr, largest, size);
+    }
+}
+
+__device__ void build_min_heap(TrackedNode *arr, const int size)
+{
+    for (int idx = size / 2 - 1; idx >= 0; idx--)
+    {
+        heapify(arr, idx, size);
+    }
+}
+
+__device__ int binary_search(
+    const int *arr,
+    const int size,
+    const int target
+)
+{
+    int left = 0;
+    int right = size - 1;
+
+    while (left <= right) {
+        int mid = left + (right - left) / 2;
+        if (arr[mid] == target) {
+            return mid;
+        } else if (arr[mid] < target) {
+            left = mid + 1;
+        } else {
+            right = mid - 1;
+        }
+    }
+
+    return -1;
+}
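+
+// generate_hyperedges gives the hypergraph a community structure: each
+// hyperedge first picks a level (biased by level_weights), then a random
+// contiguous group of nodes at that level (group size halves at each level),
+// and finally samples its members from that group, weighted by node_weights.
+// Low levels span most of the node range; high levels produce tight clusters.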
+
+extern "C" __global__ void generate_hyperedges(
+    const uint8_t *seed,
+    const int num_nodes,
+    const int num_hyperedges,
+    const uint total_connections,
+    const int *hyperedge_sizes,
+    const int *hyperedge_offsets,
+    const float *node_weights,
+    const float *level_weights,
+    int *hyperedge_nodes,
+    int *node_degrees
+)
+{
+    for (int hyperedge_idx = threadIdx.x + blockIdx.x * blockDim.x; hyperedge_idx < num_hyperedges; hyperedge_idx += blockDim.x * gridDim.x)
+    {
+        curandState state;
+        curand_init(((uint64_t *)(seed))[2], hyperedge_idx, 0, &state);
+
+        int hyperedge_size = hyperedge_sizes[hyperedge_idx];
+        int hyperedge_offset = hyperedge_offsets[hyperedge_idx];
+
+        int num_levels = (int)log2f((float)num_nodes / hyperedge_size) + 1;
+        int level = select_level_based_on_weights(num_levels, level_weights, &state);
+
+        int group, num_groups;
+        select_group(level, num_nodes, &group, &num_groups, &state);
+
+        int start_idx, end_idx;
+        get_group_bounds(num_nodes, num_groups, group, &start_idx, &end_idx);
+
+        TrackedNode reservoir[2000]; // max_hyperedge_size is capped at 1954
+        int group_size = end_idx - start_idx;
+
+        if (hyperedge_size < 16)
+        {
+            // Small reservoir: a linear scan for the minimum key is cheaper
+            // than maintaining a heap.
+            for (int idx = 0; idx < group_size; idx++)
+            {
+                int node_idx = start_idx + idx;
+                float weight = node_weights[node_idx];
+                float key = powf(curand_uniform(&state), 1.0f / weight);
+
+                if (idx < hyperedge_size)
+                {
+                    reservoir[idx].node_idx = node_idx;
+                    reservoir[idx].key = key;
+                }
+                else
+                {
+                    int min_idx = 0;
+                    float min_key = reservoir[0].key;
+
+                    for (int j = 1; j < hyperedge_size; j++)
+                    {
+                        if (reservoir[j].key < min_key)
+                        {
+                            min_key = reservoir[j].key;
+                            min_idx = j;
+                        }
+                    }
+
+                    if (key > min_key)
+                    {
+                        reservoir[min_idx].node_idx = node_idx;
+                        reservoir[min_idx].key = key;
+                    }
+                }
+            }
+        }
+        else
+        {
+            // Large reservoir: keep the smallest key at the heap root.
+            for (int idx = 0; idx < hyperedge_size; idx++)
+            {
+                int node_idx = start_idx + idx;
+                float weight = node_weights[node_idx];
+                float key = powf(curand_uniform(&state), 1.0f / weight);
+
+                reservoir[idx].node_idx = node_idx;
+                reservoir[idx].key = key;
+            }
+
+            build_min_heap(reservoir, hyperedge_size);
+            for (int idx = hyperedge_size; idx < group_size; idx++)
+            {
+                int node_idx = start_idx + idx;
+                float weight = node_weights[node_idx];
+                float key = powf(curand_uniform(&state), 1.0f / weight);
+
+                if (key > reservoir[0].key)
+                {
+                    reservoir[0].node_idx = node_idx;
+                    reservoir[0].key = key;
+
+                    heapify(reservoir, 0, hyperedge_size);
+                }
+            }
+        }
+
+        // Record the sample, then heapsort this hyperedge's node list so
+        // binary_search works on it later
+        for (int idx = 0; idx < hyperedge_size; idx++)
+        {
+            int node_idx = reservoir[idx].node_idx;
+            hyperedge_nodes[hyperedge_offset + idx] = node_idx;
+            atomicAdd(&node_degrees[node_idx], 1);
+        }
+        for (int i = hyperedge_size / 2 - 1; i >= 0; i--)
+        {
+            heapify(&hyperedge_nodes[hyperedge_offset], i, hyperedge_size);
+        }
+
+        for (int i = hyperedge_size - 1; i > 0; i--)
+        {
+            swap_nodes(&hyperedge_nodes[hyperedge_offset], &hyperedge_nodes[hyperedge_offset + i]);
+            heapify(&hyperedge_nodes[hyperedge_offset], 0, i);
+        }
+    }
+}
+
+
+extern "C" __global__ void finalize_hyperedges(
+    const int num_nodes,
+    const int num_hyperedges,
+    const int *hyperedge_sizes,
+    const int *hyperedge_offsets,
+    const int *hyperedge_nodes,
+    const int *node_degrees,
+    int *node_hyperedges,
+    int *node_offsets
+)
+{
+    // Compute prefix sum for offsets (every block computes the same values;
+    // only its thread 0 writes them)
+    int running_sum = 0;
+    for (int v = 0; v < num_nodes; v++)
+    {
+        if (threadIdx.x == 0) {
+            node_offsets[v] = running_sum;
+        }
+        running_sum += node_degrees[v];
+    }
+    if (threadIdx.x == 0) {
+        node_offsets[num_nodes] = running_sum;
+    }
+    __syncthreads();
+
+    // Grid-stride loop over nodes
+    for (int node_idx = threadIdx.x + blockIdx.x * blockDim.x; node_idx < num_nodes; node_idx += blockDim.x * gridDim.x)
+    {
+        int offset = 0;
+        for (int hyperedge_idx = 0; hyperedge_idx < num_hyperedges; hyperedge_idx++)
+        {
+            int start = hyperedge_offsets[hyperedge_idx];
+            if (binary_search(&hyperedge_nodes[start], hyperedge_sizes[hyperedge_idx], node_idx) != -1)
+            {
+                int insert_pos = node_offsets[node_idx] + offset;
+                node_hyperedges[insert_pos] = hyperedge_idx;
+                offset++;
+            }
+        }
+    }
+}
+
+extern "C" __global__ void initialize_partitioning(
+    const int num_nodes,
+    const int *node_degrees,
+    int *partition,
+    int *sorted_nodes
+)
+{
+    if (blockIdx.x == 0 && threadIdx.x == 0) {
+        // Use level 1 as initial partition: parts 1 and 2 are the two
+        // children of the root of the bipartition tree
+        int start, end;
+        get_group_bounds(num_nodes, 2, 1, &start, &end);
+        for (int idx = 0; idx < num_nodes; idx++)
+        {
+            if (node_degrees[idx] == 0) {
+                partition[idx] = -1;
+            } else {
+                partition[idx] = idx < start ? 
1 : 2; + } + } + } + + // compare each vertex with all others to find its sort idx + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < num_nodes; i += blockDim.x * gridDim.x) { + int pos = 0; + for (int j = 0; j < num_nodes; j++) { + if (i == j) { + continue; + } + if ( + node_degrees[i] < node_degrees[j] || + (node_degrees[i] == node_degrees[j] && i > j) + ) { + pos++; + } + } + sorted_nodes[pos] = i; + } +} + +extern "C" __global__ void greedy_bipartition( + const int level, + const int num_nodes, + const int num_hyperedges, + const int *node_hyperedges, + const int *node_offsets, + const int *sorted_nodes, + const int *node_degrees, + int *partition, + unsigned long long *left_hyperedge_flags, + unsigned long long *right_hyperedge_flags +) { + int p = (1 << level) + blockIdx.x - 1; + + __shared__ int count; + if (threadIdx.x == 0) { + count = 0; + } + __syncthreads(); + for (int v = threadIdx.x; v < num_nodes; v += blockDim.x) { + if (partition[v] == p) { + atomicAdd(&count, 1); + } + } + __syncthreads(); + + if (count > 0) { + int size_left = count / 2; + int size_right = count - size_left; + + __shared__ int left_count; + __shared__ int right_count; + __shared__ int connections_left; + __shared__ int connections_right; + if (threadIdx.x == 0) { + left_count = 0; + right_count = 0; + } + __syncthreads(); + + int num_flags = (num_hyperedges + 63) / 64; + unsigned long long *left_flags = left_hyperedge_flags + blockIdx.x * num_flags; + unsigned long long *right_flags = right_hyperedge_flags + blockIdx.x * num_flags; + + for (int idx = 0; idx < num_nodes; idx++) { + int v = sorted_nodes[idx]; + if (partition[v] != p) continue; + + // Get range of hyperedges for this node + int start_pos = node_offsets[v]; + int end_pos = node_offsets[v+1]; + + int left_child = p * 2 + 1; + int right_child = p * 2 + 2; + + bool assign_left; + if (left_count >= size_left) { + assign_left = false; + } else if (right_count >= size_right) { + assign_left = true; + } else { + // Loop through this node's hyperedges + if (threadIdx.x == 0) { + connections_left = 0; + connections_right = 0; + } + __syncthreads(); + + for (int pos = start_pos + threadIdx.x; pos < end_pos; pos += blockDim.x) { + int hyperedge_idx = node_hyperedges[pos]; + if (left_flags[hyperedge_idx / 64] & (1ULL << (hyperedge_idx % 64))) atomicAdd(&connections_left, 1); + if (right_flags[hyperedge_idx / 64] & (1ULL << (hyperedge_idx % 64))) atomicAdd(&connections_right, 1); + } + __syncthreads(); + if (connections_left == connections_right) { + assign_left = left_count < right_count; + } else { + assign_left = connections_left > connections_right; + } + } + + if (threadIdx.x == 0) { + if (assign_left) { + partition[v] = left_child; + atomicAdd(&left_count, 1); + } else { + partition[v] = right_child; + atomicAdd(&right_count, 1); + } + } + unsigned long long *hyperedge_flags = assign_left ? 
left_flags : right_flags;
+            for (int e = start_pos + threadIdx.x; e < end_pos; e += blockDim.x) {
+                int hyperedge_idx = node_hyperedges[e];
+                atomicOr(&hyperedge_flags[hyperedge_idx / 64], 1ULL << (hyperedge_idx % 64));
+            }
+
+            __syncthreads();
+        }
+    }
+}
+
+// Relabel the leaf parts (num_parts-1 .. 2*num_parts-2 in the tree
+// numbering) to 0 .. num_parts-1.
+extern "C" __global__ void finalize_bipartition(
+    const int num_nodes,
+    const int num_parts,
+    int *partition
+) {
+    for (int v = threadIdx.x; v < num_nodes; v += blockDim.x) {
+        if (partition[v] != -1) {
+            partition[v] -= (num_parts - 1);
+        }
+    }
+}
+
+extern "C" __global__ void shuffle_nodes(
+    const uint8_t *seed,
+    const int num_nodes,
+    const int *partition,
+    const int *hyperedge_sizes,
+    const int *hyperedge_offsets,
+    const int *hyperedge_nodes,
+    const int *node_degrees,
+    const int *node_hyperedges,
+    const int *node_offsets,
+    const float *node_weights,
+    const int *sorted_nodes,
+    float *rand_weights,
+    int *shuffled_partition,
+    int *shuffled_hyperedge_nodes,
+    float *shuffled_node_weights,
+    int *shuffled_node_degrees,
+    uint *num_prune
+) {
+    // Every thread regenerates the identical sequence, so all copies of
+    // rand_weights agree without cross-block synchronization.
+    curandState state;
+    curand_init(((uint64_t *)(seed))[3], 0, 0, &state);
+    for (int idx = 0; idx < num_nodes; idx++) {
+        rand_weights[idx] = curand_uniform(&state);
+    }
+
+    if (blockIdx.x == 0 && threadIdx.x == 0) {
+        // Zero-degree nodes sit at the end of the degree-sorted order;
+        // count them so they can be pruned.
+        while (node_degrees[sorted_nodes[num_nodes - *num_prune - 1]] == 0) {
+            (*num_prune)++;
+        }
+    }
+
+    for (int node_idx = threadIdx.x + blockIdx.x * blockDim.x; node_idx < num_nodes; node_idx += blockDim.x * gridDim.x) {
+        if (node_degrees[node_idx] == 0) {
+            continue;
+        }
+
+        // Rank this node among the surviving nodes by its random weight;
+        // the rank becomes its new id.
+        int pos = 0;
+        for (int j = 0; j < num_nodes; j++) {
+            if (node_idx == j || node_degrees[j] == 0) {
+                continue;
+            }
+            if (
+                (rand_weights[node_idx] > rand_weights[j]) ||
+                (rand_weights[node_idx] == rand_weights[j] && node_idx > j)
+            ) {
+                pos++;
+            }
+        }
+        shuffled_node_weights[pos] = node_weights[node_idx];
+        shuffled_partition[pos] = partition[node_idx];
+        shuffled_node_degrees[pos] = node_degrees[node_idx];
+        for (int i = 0; i < node_degrees[node_idx]; i++) {
+            int hyperedge_idx = node_hyperedges[node_offsets[node_idx] + i];
+            int offset = hyperedge_offsets[hyperedge_idx];
+            int pos2 = binary_search(&hyperedge_nodes[offset], hyperedge_sizes[hyperedge_idx], node_idx);
+            shuffled_hyperedge_nodes[offset + pos2] = pos;
+        }
+    }
+}
+
+// Re-sort each hyperedge's (now relabelled) node list.
+extern "C" __global__ void finalize_shuffle(
+    const uint8_t *seed,
+    const int num_hyperedges,
+    const int *hyperedge_sizes,
+    const int *hyperedge_offsets,
+    int *shuffled_hyperedge_nodes
+) {
+    for (int hyperedge_idx = threadIdx.x + blockIdx.x * blockDim.x; hyperedge_idx < num_hyperedges; hyperedge_idx += blockDim.x * gridDim.x) {
+        int hyperedge_size = hyperedge_sizes[hyperedge_idx];
+        int hyperedge_offset = hyperedge_offsets[hyperedge_idx];
+
+        for (int i = hyperedge_size / 2 - 1; i >= 0; i--)
+        {
+            heapify(&shuffled_hyperedge_nodes[hyperedge_offset], i, hyperedge_size);
+        }
+
+        for (int i = hyperedge_size - 1; i > 0; i--)
+        {
+            swap_nodes(&shuffled_hyperedge_nodes[hyperedge_offset], &shuffled_hyperedge_nodes[hyperedge_offset + i]);
+            heapify(&shuffled_hyperedge_nodes[hyperedge_offset], 0, i);
+        }
+    }
+}
+
+extern "C" __global__ void validate_partition(
+    const int num_nodes,
+    const int num_parts,
+    const int *partition,
+    unsigned int *errorflag
+) {
+    for (int node_idx = threadIdx.x; node_idx < num_nodes; node_idx += blockDim.x) {
+        int part = partition[node_idx];
+
+        // Validate partition (redundant but keeping for safety)
+        if (part < 0 || part >= num_parts) {
+            atomicOr(errorflag, 1u);
+            return;
+        }
+    }
+}
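+
+// With num_parts fixed at 64, the set of parts a hyperedge touches fits in a
+// single 64-bit mask: set bit `part` for each member node, then
+// __popcll(mask) - 1 is that hyperedge's contribution to the metric.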
+extern "C" __global__ void calc_connectivity_metric(
+    const int num_hyperedges,
+    const int *hyperedge_offsets,
+    const int *hyperedge_nodes,
+    const int *partition,
+    uint *connectivity_metric
+) {
+    for (int hyperedge_idx = threadIdx.x + blockIdx.x * blockDim.x; hyperedge_idx < num_hyperedges; hyperedge_idx += blockDim.x * gridDim.x) {
+        int start = hyperedge_offsets[hyperedge_idx];
+        int end = hyperedge_offsets[hyperedge_idx + 1];
+
+        // Count unique parts for this hyperedge
+        uint64_t hyperedge_part_flags = 0;
+        for (int pos = start; pos < end; pos++) {
+            int node = hyperedge_nodes[pos];
+            int part = partition[node];
+
+            hyperedge_part_flags |= (1ULL << part);
+        }
+
+        // Add to connectivity sum
+        int connectivity = __popcll(hyperedge_part_flags);
+        atomicAdd(connectivity_metric, connectivity - 1);
+    }
+}
+
+extern "C" __global__ void count_nodes_per_part(
+    const int num_nodes,
+    const int num_parts,
+    const int *partition,
+    int *nodes_per_part
+) {
+    for (int node_idx = threadIdx.x + blockIdx.x * blockDim.x; node_idx < num_nodes; node_idx += blockDim.x * gridDim.x) {
+        int part = partition[node_idx];
+        atomicAdd(&nodes_per_part[part], 1);
+    }
+}
diff --git a/tig-challenges/src/hypergraph.rs b/tig-challenges/src/hypergraph.rs
new file mode 100644
index 0000000..b5423cd
--- /dev/null
+++ b/tig-challenges/src/hypergraph.rs
@@ -0,0 +1,514 @@
+use anyhow::{anyhow, Result};
+use cudarc::driver::*;
+use cudarc::runtime::sys::cudaDeviceProp;
+use rand::{rngs::StdRng, Rng, SeedableRng};
+use serde::{Deserialize, Serialize};
+use serde_json::{from_value, Map, Value};
+use std::sync::Arc;
+
+#[derive(Serialize, Deserialize, Debug, Clone)]
+pub struct Difficulty {
+    pub num_hyperedges: u32,
+    pub better_than_baseline: u32,
+}
+
+#[derive(Serialize, Deserialize, Debug, Clone)]
+pub struct Solution {
+    pub sub_solutions: Vec<SubSolution>,
+}
+
+#[derive(Serialize, Deserialize, Debug, Clone)]
+pub struct SubSolution {
+    // partition[node] = id of the part the node is assigned to
+    pub partition: Vec<i32>,
+}
+
+impl TryFrom<Map<String, Value>> for Solution {
+    type Error = serde_json::Error;
+
+    fn try_from(v: Map<String, Value>) -> Result<Self, Self::Error> {
+        from_value(Value::Object(v))
+    }
+}
+
+pub struct Challenge {
+    pub seed: [u8; 32],
+    pub difficulty: Difficulty,
+    pub sub_instances: Vec<SubInstance>,
+}
+
+pub struct SubInstance {
+    pub seed: [u8; 32],
+    pub difficulty: Difficulty,
+    pub num_nodes: u32,
+    pub num_parts: u32,
+    pub max_partition_size: u32,
+    pub d_hyperedge_sizes: CudaSlice<i32>,
+    // start = hyperedge_offsets[i], end = hyperedge_offsets[i + 1]
+    // nodes_in_hyperedge_i = hyperedge_nodes[start..end]
+    pub d_hyperedge_offsets: CudaSlice<i32>,
+    pub d_hyperedge_nodes: CudaSlice<i32>,
+    // start = node_offsets[j], end = node_offsets[j + 1]
+    // hyperedges_with_node_j = node_hyperedges[start..end]
+    pub d_node_degrees: CudaSlice<i32>,
+    pub d_node_offsets: CudaSlice<i32>,
+    pub d_node_hyperedges: CudaSlice<i32>,
+    pub baseline_connectivity_metric: u32,
+}
+
+pub const NUM_SUB_INSTANCES: usize = 16;
+
+impl Challenge {
+    pub fn generate_instance(
+        seed: &[u8; 32],
+        difficulty: &Difficulty,
+        module: Arc<CudaModule>,
+        stream: Arc<CudaStream>,
+        prop: &cudaDeviceProp,
+    ) -> Result<Self> {
+        let mut rng = StdRng::from_seed(seed.clone());
+        let mut sub_instances = Vec::new();
+        for _ in 0..NUM_SUB_INSTANCES {
+            sub_instances.push(SubInstance::generate_instance(
+                &rng.gen(),
+                difficulty,
+                module.clone(),
+                stream.clone(),
+                prop,
+            )?);
+        }
+
+        Ok(Challenge {
+            seed: seed.clone(),
+            difficulty: difficulty.clone(),
+            sub_instances,
+        })
+    }
+
+    pub fn verify_solution(
+        &self,
+        solution: &Solution,
+        module: Arc<CudaModule>,
+        stream: Arc<CudaStream>,
+        prop: &cudaDeviceProp,
+    ) -> Result<()> {
+        if solution.sub_solutions.len() != self.sub_instances.len() {
+            return Err(anyhow!(
+                "Invalid number of sub-solutions. Expected: {}, Actual: {}",
+                self.sub_instances.len(),
+                solution.sub_solutions.len()
+            ));
+        }
+        let mut better_than_baselines = Vec::new();
+        for (i, (sub_instance, sub_solution)) in self
+            .sub_instances
+            .iter()
+            .zip(&solution.sub_solutions)
+            .enumerate()
+        {
+            match sub_instance.verify_solution(sub_solution, module.clone(), stream.clone(), prop) {
+                // better_than_baseline = 1 - metric / baseline
+                // (see docs/challenges/hypergraph.md)
+                Ok(connectivity_metric) => better_than_baselines.push(
+                    1.0 - connectivity_metric as f64
+                        / sub_instance.baseline_connectivity_metric as f64,
+                ),
+                Err(e) => return Err(anyhow!("Instance {}: {}", i, e.to_string())),
+            }
+        }
+        // Overall score: root mean square of the per-instance improvements.
+        let average = (better_than_baselines.iter().map(|x| x * x).sum::<f64>()
+            / better_than_baselines.len() as f64)
+            .sqrt();
+        let threshold = self.difficulty.better_than_baseline as f64 / 1000.0;
+        if average >= threshold {
+            Ok(())
+        } else {
+            Err(anyhow!(
+                "Average better_than_baseline ({}) is less than threshold ({})",
+                average,
+                threshold
+            ))
+        }
+    }
+}
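+
+// Instance generation proceeds in the numbered stages below: (1.x) sample a
+// random hypergraph (power-law node weights and hyperedge sizes, with
+// locality-biased membership), (2.x) build the baseline 64-way partition by
+// recursive greedy bipartition, (3.x) shuffle node ids, and (4.x) record the
+// baseline connectivity metric that solutions are scored against.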
+impl SubInstance {
+    pub fn generate_instance(
+        seed: &[u8; 32],
+        difficulty: &Difficulty,
+        module: Arc<CudaModule>,
+        stream: Arc<CudaStream>,
+        prop: &cudaDeviceProp,
+    ) -> Result<Self> {
+        let num_hyperedges = difficulty.num_hyperedges;
+        let target_num_nodes = difficulty.num_hyperedges; // actual number may be around 8% less
+        let depth = 6;
+        let num_parts = 1 << depth; // 2^6 = 64 parts
+        let level_weights: Vec<f32> = vec![
+            42.0, 31023.0, 51220.0, 39820.0, 1360.0, 500.0, 208.0, 11.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+            0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
+        ];
+
+        // Get kernels
+        let generate_hyperedge_sizes_kernel = module.load_function("generate_hyperedge_sizes")?;
+        let finalize_hyperedge_sizes_kernel = module.load_function("finalize_hyperedge_sizes")?;
+        let generate_node_weights_kernel = module.load_function("generate_node_weights")?;
+        let generate_hyperedges_kernel = module.load_function("generate_hyperedges")?;
+        let finalize_hyperedges_kernel = module.load_function("finalize_hyperedges")?;
+        let initialize_partitioning_kernel = module.load_function("initialize_partitioning")?;
+        let greedy_bipartition_kernel = module.load_function("greedy_bipartition")?;
+        let finalize_bipartition_kernel = module.load_function("finalize_bipartition")?;
+        let shuffle_nodes_kernel = module.load_function("shuffle_nodes")?;
+        let finalize_shuffle_kernel = module.load_function("finalize_shuffle")?;
+        let calc_connectivity_metric_kernel = module.load_function("calc_connectivity_metric")?;
+
+        let block_size = prop.maxThreadsPerBlock as u32;
+        let cfg = LaunchConfig {
+            grid_dim: ((num_hyperedges + block_size - 1) / block_size, 1, 1),
+            block_dim: (block_size, 1, 1),
+            shared_mem_bytes: 0,
+        };
+        let one_block_cfg = LaunchConfig {
+            grid_dim: (1, 1, 1),
+            block_dim: (block_size, 1, 1),
+            shared_mem_bytes: 0,
+        };
+        let one_thread_cfg = LaunchConfig {
+            grid_dim: (1, 1, 1),
+            block_dim: (1, 1, 1),
+            shared_mem_bytes: 0,
+        };
+
+        let d_seed = stream.memcpy_stod(seed)?;
+        let mut d_hyperedge_sizes = stream.alloc_zeros::<i32>(num_hyperedges as usize)?;
+        let mut d_hyperedge_offsets = stream.alloc_zeros::<i32>((num_hyperedges + 1) as usize)?;
+        let mut d_node_weights = stream.alloc_zeros::<f32>(target_num_nodes as usize)?;
+        let mut d_total_connections = stream.alloc_zeros::<u32>(1)?;
+
+        // 1.1 Generate node weights
+        let min_node_weight: f32 = 1.0;
+        let max_node_weight: f32 = 4966.0;
+        let alpha: f32 = 1.0 - 2.2864;
+        unsafe {
+            stream
+                .launch_builder(&generate_node_weights_kernel)
+                .arg(&d_seed)
+                .arg(&target_num_nodes)
+                .arg(&min_node_weight)
+                .arg(&max_node_weight)
+                .arg(&alpha)
+                .arg(&mut d_node_weights)
+                .launch(cfg.clone())?;
+        }
+        stream.synchronize()?;
+
+        // 1.2 Generate hyperedge sizes
+        let min_hyperedge_size: u32 = 2;
+        let max_hyperedge_size: u32 = 1954.min(target_num_nodes);
+        let alpha: f32 = 1.0 - 2.5608;
+        unsafe {
+            stream
+                .launch_builder(&generate_hyperedge_sizes_kernel)
+                .arg(&d_seed)
+                .arg(&num_hyperedges)
+                .arg(&min_hyperedge_size)
+                .arg(&max_hyperedge_size)
+                .arg(&alpha)
+                .arg(&mut d_hyperedge_sizes)
+                .launch(cfg.clone())?;
+        }
+        stream.synchronize()?;
+
+        // 1.3 Finalize hyperedge sizes
+        unsafe {
+            stream
+                .launch_builder(&finalize_hyperedge_sizes_kernel)
+                .arg(&num_hyperedges)
+                .arg(&d_hyperedge_sizes)
+                .arg(&mut d_hyperedge_offsets)
+                .arg(&mut d_total_connections)
+                .launch(one_thread_cfg.clone())?;
+        }
+        stream.synchronize()?;
+
+        // Get hyperedge offsets for hyperedge_nodes allocation
+        let total_connections = stream.memcpy_dtov(&d_total_connections)?[0];
+
+        // 1.4 Generate hyperedges
+        let d_level_weights = stream.memcpy_stod(&level_weights)?;
+        let mut d_hyperedge_nodes = stream.alloc_zeros::<i32>(total_connections as usize)?;
+        let mut d_node_degrees = stream.alloc_zeros::<i32>(target_num_nodes as usize)?;
+        let mut d_node_hyperedges = stream.alloc_zeros::<i32>(total_connections as usize)?;
+        let mut d_node_offsets = stream.alloc_zeros::<i32>(target_num_nodes as usize + 1)?;
+        unsafe {
+            stream
+                .launch_builder(&generate_hyperedges_kernel)
+                .arg(&d_seed)
+                .arg(&target_num_nodes)
+                .arg(&num_hyperedges)
+                .arg(&total_connections)
+                .arg(&d_hyperedge_sizes)
+                .arg(&d_hyperedge_offsets)
+                .arg(&d_node_weights)
+                .arg(&d_level_weights)
+                .arg(&mut d_hyperedge_nodes)
+                .arg(&mut d_node_degrees)
+                .launch(cfg.clone())?;
+        }
+        stream.synchronize()?;
+
+        // 1.5 Finalize hyperedges
+        unsafe {
+            stream
+                .launch_builder(&finalize_hyperedges_kernel)
+                .arg(&target_num_nodes)
+                .arg(&num_hyperedges)
+                .arg(&d_hyperedge_sizes)
+                .arg(&d_hyperedge_offsets)
+                .arg(&d_hyperedge_nodes)
+                .arg(&d_node_degrees)
+                .arg(&mut d_node_hyperedges)
+                .arg(&mut d_node_offsets)
+                .launch(cfg.clone())?;
+        }
+        stream.synchronize()?;
+
+        // 2.1 Initialize partitioning
+        let mut d_partition = stream.alloc_zeros::<i32>(target_num_nodes as usize)?;
+        let mut d_sorted_nodes = stream.alloc_zeros::<i32>(target_num_nodes as usize)?;
+
+        unsafe {
+            stream
+                .launch_builder(&initialize_partitioning_kernel)
+                .arg(&target_num_nodes)
+                .arg(&d_node_degrees)
+                .arg(&mut d_partition)
+                .arg(&mut d_sorted_nodes)
+                .launch(cfg.clone())?;
+        }
+        stream.synchronize()?;
+
+        // 2.2 Greedy bipartitioning
+        for level in 1..depth {
+            let num_parts_this_level = 1u32 << level;
+            let num_flags = (num_hyperedges + 63) / 64 * num_parts_this_level;
+            let mut d_left_hyperedge_flags = stream.alloc_zeros::<u64>(num_flags as usize)?;
+            let mut d_right_hyperedge_flags = stream.alloc_zeros::<u64>(num_flags as usize)?;
+
+            unsafe {
+                stream
+                    .launch_builder(&greedy_bipartition_kernel)
+                    .arg(&level)
+                    .arg(&target_num_nodes)
+                    .arg(&num_hyperedges)
+                    .arg(&d_node_hyperedges)
+                    .arg(&d_node_offsets)
+                    .arg(&d_sorted_nodes)
+                    .arg(&d_node_degrees)
+                    .arg(&mut d_partition)
+                    .arg(&mut d_left_hyperedge_flags)
+                    .arg(&mut d_right_hyperedge_flags)
+                    .launch(LaunchConfig {
+                        grid_dim: (num_parts_this_level, 1, 1),
+                        block_dim: (block_size, 1, 1),
+                        shared_mem_bytes: 400,
+                    })?;
+            }
+            stream.synchronize()?;
+        }
+
+        // 2.3 Finalize bipartitioning
+        unsafe {
+            stream
+                .launch_builder(&finalize_bipartition_kernel)
+                .arg(&target_num_nodes)
+                .arg(&num_parts)
+                .arg(&mut d_partition)
+                .launch(one_block_cfg.clone())?;
+        }
+        stream.synchronize()?;
+
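+        // Stages 3.x relabel the surviving nodes in random order and prune
+        // zero-degree nodes, so the generator's contiguous-group layout (and
+        // with it the baseline partition) is not recoverable from node ids.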
+        // 3.1 Shuffle nodes
+        let mut d_rand_weights = stream.alloc_zeros::<f32>(target_num_nodes as usize)?;
+        let mut d_shuffled_partition = stream.alloc_zeros::<i32>(target_num_nodes as usize)?;
+        let mut d_shuffled_hyperedge_nodes =
+            stream.alloc_zeros::<i32>(total_connections as usize)?;
+        let mut d_shuffled_node_weights = stream.alloc_zeros::<f32>(target_num_nodes as usize)?;
+        let mut d_shuffled_node_degrees = stream.alloc_zeros::<i32>(target_num_nodes as usize)?;
+        let mut d_num_prune = stream.alloc_zeros::<u32>(1)?;
+        unsafe {
+            stream
+                .launch_builder(&shuffle_nodes_kernel)
+                .arg(&d_seed)
+                .arg(&target_num_nodes)
+                .arg(&d_partition)
+                .arg(&d_hyperedge_sizes)
+                .arg(&d_hyperedge_offsets)
+                .arg(&d_hyperedge_nodes)
+                .arg(&d_node_degrees)
+                .arg(&d_node_hyperedges)
+                .arg(&d_node_offsets)
+                .arg(&d_node_weights)
+                .arg(&d_sorted_nodes)
+                .arg(&mut d_rand_weights)
+                .arg(&mut d_shuffled_partition)
+                .arg(&mut d_shuffled_hyperedge_nodes)
+                .arg(&mut d_shuffled_node_weights)
+                .arg(&mut d_shuffled_node_degrees)
+                .arg(&mut d_num_prune)
+                .launch(cfg.clone())?;
+        }
+        stream.synchronize()?;
+
+        let num_prune = stream.memcpy_dtov(&d_num_prune)?[0];
+        let num_nodes = target_num_nodes - num_prune;
+
+        // 3.2 Finalize shuffle
+        unsafe {
+            stream
+                .launch_builder(&finalize_shuffle_kernel)
+                .arg(&d_seed)
+                .arg(&num_hyperedges)
+                .arg(&d_hyperedge_sizes)
+                .arg(&d_hyperedge_offsets)
+                .arg(&mut d_shuffled_hyperedge_nodes)
+                .launch(cfg.clone())?;
+        }
+        stream.synchronize()?;
+
+        // 3.3 Reconstruct node hyperedges and offsets
+        let mut d_shuffled_node_hyperedges =
+            stream.alloc_zeros::<i32>(total_connections as usize)?;
+        let mut d_shuffled_node_offsets = stream.alloc_zeros::<i32>(num_nodes as usize + 1)?;
+        unsafe {
+            stream
+                .launch_builder(&finalize_hyperedges_kernel)
+                .arg(&num_nodes)
+                .arg(&num_hyperedges)
+                .arg(&d_hyperedge_sizes)
+                .arg(&d_hyperedge_offsets)
+                .arg(&d_shuffled_hyperedge_nodes)
+                .arg(&d_shuffled_node_degrees)
+                .arg(&mut d_shuffled_node_hyperedges)
+                .arg(&mut d_shuffled_node_offsets)
+                .launch(cfg.clone())?;
+        }
+        stream.synchronize()?;
+
+        // 4.1 Calculate the baseline connectivity metric
+        let mut d_connectivity_metric = stream.alloc_zeros::<u32>(1)?;
+
+        unsafe {
+            stream
+                .launch_builder(&calc_connectivity_metric_kernel)
+                .arg(&num_hyperedges)
+                .arg(&d_hyperedge_offsets)
+                .arg(&d_shuffled_hyperedge_nodes)
+                .arg(&d_shuffled_partition)
+                .arg(&mut d_connectivity_metric)
+                .launch(cfg.clone())?;
+        }
+        stream.synchronize()?;
+
+        let connectivity_metric = stream.memcpy_dtov(&d_connectivity_metric)?[0];
+        let max_partition_size = ((num_nodes as f32 / num_parts as f32) * 1.03).ceil() as u32;
+
+        Ok(Self {
+            seed: *seed,
+            difficulty: difficulty.clone(),
+            num_nodes,
+            num_parts,
+            max_partition_size,
+            d_hyperedge_sizes,
+            d_hyperedge_offsets,
+            d_hyperedge_nodes: d_shuffled_hyperedge_nodes,
+            d_node_degrees: d_shuffled_node_degrees,
+            d_node_offsets: d_shuffled_node_offsets,
+            d_node_hyperedges: d_shuffled_node_hyperedges,
+            baseline_connectivity_metric: connectivity_metric,
+        })
+    }
+
+    pub fn verify_solution(
+        &self,
+        solution: &SubSolution,
+        module: Arc<CudaModule>,
+        stream: Arc<CudaStream>,
+        prop: &cudaDeviceProp,
+    ) -> Result<u32> {
+        if solution.partition.len() != self.num_nodes as usize {
+            return Err(anyhow!(
+                "Invalid partition length (one entry per node). Expected: {}, Actual: {}",
+                self.num_nodes,
+                solution.partition.len()
+            ));
+        }
+
+        // Get the kernels
+        let validate_partition_kernel = module.load_function("validate_partition")?;
+        let calc_connectivity_metric_kernel = module.load_function("calc_connectivity_metric")?;
+        let count_nodes_per_part_kernel = module.load_function("count_nodes_per_part")?;
+
+        let block_size = prop.maxThreadsPerBlock as u32;
+        let grid_size = (self.difficulty.num_hyperedges + block_size - 1) / block_size;
+
+        let cfg = LaunchConfig {
+            grid_dim: (grid_size, 1, 1),
+            block_dim: (block_size, 1, 1),
+            shared_mem_bytes: 0,
+        };
+
+        // 1.1 Check that every node is assigned to a valid part
+        let d_partition = stream.memcpy_stod(&solution.partition)?;
+        let mut d_error_flag = stream.alloc_zeros::<u32>(1)?;
+
+        unsafe {
+            stream
+                .launch_builder(&validate_partition_kernel)
+                .arg(&self.num_nodes)
+                .arg(&self.num_parts)
+                .arg(&d_partition)
+                .arg(&mut d_error_flag)
+                .launch(cfg.clone())?;
+        }
+        stream.synchronize()?;
+
+        if stream.memcpy_dtov(&d_error_flag)?[0] != 0 {
+            return Err(anyhow!(
+                "Invalid partition. All nodes must be assigned to one of {} parts",
+                self.num_parts
+            ));
+        };
+
+        // 1.2 Check that no part is empty or exceeds the maximum size
+        let mut d_nodes_per_part = stream.alloc_zeros::<i32>(self.num_parts as usize)?;
+        unsafe {
+            stream
+                .launch_builder(&count_nodes_per_part_kernel)
+                .arg(&self.num_nodes)
+                .arg(&self.num_parts)
+                .arg(&d_partition)
+                .arg(&mut d_nodes_per_part)
+                .launch(cfg.clone())?;
+        }
+        stream.synchronize()?;
+
+        let nodes_per_partition = stream.memcpy_dtov(&d_nodes_per_part)?;
+        if nodes_per_partition
+            .iter()
+            .any(|&x| x < 1 || x as u32 > self.max_partition_size)
+        {
+            return Err(anyhow!(
+                "Each part must have at least 1 and at most {} nodes",
+                self.max_partition_size
+            ));
+        }
+
+        // 1.3 Calculate connectivity
+        let mut d_connectivity_metric = stream.alloc_zeros::<u32>(1)?;
+        unsafe {
+            stream
+                .launch_builder(&calc_connectivity_metric_kernel)
+                .arg(&self.difficulty.num_hyperedges)
+                .arg(&self.d_hyperedge_offsets)
+                .arg(&self.d_hyperedge_nodes)
+                .arg(&d_partition)
+                .arg(&mut d_connectivity_metric)
+                .launch(cfg.clone())?;
+        }
+        stream.synchronize()?;
+
+        let connectivity_metric = stream.memcpy_dtov(&d_connectivity_metric)?[0];
+        Ok(connectivity_metric)
+    }
+}