diff --git a/.github/workflows/build_algorithm.yml b/.github/workflows/build_algorithm.yml index 30004d3..14725d0 100644 --- a/.github/workflows/build_algorithm.yml +++ b/.github/workflows/build_algorithm.yml @@ -8,16 +8,13 @@ on: - 'knapsack/*' - 'vector_search/*' - 'hypergraph/*' + - 'nn_training/*' - 'test/satisfiability/*' - 'test/vehicle_routing/*' - 'test/knapsack/*' - 'test/vector_search/*' - 'test/hypergraph/*' - - 'dev/satisfiability/*' - - 'dev/vehicle_routing/*' - - 'dev/knapsack/*' - - 'dev/vector_search/*' - - 'dev/hypergraph/*' + - 'test/nn_training/*' jobs: init: diff --git a/scripts/test_algorithm b/scripts/test_algorithm index 0576753..3024056 100644 --- a/scripts/test_algorithm +++ b/scripts/test_algorithm @@ -95,7 +95,7 @@ f"""Library not found at {so_path}: "knapsack": "c003", "vector_search": "c004", "hypergraph": "c005", - "optimiser": "c006", + "nn_training": "c006", } challenge_id = challenge_ids[CHALLENGE] diff --git a/tig-algorithms/Cargo.toml b/tig-algorithms/Cargo.toml index f8d1d22..248c18e 100644 --- a/tig-algorithms/Cargo.toml +++ b/tig-algorithms/Cargo.toml @@ -23,14 +23,15 @@ tig-challenges = { path = "../tig-challenges" } crate-type = ["cdylib", "rlib"] [features] -cuda = ["cudarc"] c001 = ["tig-challenges/c001"] satisfiability = ["c001"] c002 = ["tig-challenges/c002"] vehicle_routing = ["c002"] c003 = ["tig-challenges/c003"] knapsack = ["c003"] -c004 = ["cuda", "tig-challenges/c004"] +c004 = ["cudarc", "tig-challenges/c004"] vector_search = ["c004"] -c005 = ["cuda", "tig-challenges/c005"] +c005 = ["cudarc", "tig-challenges/c005"] hypergraph = ["c005"] +c006 = ["cudarc", "tig-challenges/c006"] +optimizer = ["c006"] diff --git a/tig-algorithms/src/lib.rs b/tig-algorithms/src/lib.rs index 67a1f54..9e60adb 100644 --- a/tig-algorithms/src/lib.rs +++ b/tig-algorithms/src/lib.rs @@ -31,3 +31,7 @@ pub use vector_search as c004; pub mod hypergraph; #[cfg(feature = "c005")] pub use hypergraph as c005; +#[cfg(feature = "c006")] +pub mod nn_training; +#[cfg(feature = "c006")] +pub use nn_training as c006; diff --git a/tig-algorithms/src/nn_training/mod.rs b/tig-algorithms/src/nn_training/mod.rs new file mode 100644 index 0000000..78868ab --- /dev/null +++ b/tig-algorithms/src/nn_training/mod.rs @@ -0,0 +1,1997 @@ +// c006_a001 + +// c006_a002 + +// c006_a003 + +// c006_a004 + +// c006_a005 + +// c006_a006 + +// c006_a007 + +// c006_a008 + +// c006_a009 + +// c006_a010 + +// c006_a011 + +// c006_a012 + +// c006_a013 + +// c006_a014 + +// c006_a015 + +// c006_a016 + +// c006_a017 + +// c006_a018 + +// c006_a019 + +// c006_a020 + +// c006_a021 + +// c006_a022 + +// c006_a023 + +// c006_a024 + +// c006_a025 + +// c006_a026 + +// c006_a027 + +// c006_a028 + +// c006_a029 + +// c006_a030 + +// c006_a031 + +// c006_a032 + +// c006_a033 + +// c006_a034 + +// c006_a035 + +// c006_a036 + +// c006_a037 + +// c006_a038 + +// c006_a039 + +// c006_a040 + +// c006_a041 + +// c006_a042 + +// c006_a043 + +// c006_a044 + +// c006_a045 + +// c006_a046 + +// c006_a047 + +// c006_a048 + +// c006_a049 + +// c006_a050 + +// c006_a051 + +// c006_a052 + +// c006_a053 + +// c006_a054 + +// c006_a055 + +// c006_a056 + +// c006_a057 + +// c006_a058 + +// c006_a059 + +// c006_a060 + +// c006_a061 + +// c006_a062 + +// c006_a063 + +// c006_a064 + +// c006_a065 + +// c006_a066 + +// c006_a067 + +// c006_a068 + +// c006_a069 + +// c006_a070 + +// c006_a071 + +// c006_a072 + +// c006_a073 + +// c006_a074 + +// c006_a075 + +// c006_a076 + +// c006_a077 + +// c006_a078 + +// c006_a079 + +// c006_a080 
+ +// c006_a081 + +// c006_a082 + +// c006_a083 + +// c006_a084 + +// c006_a085 + +// c006_a086 + +// c006_a087 + +// c006_a088 + +// c006_a089 + +// c006_a090 + +// c006_a091 + +// c006_a092 + +// c006_a093 + +// c006_a094 + +// c006_a095 + +// c006_a096 + +// c006_a097 + +// c006_a098 + +// c006_a099 + +// c006_a100 + +// c006_a101 + +// c006_a102 + +// c006_a103 + +// c006_a104 + +// c006_a105 + +// c006_a106 + +// c006_a107 + +// c006_a108 + +// c006_a109 + +// c006_a110 + +// c006_a111 + +// c006_a112 + +// c006_a113 + +// c006_a114 + +// c006_a115 + +// c006_a116 + +// c006_a117 + +// c006_a118 + +// c006_a119 + +// c006_a120 + +// c006_a121 + +// c006_a122 + +// c006_a123 + +// c006_a124 + +// c006_a125 + +// c006_a126 + +// c006_a127 + +// c006_a128 + +// c006_a129 + +// c006_a130 + +// c006_a131 + +// c006_a132 + +// c006_a133 + +// c006_a134 + +// c006_a135 + +// c006_a136 + +// c006_a137 + +// c006_a138 + +// c006_a139 + +// c006_a140 + +// c006_a141 + +// c006_a142 + +// c006_a143 + +// c006_a144 + +// c006_a145 + +// c006_a146 + +// c006_a147 + +// c006_a148 + +// c006_a149 + +// c006_a150 + +// c006_a151 + +// c006_a152 + +// c006_a153 + +// c006_a154 + +// c006_a155 + +// c006_a156 + +// c006_a157 + +// c006_a158 + +// c006_a159 + +// c006_a160 + +// c006_a161 + +// c006_a162 + +// c006_a163 + +// c006_a164 + +// c006_a165 + +// c006_a166 + +// c006_a167 + +// c006_a168 + +// c006_a169 + +// c006_a170 + +// c006_a171 + +// c006_a172 + +// c006_a173 + +// c006_a174 + +// c006_a175 + +// c006_a176 + +// c006_a177 + +// c006_a178 + +// c006_a179 + +// c006_a180 + +// c006_a181 + +// c006_a182 + +// c006_a183 + +// c006_a184 + +// c006_a185 + +// c006_a186 + +// c006_a187 + +// c006_a188 + +// c006_a189 + +// c006_a190 + +// c006_a191 + +// c006_a192 + +// c006_a193 + +// c006_a194 + +// c006_a195 + +// c006_a196 + +// c006_a197 + +// c006_a198 + +// c006_a199 + +// c006_a200 + +// c006_a201 + +// c006_a202 + +// c006_a203 + +// c006_a204 + +// c006_a205 + +// c006_a206 + +// c006_a207 + +// c006_a208 + +// c006_a209 + +// c006_a210 + +// c006_a211 + +// c006_a212 + +// c006_a213 + +// c006_a214 + +// c006_a215 + +// c006_a216 + +// c006_a217 + +// c006_a218 + +// c006_a219 + +// c006_a220 + +// c006_a221 + +// c006_a222 + +// c006_a223 + +// c006_a224 + +// c006_a225 + +// c006_a226 + +// c006_a227 + +// c006_a228 + +// c006_a229 + +// c006_a230 + +// c006_a231 + +// c006_a232 + +// c006_a233 + +// c006_a234 + +// c006_a235 + +// c006_a236 + +// c006_a237 + +// c006_a238 + +// c006_a239 + +// c006_a240 + +// c006_a241 + +// c006_a242 + +// c006_a243 + +// c006_a244 + +// c006_a245 + +// c006_a246 + +// c006_a247 + +// c006_a248 + +// c006_a249 + +// c006_a250 + +// c006_a251 + +// c006_a252 + +// c006_a253 + +// c006_a254 + +// c006_a255 + +// c006_a256 + +// c006_a257 + +// c006_a258 + +// c006_a259 + +// c006_a260 + +// c006_a261 + +// c006_a262 + +// c006_a263 + +// c006_a264 + +// c006_a265 + +// c006_a266 + +// c006_a267 + +// c006_a268 + +// c006_a269 + +// c006_a270 + +// c006_a271 + +// c006_a272 + +// c006_a273 + +// c006_a274 + +// c006_a275 + +// c006_a276 + +// c006_a277 + +// c006_a278 + +// c006_a279 + +// c006_a280 + +// c006_a281 + +// c006_a282 + +// c006_a283 + +// c006_a284 + +// c006_a285 + +// c006_a286 + +// c006_a287 + +// c006_a288 + +// c006_a289 + +// c006_a290 + +// c006_a291 + +// c006_a292 + +// c006_a293 + +// c006_a294 + +// c006_a295 + +// c006_a296 + +// c006_a297 + +// c006_a298 + +// c006_a299 + +// c006_a300 + +// c006_a301 + +// c006_a302 + 
+// c006_a303 + +// c006_a304 + +// c006_a305 + +// c006_a306 + +// c006_a307 + +// c006_a308 + +// c006_a309 + +// c006_a310 + +// c006_a311 + +// c006_a312 + +// c006_a313 + +// c006_a314 + +// c006_a315 + +// c006_a316 + +// c006_a317 + +// c006_a318 + +// c006_a319 + +// c006_a320 + +// c006_a321 + +// c006_a322 + +// c006_a323 + +// c006_a324 + +// c006_a325 + +// c006_a326 + +// c006_a327 + +// c006_a328 + +// c006_a329 + +// c006_a330 + +// c006_a331 + +// c006_a332 + +// c006_a333 + +// c006_a334 + +// c006_a335 + +// c006_a336 + +// c006_a337 + +// c006_a338 + +// c006_a339 + +// c006_a340 + +// c006_a341 + +// c006_a342 + +// c006_a343 + +// c006_a344 + +// c006_a345 + +// c006_a346 + +// c006_a347 + +// c006_a348 + +// c006_a349 + +// c006_a350 + +// c006_a351 + +// c006_a352 + +// c006_a353 + +// c006_a354 + +// c006_a355 + +// c006_a356 + +// c006_a357 + +// c006_a358 + +// c006_a359 + +// c006_a360 + +// c006_a361 + +// c006_a362 + +// c006_a363 + +// c006_a364 + +// c006_a365 + +// c006_a366 + +// c006_a367 + +// c006_a368 + +// c006_a369 + +// c006_a370 + +// c006_a371 + +// c006_a372 + +// c006_a373 + +// c006_a374 + +// c006_a375 + +// c006_a376 + +// c006_a377 + +// c006_a378 + +// c006_a379 + +// c006_a380 + +// c006_a381 + +// c006_a382 + +// c006_a383 + +// c006_a384 + +// c006_a385 + +// c006_a386 + +// c006_a387 + +// c006_a388 + +// c006_a389 + +// c006_a390 + +// c006_a391 + +// c006_a392 + +// c006_a393 + +// c006_a394 + +// c006_a395 + +// c006_a396 + +// c006_a397 + +// c006_a398 + +// c006_a399 + +// c006_a400 + +// c006_a401 + +// c006_a402 + +// c006_a403 + +// c006_a404 + +// c006_a405 + +// c006_a406 + +// c006_a407 + +// c006_a408 + +// c006_a409 + +// c006_a410 + +// c006_a411 + +// c006_a412 + +// c006_a413 + +// c006_a414 + +// c006_a415 + +// c006_a416 + +// c006_a417 + +// c006_a418 + +// c006_a419 + +// c006_a420 + +// c006_a421 + +// c006_a422 + +// c006_a423 + +// c006_a424 + +// c006_a425 + +// c006_a426 + +// c006_a427 + +// c006_a428 + +// c006_a429 + +// c006_a430 + +// c006_a431 + +// c006_a432 + +// c006_a433 + +// c006_a434 + +// c006_a435 + +// c006_a436 + +// c006_a437 + +// c006_a438 + +// c006_a439 + +// c006_a440 + +// c006_a441 + +// c006_a442 + +// c006_a443 + +// c006_a444 + +// c006_a445 + +// c006_a446 + +// c006_a447 + +// c006_a448 + +// c006_a449 + +// c006_a450 + +// c006_a451 + +// c006_a452 + +// c006_a453 + +// c006_a454 + +// c006_a455 + +// c006_a456 + +// c006_a457 + +// c006_a458 + +// c006_a459 + +// c006_a460 + +// c006_a461 + +// c006_a462 + +// c006_a463 + +// c006_a464 + +// c006_a465 + +// c006_a466 + +// c006_a467 + +// c006_a468 + +// c006_a469 + +// c006_a470 + +// c006_a471 + +// c006_a472 + +// c006_a473 + +// c006_a474 + +// c006_a475 + +// c006_a476 + +// c006_a477 + +// c006_a478 + +// c006_a479 + +// c006_a480 + +// c006_a481 + +// c006_a482 + +// c006_a483 + +// c006_a484 + +// c006_a485 + +// c006_a486 + +// c006_a487 + +// c006_a488 + +// c006_a489 + +// c006_a490 + +// c006_a491 + +// c006_a492 + +// c006_a493 + +// c006_a494 + +// c006_a495 + +// c006_a496 + +// c006_a497 + +// c006_a498 + +// c006_a499 + +// c006_a500 + +// c006_a501 + +// c006_a502 + +// c006_a503 + +// c006_a504 + +// c006_a505 + +// c006_a506 + +// c006_a507 + +// c006_a508 + +// c006_a509 + +// c006_a510 + +// c006_a511 + +// c006_a512 + +// c006_a513 + +// c006_a514 + +// c006_a515 + +// c006_a516 + +// c006_a517 + +// c006_a518 + +// c006_a519 + +// c006_a520 + +// c006_a521 + +// c006_a522 + +// c006_a523 + +// c006_a524 + 
+// c006_a525 + +// c006_a526 + +// c006_a527 + +// c006_a528 + +// c006_a529 + +// c006_a530 + +// c006_a531 + +// c006_a532 + +// c006_a533 + +// c006_a534 + +// c006_a535 + +// c006_a536 + +// c006_a537 + +// c006_a538 + +// c006_a539 + +// c006_a540 + +// c006_a541 + +// c006_a542 + +// c006_a543 + +// c006_a544 + +// c006_a545 + +// c006_a546 + +// c006_a547 + +// c006_a548 + +// c006_a549 + +// c006_a550 + +// c006_a551 + +// c006_a552 + +// c006_a553 + +// c006_a554 + +// c006_a555 + +// c006_a556 + +// c006_a557 + +// c006_a558 + +// c006_a559 + +// c006_a560 + +// c006_a561 + +// c006_a562 + +// c006_a563 + +// c006_a564 + +// c006_a565 + +// c006_a566 + +// c006_a567 + +// c006_a568 + +// c006_a569 + +// c006_a570 + +// c006_a571 + +// c006_a572 + +// c006_a573 + +// c006_a574 + +// c006_a575 + +// c006_a576 + +// c006_a577 + +// c006_a578 + +// c006_a579 + +// c006_a580 + +// c006_a581 + +// c006_a582 + +// c006_a583 + +// c006_a584 + +// c006_a585 + +// c006_a586 + +// c006_a587 + +// c006_a588 + +// c006_a589 + +// c006_a590 + +// c006_a591 + +// c006_a592 + +// c006_a593 + +// c006_a594 + +// c006_a595 + +// c006_a596 + +// c006_a597 + +// c006_a598 + +// c006_a599 + +// c006_a600 + +// c006_a601 + +// c006_a602 + +// c006_a603 + +// c006_a604 + +// c006_a605 + +// c006_a606 + +// c006_a607 + +// c006_a608 + +// c006_a609 + +// c006_a610 + +// c006_a611 + +// c006_a612 + +// c006_a613 + +// c006_a614 + +// c006_a615 + +// c006_a616 + +// c006_a617 + +// c006_a618 + +// c006_a619 + +// c006_a620 + +// c006_a621 + +// c006_a622 + +// c006_a623 + +// c006_a624 + +// c006_a625 + +// c006_a626 + +// c006_a627 + +// c006_a628 + +// c006_a629 + +// c006_a630 + +// c006_a631 + +// c006_a632 + +// c006_a633 + +// c006_a634 + +// c006_a635 + +// c006_a636 + +// c006_a637 + +// c006_a638 + +// c006_a639 + +// c006_a640 + +// c006_a641 + +// c006_a642 + +// c006_a643 + +// c006_a644 + +// c006_a645 + +// c006_a646 + +// c006_a647 + +// c006_a648 + +// c006_a649 + +// c006_a650 + +// c006_a651 + +// c006_a652 + +// c006_a653 + +// c006_a654 + +// c006_a655 + +// c006_a656 + +// c006_a657 + +// c006_a658 + +// c006_a659 + +// c006_a660 + +// c006_a661 + +// c006_a662 + +// c006_a663 + +// c006_a664 + +// c006_a665 + +// c006_a666 + +// c006_a667 + +// c006_a668 + +// c006_a669 + +// c006_a670 + +// c006_a671 + +// c006_a672 + +// c006_a673 + +// c006_a674 + +// c006_a675 + +// c006_a676 + +// c006_a677 + +// c006_a678 + +// c006_a679 + +// c006_a680 + +// c006_a681 + +// c006_a682 + +// c006_a683 + +// c006_a684 + +// c006_a685 + +// c006_a686 + +// c006_a687 + +// c006_a688 + +// c006_a689 + +// c006_a690 + +// c006_a691 + +// c006_a692 + +// c006_a693 + +// c006_a694 + +// c006_a695 + +// c006_a696 + +// c006_a697 + +// c006_a698 + +// c006_a699 + +// c006_a700 + +// c006_a701 + +// c006_a702 + +// c006_a703 + +// c006_a704 + +// c006_a705 + +// c006_a706 + +// c006_a707 + +// c006_a708 + +// c006_a709 + +// c006_a710 + +// c006_a711 + +// c006_a712 + +// c006_a713 + +// c006_a714 + +// c006_a715 + +// c006_a716 + +// c006_a717 + +// c006_a718 + +// c006_a719 + +// c006_a720 + +// c006_a721 + +// c006_a722 + +// c006_a723 + +// c006_a724 + +// c006_a725 + +// c006_a726 + +// c006_a727 + +// c006_a728 + +// c006_a729 + +// c006_a730 + +// c006_a731 + +// c006_a732 + +// c006_a733 + +// c006_a734 + +// c006_a735 + +// c006_a736 + +// c006_a737 + +// c006_a738 + +// c006_a739 + +// c006_a740 + +// c006_a741 + +// c006_a742 + +// c006_a743 + +// c006_a744 + +// c006_a745 + +// c006_a746 + 
+// c006_a747 + +// c006_a748 + +// c006_a749 + +// c006_a750 + +// c006_a751 + +// c006_a752 + +// c006_a753 + +// c006_a754 + +// c006_a755 + +// c006_a756 + +// c006_a757 + +// c006_a758 + +// c006_a759 + +// c006_a760 + +// c006_a761 + +// c006_a762 + +// c006_a763 + +// c006_a764 + +// c006_a765 + +// c006_a766 + +// c006_a767 + +// c006_a768 + +// c006_a769 + +// c006_a770 + +// c006_a771 + +// c006_a772 + +// c006_a773 + +// c006_a774 + +// c006_a775 + +// c006_a776 + +// c006_a777 + +// c006_a778 + +// c006_a779 + +// c006_a780 + +// c006_a781 + +// c006_a782 + +// c006_a783 + +// c006_a784 + +// c006_a785 + +// c006_a786 + +// c006_a787 + +// c006_a788 + +// c006_a789 + +// c006_a790 + +// c006_a791 + +// c006_a792 + +// c006_a793 + +// c006_a794 + +// c006_a795 + +// c006_a796 + +// c006_a797 + +// c006_a798 + +// c006_a799 + +// c006_a800 + +// c006_a801 + +// c006_a802 + +// c006_a803 + +// c006_a804 + +// c006_a805 + +// c006_a806 + +// c006_a807 + +// c006_a808 + +// c006_a809 + +// c006_a810 + +// c006_a811 + +// c006_a812 + +// c006_a813 + +// c006_a814 + +// c006_a815 + +// c006_a816 + +// c006_a817 + +// c006_a818 + +// c006_a819 + +// c006_a820 + +// c006_a821 + +// c006_a822 + +// c006_a823 + +// c006_a824 + +// c006_a825 + +// c006_a826 + +// c006_a827 + +// c006_a828 + +// c006_a829 + +// c006_a830 + +// c006_a831 + +// c006_a832 + +// c006_a833 + +// c006_a834 + +// c006_a835 + +// c006_a836 + +// c006_a837 + +// c006_a838 + +// c006_a839 + +// c006_a840 + +// c006_a841 + +// c006_a842 + +// c006_a843 + +// c006_a844 + +// c006_a845 + +// c006_a846 + +// c006_a847 + +// c006_a848 + +// c006_a849 + +// c006_a850 + +// c006_a851 + +// c006_a852 + +// c006_a853 + +// c006_a854 + +// c006_a855 + +// c006_a856 + +// c006_a857 + +// c006_a858 + +// c006_a859 + +// c006_a860 + +// c006_a861 + +// c006_a862 + +// c006_a863 + +// c006_a864 + +// c006_a865 + +// c006_a866 + +// c006_a867 + +// c006_a868 + +// c006_a869 + +// c006_a870 + +// c006_a871 + +// c006_a872 + +// c006_a873 + +// c006_a874 + +// c006_a875 + +// c006_a876 + +// c006_a877 + +// c006_a878 + +// c006_a879 + +// c006_a880 + +// c006_a881 + +// c006_a882 + +// c006_a883 + +// c006_a884 + +// c006_a885 + +// c006_a886 + +// c006_a887 + +// c006_a888 + +// c006_a889 + +// c006_a890 + +// c006_a891 + +// c006_a892 + +// c006_a893 + +// c006_a894 + +// c006_a895 + +// c006_a896 + +// c006_a897 + +// c006_a898 + +// c006_a899 + +// c006_a900 + +// c006_a901 + +// c006_a902 + +// c006_a903 + +// c006_a904 + +// c006_a905 + +// c006_a906 + +// c006_a907 + +// c006_a908 + +// c006_a909 + +// c006_a910 + +// c006_a911 + +// c006_a912 + +// c006_a913 + +// c006_a914 + +// c006_a915 + +// c006_a916 + +// c006_a917 + +// c006_a918 + +// c006_a919 + +// c006_a920 + +// c006_a921 + +// c006_a922 + +// c006_a923 + +// c006_a924 + +// c006_a925 + +// c006_a926 + +// c006_a927 + +// c006_a928 + +// c006_a929 + +// c006_a930 + +// c006_a931 + +// c006_a932 + +// c006_a933 + +// c006_a934 + +// c006_a935 + +// c006_a936 + +// c006_a937 + +// c006_a938 + +// c006_a939 + +// c006_a940 + +// c006_a941 + +// c006_a942 + +// c006_a943 + +// c006_a944 + +// c006_a945 + +// c006_a946 + +// c006_a947 + +// c006_a948 + +// c006_a949 + +// c006_a950 + +// c006_a951 + +// c006_a952 + +// c006_a953 + +// c006_a954 + +// c006_a955 + +// c006_a956 + +// c006_a957 + +// c006_a958 + +// c006_a959 + +// c006_a960 + +// c006_a961 + +// c006_a962 + +// c006_a963 + +// c006_a964 + +// c006_a965 + +// c006_a966 + +// c006_a967 + +// c006_a968 + 
+// c006_a969 + +// c006_a970 + +// c006_a971 + +// c006_a972 + +// c006_a973 + +// c006_a974 + +// c006_a975 + +// c006_a976 + +// c006_a977 + +// c006_a978 + +// c006_a979 + +// c006_a980 + +// c006_a981 + +// c006_a982 + +// c006_a983 + +// c006_a984 + +// c006_a985 + +// c006_a986 + +// c006_a987 + +// c006_a988 + +// c006_a989 + +// c006_a990 + +// c006_a991 + +// c006_a992 + +// c006_a993 + +// c006_a994 + +// c006_a995 + +// c006_a996 + +// c006_a997 + +// c006_a998 + +// c006_a999 diff --git a/tig-binary/Cargo.toml b/tig-binary/Cargo.toml index 008ebbc..417d3b7 100644 --- a/tig-binary/Cargo.toml +++ b/tig-binary/Cargo.toml @@ -30,3 +30,5 @@ c004 = ["cuda", "tig-algorithms/c004", "tig-challenges/c004"] vector_search = ["c004"] c005 = ["cuda", "tig-algorithms/c005", "tig-challenges/c005"] hypergraph = ["c005"] +c006 = ["cuda", "tig-algorithms/c006", "tig-challenges/c006"] +nn_training = ["c006"] diff --git a/tig-binary/scripts/build_algorithm b/tig-binary/scripts/build_algorithm index cb6323f..8b38e33 100644 --- a/tig-binary/scripts/build_algorithm +++ b/tig-binary/scripts/build_algorithm @@ -41,8 +41,13 @@ case "$CHALLENGE" in build_so $ALGORITHM build_ptx $ALGORITHM ;; + nn_training) + echo "Building ALGORITHM '$ALGORITHM' for CHALLENGE 'nn_training'" + build_so $ALGORITHM + build_ptx $ALGORITHM --extra-cu-files tig-challenges/src/nn/kernels.cu + ;; *) - echo "Error: Invalid CHALLENGE value. Must be one of: satisfiability, knapsack, vehicle_routing, vector_search, hypergraph" + echo "Error: Invalid CHALLENGE value. Must be one of: satisfiability, knapsack, vehicle_routing, vector_search, hypergraph, nn_training" exit 1 ;; esac \ No newline at end of file diff --git a/tig-binary/scripts/build_ptx b/tig-binary/scripts/build_ptx index ed88b6a..4ff5bec 100644 --- a/tig-binary/scripts/build_ptx +++ b/tig-binary/scripts/build_ptx @@ -237,6 +237,7 @@ $NORMAL_EXIT: def main(): parser = argparse.ArgumentParser(description='Compile PTX with injected runtime signature') parser.add_argument('algorithm', help='Algorithm name') + parser.add_argument('--extra-cu-files', nargs='*', default=[], help='Additional .cu files to include in the compilation') args = parser.parse_args() @@ -272,6 +273,11 @@ def main(): code = f.read() + "\n" with open(challenge_cu, 'r') as f: code += f.read() + "\n" + for extra_cu in args.extra_cu_files: + if not os.path.exists(extra_cu): + raise FileNotFoundError(f"Extra .cu file does not exist: {extra_cu}") + with open(extra_cu, 'r') as f: + code += f.read() + "\n" kernel_regex = r'(?:extern\s+"C"\s+__global__|__device__)\s+\w+\s+(?P\w+)\s*\(' kernels_to_ignore = [match.group('func') for match in re.finditer(kernel_regex, code)] with open(algorithm_cu, 'r') as f: diff --git a/tig-challenges/Cargo.toml b/tig-challenges/Cargo.toml index 7f42495..55ac0db 100644 --- a/tig-challenges/Cargo.toml +++ b/tig-challenges/Cargo.toml @@ -22,14 +22,15 @@ serde_json = { version = "1.0.113" } statrs = { version = "0.18.0" } [features] -cuda = ["cudarc"] c001 = [] satisfiability = ["c001"] c002 = [] vehicle_routing = ["c002"] c003 = [] knapsack = ["c003"] -c004 = ["cuda"] +c004 = ["cudarc"] vector_search = ["c004"] -c005 = ["cuda"] +c005 = ["cudarc"] hypergraph = ["c005"] +c006 = ["cudarc", "cudarc/cublas", "cudarc/cudnn"] +nn_training = ["c006"] diff --git a/tig-challenges/src/lib.rs b/tig-challenges/src/lib.rs index c900e80..c908d46 100644 --- a/tig-challenges/src/lib.rs +++ b/tig-challenges/src/lib.rs @@ -20,3 +20,9 @@ pub use vector_search as c004; pub mod hypergraph; #[cfg(feature = 
"c005")] pub use hypergraph as c005; +#[cfg(feature = "c006")] +pub(crate) mod nn; +#[cfg(feature = "c006")] +pub mod nn_training; +#[cfg(feature = "c006")] +pub use nn_training as c006; diff --git a/tig-challenges/src/nn/batch_norm.rs b/tig-challenges/src/nn/batch_norm.rs new file mode 100644 index 0000000..0bbb5cd --- /dev/null +++ b/tig-challenges/src/nn/batch_norm.rs @@ -0,0 +1,338 @@ +use super::CudnnTensorDescriptor; +use anyhow::Result; +use cudarc::{ + cudnn::{result::CudnnError, sys::*, Cudnn}, + driver::{ + CudaModule, CudaSlice, CudaStream, DevicePtr, DevicePtrMut, LaunchConfig, PushKernelArg, + }, +}; +use std::sync::Arc; + +const THREADS_PER_BLOCK: u32 = 1024; + +pub struct BatchNorm1d { + num_features: usize, + momentum: f64, + eps: f64, + pub weight: CudaSlice, + pub bias: CudaSlice, + pub running_mean: CudaSlice, + pub running_var: CudaSlice, + pub requires_grad: bool, + pub weight_grad: Option>, + pub bias_grad: Option>, + // cuDNN specific cache for backward pass + saved_mean: CudaSlice, + saved_inv_variance: CudaSlice, +} + +impl BatchNorm1d { + pub fn new( + num_features: usize, + momentum: f64, + eps: f64, + requires_grad: bool, + stream: Arc, + ) -> Result { + let weight = stream.memcpy_stod(&vec![1.0; num_features])?; // Init with ones (scale) + let bias = stream.alloc_zeros::(num_features)?; + let running_mean = stream.alloc_zeros::(num_features)?; + let running_var = stream.memcpy_stod(&vec![1.0; num_features])?; // Init with ones + + let (weight_grad, bias_grad) = if requires_grad { + ( + Some(stream.alloc_zeros::(num_features)?), + Some(stream.alloc_zeros::(num_features)?), + ) + } else { + (None, None) + }; + + // These are populated by forward pass for use in backward pass + let saved_mean = stream.alloc_zeros::(num_features)?; + let saved_inv_variance = stream.alloc_zeros::(num_features)?; + + Ok(Self { + num_features, + momentum, + eps, + weight, + bias, + running_mean, + running_var, + requires_grad, + weight_grad, + bias_grad, + saved_mean, + saved_inv_variance, + }) + } + + pub fn forward>( + &mut self, + input: &I, + training: bool, + stream: Arc, + cudnn: &Cudnn, + ) -> Result> { + let batch_size = input.len() / self.num_features; + let mut output = stream.alloc_zeros::(input.len())?; + + // For 1D batch norm, set up tensors as (N, C, 1, 1) but use SPATIAL mode + let mut x_desc = CudnnTensorDescriptor::new()?; + x_desc.set_4d( + cudnnTensorFormat_t::CUDNN_TENSOR_NCHW, + cudnnDataType_t::CUDNN_DATA_FLOAT, + batch_size as i32, + self.num_features as i32, + 1, + 1, + )?; + let mut y_desc = CudnnTensorDescriptor::new()?; + y_desc.set_4d( + cudnnTensorFormat_t::CUDNN_TENSOR_NCHW, + cudnnDataType_t::CUDNN_DATA_FLOAT, + batch_size as i32, + self.num_features as i32, + 1, + 1, + )?; + let mut derived_bn_desc = CudnnTensorDescriptor::new()?; + derived_bn_desc.set_4d( + cudnnTensorFormat_t::CUDNN_TENSOR_NCHW, + cudnnDataType_t::CUDNN_DATA_FLOAT, + 1, + self.num_features as i32, + 1, + 1, + )?; + + let alpha = 1.0f32; + let beta = 0.0f32; + let alpha_ptr = &alpha as *const f32 as *const std::ffi::c_void; + let beta_ptr = &beta as *const f32 as *const std::ffi::c_void; + + // Use SPATIAL mode for 1D batch normalization + let mode = cudnnBatchNormMode_t::CUDNN_BATCHNORM_SPATIAL; + + let status = if training { + unsafe { + cudnnBatchNormalizationForwardTraining( + cudnn.handle, + mode, + alpha_ptr, + beta_ptr, + *x_desc, + input.device_ptr(&stream).0 as *const _, + *y_desc, + output.device_ptr_mut(&stream).0 as *mut _, + *derived_bn_desc, + 
self.weight.device_ptr(&stream).0 as *const _, + self.bias.device_ptr(&stream).0 as *const _, + self.momentum, + self.running_mean.device_ptr_mut(&stream).0 as *mut _, + self.running_var.device_ptr_mut(&stream).0 as *mut _, + self.eps, + self.saved_mean.device_ptr_mut(&stream).0 as *mut _, + self.saved_inv_variance.device_ptr_mut(&stream).0 as *mut _, + ) + } + } else { + unsafe { + cudnnBatchNormalizationForwardInference( + cudnn.handle, + mode, + alpha_ptr, + beta_ptr, + *x_desc, + input.device_ptr(&stream).0 as *const _, + *y_desc, + output.device_ptr_mut(&stream).0 as *mut _, + *derived_bn_desc, + self.weight.device_ptr(&stream).0 as *const _, + self.bias.device_ptr(&stream).0 as *const _, + self.running_mean.device_ptr(&stream).0 as *const _, + self.running_var.device_ptr(&stream).0 as *const _, + self.eps, + ) + } + }; + + // Debug: Check saved_mean and saved_inv_variance after forward pass if training + if training { + let mut saved_mean_sample = vec![0.0f32; 5.min(self.saved_mean.len())]; + let mut saved_inv_variance_sample = vec![0.0f32; 5.min(self.saved_inv_variance.len())]; + stream.memcpy_dtoh( + &self.saved_mean.slice(0..5.min(self.saved_mean.len())), + &mut saved_mean_sample, + )?; + stream.memcpy_dtoh( + &self + .saved_inv_variance + .slice(0..5.min(self.saved_inv_variance.len())), + &mut saved_inv_variance_sample, + )?; + stream.synchronize()?; + } + + if status == cudnnStatus_t::CUDNN_STATUS_SUCCESS { + Ok(output) + } else { + Err(CudnnError(status).into()) + } + } + + pub fn backward( + &mut self, + input: &CudaSlice, + grad_output: &CudaSlice, + should_accumulate_gradients: bool, + stream: Arc, + cudnn: &Cudnn, + _module: Arc, + ) -> Result> { + let batch_size = input.len() / self.num_features; + let mut grad_input = stream.alloc_zeros::(input.len())?; + + // Set up tensor descriptors (same as forward pass) + let mut x_desc = CudnnTensorDescriptor::new()?; + x_desc.set_4d( + cudnnTensorFormat_t::CUDNN_TENSOR_NCHW, + cudnnDataType_t::CUDNN_DATA_FLOAT, + batch_size as i32, + self.num_features as i32, + 1, + 1, + )?; + + let mut dy_desc = CudnnTensorDescriptor::new()?; + dy_desc.set_4d( + cudnnTensorFormat_t::CUDNN_TENSOR_NCHW, + cudnnDataType_t::CUDNN_DATA_FLOAT, + batch_size as i32, + self.num_features as i32, + 1, + 1, + )?; + + let mut dx_desc = CudnnTensorDescriptor::new()?; + dx_desc.set_4d( + cudnnTensorFormat_t::CUDNN_TENSOR_NCHW, + cudnnDataType_t::CUDNN_DATA_FLOAT, + batch_size as i32, + self.num_features as i32, + 1, + 1, + )?; + + let mut derived_bn_desc = CudnnTensorDescriptor::new()?; + derived_bn_desc.set_4d( + cudnnTensorFormat_t::CUDNN_TENSOR_NCHW, + cudnnDataType_t::CUDNN_DATA_FLOAT, + 1, + self.num_features as i32, + 1, + 1, + )?; + + let alpha_data = 1.0f32; + let beta_data = 0.0f32; + let alpha_param = 1.0f32; + let beta_param = if self.requires_grad && should_accumulate_gradients { + 1.0f32 + } else { + 0.0f32 + }; // Accumulate if trainable + + let alpha_data_ptr = &alpha_data as *const f32 as *const std::ffi::c_void; + let beta_data_ptr = &beta_data as *const f32 as *const std::ffi::c_void; + let alpha_param_ptr = &alpha_param as *const f32 as *const std::ffi::c_void; + let beta_param_ptr = &beta_param as *const f32 as *const std::ffi::c_void; + + // Use SPATIAL mode (same as forward) + let mode = cudnnBatchNormMode_t::CUDNN_BATCHNORM_SPATIAL; + + let (mut temp_wg, mut temp_bg); // Must live long enough + + let (wg, bg) = if self.requires_grad { + ( + self.weight_grad.as_mut().unwrap(), + self.bias_grad.as_mut().unwrap(), + ) + } else { + // Use 
temporary buffers if grads are not required for this layer + temp_wg = Some(stream.alloc_zeros::(self.num_features)?); + temp_bg = Some(stream.alloc_zeros::(self.num_features)?); + (temp_wg.as_mut().unwrap(), temp_bg.as_mut().unwrap()) + }; + + let status = unsafe { + cudnnBatchNormalizationBackward( + cudnn.handle, + mode, + alpha_data_ptr, // alphaDataDiff + beta_data_ptr, // betaDataDiff + alpha_param_ptr, // alphaParamDiff + beta_param_ptr, // betaParamDiff (use 1.0 to accumulate, 0.0 to overwrite) + *x_desc, // xDesc + input.device_ptr(&stream).0 as *const _, // x + *dy_desc, // dyDesc + grad_output.device_ptr(&stream).0 as *const _, // dy + *dx_desc, // dxDesc + grad_input.device_ptr_mut(&stream).0 as *mut _, // dx + *derived_bn_desc, // dBnScaleBiasDesc + self.weight.device_ptr(&stream).0 as *const _, // bnScale + wg.device_ptr_mut(&stream).0 as *mut _, // dBnScaleResult (weight gradients) + bg.device_ptr_mut(&stream).0 as *mut _, // dBnBiasResult (bias gradients) + self.eps, // epsilon + self.saved_mean.device_ptr(&stream).0 as *const _, // savedMean + self.saved_inv_variance.device_ptr(&stream).0 as *const _, // savedInvVariance + ) + }; + + if status != cudnnStatus_t::CUDNN_STATUS_SUCCESS { + return Err(CudnnError(status).into()); + } + + Ok(grad_input) + } + + pub fn zero_grad(&mut self, stream: Arc, module: Arc) -> Result<()> { + let zero_kernel = module.load_function("zero_out")?; + if let Some(wg) = self.weight_grad.as_mut() { + let n = wg.len() as u32; + let grid_dim = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + let cfg = LaunchConfig { + grid_dim: (grid_dim, 1, 1), + block_dim: (THREADS_PER_BLOCK, 1, 1), + shared_mem_bytes: 0, + }; + let n_i32 = n as i32; + unsafe { + stream + .launch_builder(&zero_kernel) + .arg(wg) + .arg(&n_i32) + .launch(cfg)?; + }; + } + if let Some(bg) = self.bias_grad.as_mut() { + let n = bg.len() as u32; + let grid_dim = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + let cfg = LaunchConfig { + grid_dim: (grid_dim, 1, 1), + block_dim: (THREADS_PER_BLOCK, 1, 1), + shared_mem_bytes: 0, + }; + let n_i32 = n as i32; + unsafe { + stream + .launch_builder(&zero_kernel) + .arg(bg) + .arg(&n_i32) + .launch(cfg)?; + }; + } + Ok(()) + } +} diff --git a/tig-challenges/src/nn/kernels.cu b/tig-challenges/src/nn/kernels.cu new file mode 100644 index 0000000..4578664 --- /dev/null +++ b/tig-challenges/src/nn/kernels.cu @@ -0,0 +1,157 @@ +#include +#include +#include +#include +#include + + +__device__ float relu(float x) { + return fmaxf(x, 0.0f); +} + +extern "C" __global__ void init_linear_layer( + unsigned long long seed, + int out_features, + int in_features, + float* weights, // (out_features, in_features) + float* biases // (out_features) +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= out_features * in_features) return; + + curandState state; + curand_init(seed + idx, 0, 0, &state); + + float fan_in = (float)in_features; + float fan_out = (float)out_features; + float limit = sqrtf(2.0f / (fan_in + fan_out)) * 0.5f; + + weights[idx] = curand_uniform(&state) * 2.0f * limit - limit; + + // Initialize biases to zero (can be done by one thread or memset) + if (idx < out_features) { + biases[idx] = 0.0f; + } +} + +extern "C" __global__ void zero_out(float* data, int n) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + data[idx] = 0.0f; + } +} + +extern "C" __global__ void add_bias_forward(float* output, const float* bias, int batch_size, int features) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if 
(idx >= batch_size * features) return; + + int feature_idx = idx % features; + output[idx] += bias[feature_idx]; +} + +extern "C" __global__ void activation_forward(float* data, int n) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n) return; + data[idx] = relu(data[idx]); +} + +extern "C" __global__ void loss_mse( + const float* output, // (batch_size, out_features) + const float* target, // (batch_size, out_features) + int batch_size, + int out_features, + float* grad_loss, // (batch_size, out_features) + float* total_loss_out // single element +) { + extern __shared__ float s_loss[]; // size = blockDim.x + int tid = threadIdx.x; + int idx = blockIdx.x * blockDim.x + tid; + + float element_loss_sum = 0.0f; + int n = batch_size * out_features; + + for (int i = idx; i < n; i += gridDim.x * blockDim.x) { + float diff = output[i] - target[i]; + grad_loss[i] = 2.0f * diff;// / batch_size; + element_loss_sum += diff * diff; + } + s_loss[tid] = element_loss_sum; + __syncthreads(); + + // Reduction in shared memory + for (int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s) { + s_loss[tid] += s_loss[tid + s]; + } + __syncthreads(); + } + + if (tid == 0) { + atomicAdd(total_loss_out, s_loss[0] / n); + } +} + +extern "C" __global__ void activation_backward( + const float* grad_in, + const float* pre_act_vals, // Input to the activation function from forward pass + int n, + float* grad_out +) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) { + // If pre-activation value was positive, gradient passes through. Otherwise, it's zero. + grad_out[i] = pre_act_vals[i] > 0.0f ? grad_in[i] : 0.0f; + } +} + +extern "C" __global__ void backward_bias( + const float* grad_output, // (batch_size, out_features) + float* bias_grad, // (out_features) + int batch_size, + int out_features +) { + extern __shared__ float s_grad_sum[]; // size = out_features + int feature_idx = blockIdx.x; // Each block responsible for one feature + + if (feature_idx >= out_features) return; + + float sum = 0.0f; + for(int i = threadIdx.x; i < batch_size; i += blockDim.x) { + sum += grad_output[i * out_features + feature_idx]; + } + s_grad_sum[threadIdx.x] = sum; + __syncthreads(); + + for (int s = blockDim.x / 2; s > 0; s >>= 1) { + if (threadIdx.x < s) { + s_grad_sum[threadIdx.x] += s_grad_sum[threadIdx.x + s]; + } + __syncthreads(); + } + + if(threadIdx.x == 0) { + atomicAdd(&bias_grad[feature_idx], s_grad_sum[0]); + } +} + +extern "C" __global__ void apply_parameter_updates_direct( + float* params, + const float* updates, + int n +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + params[idx] += updates[idx]; // Direct addition (updates already scaled) + } +} + +extern "C" __global__ void copy_tensor( + float* dst, + const float* src, + int n +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + dst[idx] = src[idx]; + } +} \ No newline at end of file diff --git a/tig-challenges/src/nn/linear.rs b/tig-challenges/src/nn/linear.rs new file mode 100644 index 0000000..6fb01aa --- /dev/null +++ b/tig-challenges/src/nn/linear.rs @@ -0,0 +1,251 @@ +use anyhow::Result; +use cudarc::{ + cublas::{sys::cublasOperation_t, CudaBlas, Gemm, GemmConfig}, + driver::{CudaModule, CudaSlice, CudaStream, DevicePtr, LaunchConfig, PushKernelArg}, +}; +use std::sync::Arc; + +const THREADS_PER_BLOCK: u32 = 1024; + +pub struct Linear { + pub in_features: usize, + pub out_features: usize, + pub weight: CudaSlice, + pub bias: CudaSlice, + pub requires_grad: bool, + pub 
weight_grad: Option>, + pub bias_grad: Option>, +} + +impl Linear { + pub fn new( + in_features: usize, + out_features: usize, + requires_grad: bool, + stream: Arc, + ) -> Result { + let weight = stream.alloc_zeros::(out_features * in_features)?; + let bias = stream.alloc_zeros::(out_features)?; + let (weight_grad, bias_grad) = if requires_grad { + ( + Some(stream.alloc_zeros::(out_features * in_features)?), + Some(stream.alloc_zeros::(out_features)?), + ) + } else { + (None, None) + }; + Ok(Self { + in_features, + out_features, + weight, + bias, + requires_grad, + weight_grad, + bias_grad, + }) + } + + pub fn init_weights( + &mut self, + seed: [u8; 32], + stream: Arc, + module: Arc, + ) -> Result<()> { + let kernel = module.load_function("init_linear_layer")?; + let n = (self.out_features * self.in_features) as u32; + let grid_dim = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + let cfg = LaunchConfig { + grid_dim: (grid_dim, 1, 1), + block_dim: (THREADS_PER_BLOCK, 1, 1), + shared_mem_bytes: 0, + }; + let d_seed = stream.memcpy_stod(&seed)?; + let out_f = self.out_features as i32; + let in_f = self.in_features as i32; + unsafe { + stream + .launch_builder(&kernel) + .arg(&d_seed) + .arg(&out_f) + .arg(&in_f) + .arg(&mut self.weight) + .arg(&mut self.bias) + .launch(cfg)? + }; + Ok(()) + } + + pub fn forward>( + &self, + input_batch: &I, + stream: Arc, + module: Arc, + cublas: &CudaBlas, + ) -> Result> { + let batch_size = input_batch.len() / self.in_features; + let mut output_batch = stream.alloc_zeros::(batch_size * self.out_features)?; + + let gemm_config = GemmConfig { + transa: cublasOperation_t::CUBLAS_OP_N, // Don't transpose input + transb: cublasOperation_t::CUBLAS_OP_N, // Don't transpose weight + m: self.out_features as i32, // Changed: rows of output + n: batch_size as i32, // Changed: cols of output + k: self.in_features as i32, // Same: inner dimension + alpha: 1.0f32, + lda: self.out_features as i32, // Changed: leading dim of weight + ldb: self.in_features as i32, // Changed: leading dim of input + beta: 0.0f32, + ldc: self.out_features as i32, // Changed: leading dim of output + }; + + unsafe { + cublas.gemm(gemm_config, &self.weight, input_batch, &mut output_batch)?; + } + + let kernel = module.load_function("add_bias_forward")?; + let n = (batch_size * self.out_features) as u32; + let grid_dim = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + let cfg = LaunchConfig { + grid_dim: (grid_dim, 1, 1), + block_dim: (THREADS_PER_BLOCK, 1, 1), + shared_mem_bytes: 0, + }; + let bs = batch_size as i32; + let of = self.out_features as i32; + unsafe { + stream + .launch_builder(&kernel) + .arg(&mut output_batch) + .arg(&self.bias) + .arg(&bs) + .arg(&of) + .launch(cfg)? + }; + Ok(output_batch) + } + + pub fn backward( + &mut self, + input_from_cache: &CudaSlice, + grad_output_batch: &CudaSlice, + should_accumulate_gradients: bool, + stream: Arc, + module: Arc, + cublas: &CudaBlas, + ) -> Result> { + let batch_size = input_from_cache.len() / self.in_features; + + if self.requires_grad { + let wg = self.weight_grad.as_mut().unwrap(); + let bg = self.bias_grad.as_mut().unwrap(); + + // Correctly computes dW = d(Y^T) * X^T for column-major layout. 
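+            // (Reminder, for context: cuBLAS GEMM is column-major, and a contiguous row-major
+            // (rows, cols) buffer is byte-identical to a column-major (cols, rows) matrix. That
+            // is why the transpose flags and leading dimensions in these configs look swapped
+            // relative to the textbook formulas.)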
+ // dW(out,in) = d(Y^T)(out,batch) * X(in,batch)^T + let gemm_config_wg = GemmConfig { + transa: cublasOperation_t::CUBLAS_OP_N, + transb: cublasOperation_t::CUBLAS_OP_T, + m: self.out_features as i32, + n: self.in_features as i32, + k: batch_size as i32, + alpha: 1.0f32, + lda: self.out_features as i32, + ldb: self.in_features as i32, + beta: if should_accumulate_gradients { + 1.0f32 + } else { + 0.0f32 + }, + ldc: self.out_features as i32, + }; + unsafe { + cublas.gemm(gemm_config_wg, grad_output_batch, input_from_cache, wg)?; + } + + let kernel = module.load_function("backward_bias")?; + let threads_per_block = 256u32; + let grid_dim = (self.out_features as u32, 1, 1); + let cfg = LaunchConfig { + grid_dim: grid_dim, + block_dim: (threads_per_block, 1, 1), + shared_mem_bytes: threads_per_block * 4, + }; + let bs = batch_size as i32; + let of = self.out_features as i32; + unsafe { + stream + .launch_builder(&kernel) + .arg(grad_output_batch) + .arg(bg) + .arg(&bs) + .arg(&of) + .launch(cfg)? + }; + } + + let mut grad_input_batch = stream.alloc_zeros::(batch_size * self.in_features)?; + + // Correctly computes dX = W^T * d(Y^T) for column-major layout. + // dX(in,batch) = W(out,in)^T * d(Y^T)(out,batch) + let gemm_config_d_input = GemmConfig { + transa: cublasOperation_t::CUBLAS_OP_T, + transb: cublasOperation_t::CUBLAS_OP_N, + m: self.in_features as i32, + n: batch_size as i32, + k: self.out_features as i32, + alpha: 1.0f32, + lda: self.out_features as i32, + ldb: self.out_features as i32, + beta: 0.0f32, + ldc: self.in_features as i32, + }; + unsafe { + cublas.gemm( + gemm_config_d_input, + &self.weight, + grad_output_batch, + &mut grad_input_batch, + )?; + } + + Ok(grad_input_batch) + } + + pub fn zero_grad(&mut self, stream: Arc, module: Arc) -> Result<()> { + let zero_kernel = module.load_function("zero_out")?; + if let Some(wg) = self.weight_grad.as_mut() { + let n = wg.len() as u32; + let grid_dim = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + let cfg = LaunchConfig { + grid_dim: (grid_dim, 1, 1), + block_dim: (THREADS_PER_BLOCK, 1, 1), + shared_mem_bytes: 0, + }; + let n_i32 = n as i32; + unsafe { + stream + .launch_builder(&zero_kernel) + .arg(wg) + .arg(&n_i32) + .launch(cfg)?; + }; + } + if let Some(bg) = self.bias_grad.as_mut() { + let n = bg.len() as u32; + let grid_dim = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + let cfg = LaunchConfig { + grid_dim: (grid_dim, 1, 1), + block_dim: (THREADS_PER_BLOCK, 1, 1), + shared_mem_bytes: 0, + }; + let n_i32 = n as i32; + unsafe { + stream + .launch_builder(&zero_kernel) + .arg(bg) + .arg(&n_i32) + .launch(cfg)?; + }; + } + Ok(()) + } +} diff --git a/tig-challenges/src/nn/mlp.rs b/tig-challenges/src/nn/mlp.rs new file mode 100644 index 0000000..d36acf1 --- /dev/null +++ b/tig-challenges/src/nn/mlp.rs @@ -0,0 +1,516 @@ +use super::{BatchNorm1d, Linear}; +use anyhow::Result; +use cudarc::{ + cublas::CudaBlas, + cudnn::Cudnn, + driver::{ + CudaModule, CudaSlice, CudaStream, CudaView, DevicePtr, DeviceRepr, LaunchConfig, + PushKernelArg, + }, +}; +use rand::{prelude::*, rngs::StdRng}; +use std::sync::Arc; + +const THREADS_PER_BLOCK: u32 = 1024; + +pub struct MLP { + pub lin: Vec, + pub bns: Vec, + pub layer_cnt: usize, +} + +#[derive(Clone)] +pub struct ForwardCache { + pub input: CudaSlice, + pub linear_output: CudaSlice, + pub activated_output: Option>, + pub bn_input: Option>, +} + +impl MLP { + pub fn new( + layer_sizes: &[usize], + frozen_layers: usize, + stream: Arc, + ) -> Result { + let layer_cnt = layer_sizes.len() - 
1; + let mut lin = Vec::with_capacity(layer_cnt); + let mut bns = Vec::with_capacity(layer_cnt - 1); + + for l in 0..layer_cnt { + let requires_grad = l < layer_cnt.saturating_sub(frozen_layers); + lin.push(Linear::new( + layer_sizes[l], + layer_sizes[l + 1], + requires_grad, + stream.clone(), + )?); + if l < layer_cnt - 1 { + bns.push(BatchNorm1d::new( + layer_sizes[l + 1], + 0.1, + 1e-5, + requires_grad, + stream.clone(), + )?); + } + } + Ok(Self { + lin, + bns, + layer_cnt, + }) + } + + pub fn init_weights( + &mut self, + seed: [u8; 32], + stream: Arc, + module: Arc, + ) -> Result<()> { + let mut rng = StdRng::from_seed(seed); + for layer in &mut self.lin { + layer.init_weights(rng.gen(), stream.clone(), module.clone())?; + } + Ok(()) + } + + pub fn zero_grad(&mut self, stream: Arc, module: Arc) -> Result<()> { + for layer in &mut self.lin { + layer.zero_grad(stream.clone(), module.clone())?; + } + for bn in &mut self.bns { + bn.zero_grad(stream.clone(), module.clone())?; + } + Ok(()) + } + + pub fn forward>( + &mut self, + input: &I, + training: bool, + stream: Arc, + module: Arc, + cublas: &CudaBlas, + cudnn: &Cudnn, + ) -> Result<(CudaSlice, Vec>)> { + let mut x = stream.alloc_zeros::(input.len())?; + stream.memcpy_dtod(input, &mut x)?; + + let mut caches = Vec::with_capacity(self.layer_cnt); + + for l in 0..self.layer_cnt { + let input_cache = x.clone(); + let linear_output = + self.lin[l].forward(&input_cache, stream.clone(), module.clone(), cublas)?; + + if l < self.layer_cnt - 1 { + let mut activated = linear_output.clone(); + let act_fwd_kernel = module.load_function("activation_forward")?; + let n = activated.len() as u32; + let grid_dim = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + let cfg = LaunchConfig { + grid_dim: (grid_dim, 1, 1), + block_dim: (THREADS_PER_BLOCK, 1, 1), + shared_mem_bytes: 0, + }; + let n_i32 = n as i32; + unsafe { + stream + .launch_builder(&act_fwd_kernel) + .arg(&mut activated) + .arg(&n_i32) + .launch(cfg)? + }; + + let bn_input_cache = activated.clone(); + let bn_output = self.bns[l].forward(&activated, training, stream.clone(), cudnn)?; + + caches.push(ForwardCache { + input: input_cache, + linear_output, + activated_output: Some(activated), + bn_input: Some(bn_input_cache), + }); + x = bn_output; + } else { + caches.push(ForwardCache { + input: input_cache, + linear_output: linear_output.clone(), + activated_output: None, + bn_input: None, + }); + x = linear_output; + } + } + Ok((x, caches)) + } + + pub fn backward( + &mut self, + grad: &CudaSlice, + forward_caches: &[ForwardCache], + should_accumulate_gradients: bool, + stream: Arc, + module: Arc, + cublas: &CudaBlas, + cudnn: &Cudnn, + ) -> Result<()> { + let mut current_grad = grad.clone(); + + for i in (0..self.lin.len()).rev() { + let mut grad_to_pass_to_linear = current_grad.clone(); + + // For intermediate layers, backpropagate through BN and Activation in reverse order. + if i < self.bns.len() { + // Step 1: Backpropagate through BatchNorm. + // The input to the BN's forward pass was the *activated* output of the linear layer. + let bn_input = forward_caches[i].activated_output.as_ref().unwrap(); + let grad_after_bn = self.bns[i].backward( + bn_input, + ¤t_grad, + should_accumulate_gradients, + stream.clone(), + cudnn, + module.clone(), + )?; + + // Step 2: Backpropagate through Activation. + // The input to the activation's forward pass was the direct output of the linear layer. 
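+                // ReLU backward: d/dx relu(x) is 1 for x > 0 and 0 otherwise, so the incoming
+                // gradient is passed through only where the cached pre-activation was positive
+                // (this is exactly what the activation_backward kernel does).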
+ let pre_activation_values = &forward_caches[i].linear_output; + let mut grad_after_activation = stream.alloc_zeros::(grad_after_bn.len())?; + + let kernel = module.load_function("activation_backward")?; + let cfg = LaunchConfig { + grid_dim: ((grad_after_bn.len() as u32 + 255) / 256, 1, 1), + block_dim: (256, 1, 1), + shared_mem_bytes: 0, + }; + + unsafe { + stream + .launch_builder(&kernel) + .arg(&grad_after_bn) + .arg(pre_activation_values) + .arg(&(grad_after_bn.len() as i32)) + .arg(&mut grad_after_activation) + .launch(cfg)?; + } + + grad_to_pass_to_linear = grad_after_activation; + } + + // Step 3: Backpropagate through the linear layer. + // The input to the linear layer's forward pass is stored in the cache for this layer. + let input_to_linear = &forward_caches[i].input; + + let grad_after_linear = self.lin[i].backward( + input_to_linear, + &grad_to_pass_to_linear, + should_accumulate_gradients, + stream.clone(), + module.clone(), + cublas, + )?; + current_grad = grad_after_linear; + } + Ok(()) + } + + pub fn loss_and_grad( + &self, + output: &CudaSlice, + target: &CudaView<'_, f32>, + stream: Arc, + module: Arc, + ) -> Result<(CudaSlice, CudaSlice)> { + let mut grad = stream.alloc_zeros::(output.len())?; + let mut loss = stream.alloc_zeros::(1)?; + let loss_kernel = module.load_function("loss_mse")?; + + let total_elements = output.len() as u32; + let threads_per_block = 256u32; + let grid_dim = (total_elements + threads_per_block - 1) / threads_per_block; + + let cfg = LaunchConfig { + grid_dim: (grid_dim, 1, 1), + block_dim: (threads_per_block, 1, 1), + shared_mem_bytes: threads_per_block * 4, + }; + + let batch_size = (output.len() / self.lin.last().unwrap().out_features) as i32; + let out_features = self.lin.last().unwrap().out_features as i32; + unsafe { + stream + .launch_builder(&loss_kernel) + .arg(output) + .arg(target) + .arg(&batch_size) + .arg(&out_features) + .arg(&mut grad) + .arg(&mut loss) + .launch(cfg)?; + } + Ok((loss, grad)) + } + + /// Extract all model parameters into a flat vector of CudaSlices + pub fn extract_parameters(&self, _stream: Arc) -> Result>> { + let mut params = Vec::new(); + + // Linear layer parameters + for layer in &self.lin { + params.push(layer.weight.clone()); + params.push(layer.bias.clone()); + } + + // BatchNorm parameters + for bn in &self.bns { + params.push(bn.weight.clone()); + params.push(bn.bias.clone()); + params.push(bn.running_mean.clone()); + params.push(bn.running_var.clone()); + } + + Ok(params) + } + + /// Extract all model gradients into a flat vector of CudaSlices + pub fn extract_gradients(&self, stream: Arc) -> Result>> { + let mut grads = Vec::new(); + + // Linear layer gradients + for layer in &self.lin { + if layer.requires_grad { + grads.push(layer.weight_grad.as_ref().unwrap().clone()); + grads.push(layer.bias_grad.as_ref().unwrap().clone()); + } else { + // Create zero tensors for non-trainable parameters + grads.push(stream.alloc_zeros::(layer.weight.len())?); + grads.push(stream.alloc_zeros::(layer.bias.len())?); + } + } + + // BatchNorm gradients + for bn in &self.bns { + if bn.requires_grad { + grads.push(bn.weight_grad.as_ref().unwrap().clone()); + grads.push(bn.bias_grad.as_ref().unwrap().clone()); + } else { + grads.push(stream.alloc_zeros::(bn.weight.len())?); + grads.push(stream.alloc_zeros::(bn.bias.len())?); + } + // No gradients for running_mean and running_var + grads.push(stream.alloc_zeros::(bn.running_mean.len())?); + grads.push(stream.alloc_zeros::(bn.running_var.len())?); + } + + Ok(grads) + 
} + + /// Get parameter sizes for optimizer initialization + pub fn get_parameter_sizes(&self) -> Vec { + let mut sizes = Vec::new(); + + for layer in &self.lin { + sizes.push(layer.weight.len()); + sizes.push(layer.bias.len()); + } + + for bn in &self.bns { + sizes.push(bn.weight.len()); + sizes.push(bn.bias.len()); + sizes.push(bn.running_mean.len()); + sizes.push(bn.running_var.len()); + } + + sizes + } + + /// Apply parameter updates from optimizer + pub fn apply_optimizer_updates( + &mut self, + updates: &[CudaSlice], + stream: Arc, + module: Arc, + ) -> Result<()> { + let kernel = module.load_function("apply_parameter_updates_direct")?; + let mut update_idx = 0; + + // Apply to linear layers + for layer in &mut self.lin { + if layer.requires_grad { + // Weight update + let n = layer.weight.len() as u32; + let grid_dim = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + let cfg = LaunchConfig { + grid_dim: (grid_dim, 1, 1), + block_dim: (THREADS_PER_BLOCK, 1, 1), + shared_mem_bytes: 0, + }; + let n_i32 = n as i32; + unsafe { + stream + .launch_builder(&kernel) + .arg(&mut layer.weight) + .arg(&updates[update_idx]) + .arg(&n_i32) + .launch(cfg)?; + } + } + update_idx += 1; + + if layer.requires_grad { + // Bias update + let n = layer.bias.len() as u32; + let grid_dim = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + let cfg = LaunchConfig { + grid_dim: (grid_dim, 1, 1), + block_dim: (THREADS_PER_BLOCK, 1, 1), + shared_mem_bytes: 0, + }; + let n_i32 = n as i32; + unsafe { + stream + .launch_builder(&kernel) + .arg(&mut layer.bias) + .arg(&updates[update_idx]) + .arg(&n_i32) + .launch(cfg)?; + } + } + update_idx += 1; + } + + // Apply to BatchNorm layers + for bn in &mut self.bns { + if bn.requires_grad { + // Weight update + let n = bn.weight.len() as u32; + let grid_dim = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + let cfg = LaunchConfig { + grid_dim: (grid_dim, 1, 1), + block_dim: (THREADS_PER_BLOCK, 1, 1), + shared_mem_bytes: 0, + }; + let n_i32 = n as i32; + unsafe { + stream + .launch_builder(&kernel) + .arg(&mut bn.weight) + .arg(&updates[update_idx]) + .arg(&n_i32) + .launch(cfg)?; + } + } + update_idx += 1; + + if bn.requires_grad { + // Bias update + let n = bn.bias.len() as u32; + let grid_dim = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + let cfg = LaunchConfig { + grid_dim: (grid_dim, 1, 1), + block_dim: (THREADS_PER_BLOCK, 1, 1), + shared_mem_bytes: 0, + }; + let n_i32 = n as i32; + unsafe { + stream + .launch_builder(&kernel) + .arg(&mut bn.bias) + .arg(&updates[update_idx]) + .arg(&n_i32) + .launch(cfg)?; + } + } + update_idx += 1; + + // Skip running_mean and running_var (they're not trainable) + update_idx += 2; + } + + Ok(()) + } + + /// Set model parameters from a vector of CudaSlices + pub fn set_parameters( + &mut self, + params: &[CudaSlice], + stream: Arc, + module: Arc, + ) -> Result<()> { + let copy_kernel = module.load_function("copy_tensor")?; + let mut param_idx = 0; + + for layer in &mut self.lin { + // Copy weights + let n = layer.weight.len() as u32; + let grid_dim = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + let cfg = LaunchConfig { + grid_dim: (grid_dim, 1, 1), + block_dim: (THREADS_PER_BLOCK, 1, 1), + shared_mem_bytes: 0, + }; + let n_i32 = n as i32; + unsafe { + stream + .launch_builder(©_kernel) + .arg(&mut layer.weight) + .arg(¶ms[param_idx]) + .arg(&n_i32) + .launch(cfg)?; + } + param_idx += 1; + + // Copy biases + let n = layer.bias.len() as u32; + let grid_dim = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + let cfg = 
LaunchConfig { + grid_dim: (grid_dim, 1, 1), + block_dim: (THREADS_PER_BLOCK, 1, 1), + shared_mem_bytes: 0, + }; + let n_i32 = n as i32; + unsafe { + stream + .launch_builder(©_kernel) + .arg(&mut layer.bias) + .arg(¶ms[param_idx]) + .arg(&n_i32) + .launch(cfg)?; + } + param_idx += 1; + } + + for bn in &mut self.bns { + // Copy BN parameters + for target in [ + &mut bn.weight, + &mut bn.bias, + &mut bn.running_mean, + &mut bn.running_var, + ] { + let n = target.len() as u32; + let grid_dim = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + let cfg = LaunchConfig { + grid_dim: (grid_dim, 1, 1), + block_dim: (THREADS_PER_BLOCK, 1, 1), + shared_mem_bytes: 0, + }; + let n_i32 = n as i32; + unsafe { + stream + .launch_builder(©_kernel) + .arg(target) + .arg(¶ms[param_idx]) + .arg(&n_i32) + .launch(cfg)?; + } + param_idx += 1; + } + } + + Ok(()) + } +} diff --git a/tig-challenges/src/nn/mod.rs b/tig-challenges/src/nn/mod.rs new file mode 100644 index 0000000..abdd018 --- /dev/null +++ b/tig-challenges/src/nn/mod.rs @@ -0,0 +1,8 @@ +mod batch_norm; +pub use batch_norm::*; +mod linear; +pub use linear::*; +mod mlp; +pub use mlp::*; +mod tensor; +pub use tensor::*; diff --git a/tig-challenges/src/nn/tensor.rs b/tig-challenges/src/nn/tensor.rs new file mode 100644 index 0000000..cb8f32c --- /dev/null +++ b/tig-challenges/src/nn/tensor.rs @@ -0,0 +1,49 @@ +use anyhow::Result; +use cudarc::cudnn::{self, result::CudnnError}; +use std::ops::Deref; + +pub struct CudnnTensorDescriptor(cudnn::sys::cudnnTensorDescriptor_t); + +impl CudnnTensorDescriptor { + pub fn new() -> Result { + let mut desc = std::ptr::null_mut(); + unsafe { + match cudnn::sys::cudnnCreateTensorDescriptor(&mut desc) { + cudnn::sys::cudnnStatus_t::CUDNN_STATUS_SUCCESS => Ok(Self(desc)), + e => Err(CudnnError(e)), + } + } + } + + pub fn set_4d( + &mut self, + format: cudnn::sys::cudnnTensorFormat_t, + data_type: cudnn::sys::cudnnDataType_t, + n: i32, + c: i32, + h: i32, + w: i32, + ) -> Result<(), CudnnError> { + unsafe { + match cudnn::sys::cudnnSetTensor4dDescriptor(self.0, format, data_type, n, c, h, w) { + cudnn::sys::cudnnStatus_t::CUDNN_STATUS_SUCCESS => Ok(()), + e => Err(CudnnError(e)), + } + } + } +} + +impl Deref for CudnnTensorDescriptor { + type Target = cudnn::sys::cudnnTensorDescriptor_t; + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl Drop for CudnnTensorDescriptor { + fn drop(&mut self) { + unsafe { + cudnn::sys::cudnnDestroyTensorDescriptor(self.0); + } + } +} diff --git a/tig-challenges/src/nn_training.cu b/tig-challenges/src/nn_training.cu new file mode 100644 index 0000000..f02bddc --- /dev/null +++ b/tig-challenges/src/nn_training.cu @@ -0,0 +1,115 @@ +#include +#include +#include +#include +#include + +#ifndef M_PI +#define M_PI 3.14159265358979323846f +#endif + +__device__ float box_muller(curandState* state, float& z1) { + float u1, u2; + // Prevent u1 from being 0 to avoid log(0) -> -inf + do { + u1 = curand_uniform(state); + } while (u1 == 0.0f); + u2 = curand_uniform(state); + + float mag = sqrtf(-2.0f * logf(u1)); + float z0 = mag * cosf(2.0f * M_PI * u2); + z1 = mag * sinf(2.0f * M_PI * u2); + return z0; +} + +extern "C" __global__ void generate_rff_params( + unsigned char* seed, + int output_dims, + int input_dims, + int k_rff, + float lengthscale, + float* a_params, // (output_dims, k_rff) + float* b_params, // (output_dims, k_rff) + float* w_params // (output_dims, k_rff, input_dims) +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= output_dims * k_rff) return; + + 
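+    // Each thread owns one (output_dim, k) random Fourier feature: it draws an amplitude
+    // a ~ N(0, 1) via box_muller, a phase b ~ U(0, 2*pi), and one frequency entry per input
+    // dimension scaled by lengthscale_inv_sq. generate_dataset then evaluates the target as
+    // f(x) = scaling_factor * sum_k a_k * cos(dot(w_k, x) + b_k), plus Gaussian noise for the
+    // noisy targets.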
curandState state; + curand_init( + *((unsigned long long*)seed) + idx, + 0, 0, &state + ); + + int out_dim = idx / k_rff; + int k_idx = idx % k_rff; + + // Box-muller generates two samples, cache one + float z1; + a_params[idx] = box_muller(&state, z1); + // Note: this is not perfectly efficient, could be improved by having half threads write z1 + + b_params[idx] = curand_uniform(&state) * 2.0f * M_PI; + + float lengthscale_inv_sq = 1.0f / (lengthscale * lengthscale); + + for(int in_dim = 0; in_dim < input_dims; ++in_dim) { + int w_idx = out_dim * k_rff * input_dims + k_idx * input_dims + in_dim; + float z_w1; + w_params[w_idx] = lengthscale_inv_sq * box_muller(&state, z_w1); + } +} + + +extern "C" __global__ void generate_dataset( + unsigned char* seed, + int num_samples, + int input_dims, + int output_dims, + int k_rff, + float scaling_factor, + float noise_std, + const float* a_params, + const float* b_params, + const float* w_params, + float* out_inputs, // (num_samples, input_dims) + float* out_targets_noisy, // (num_samples, output_dims) + float* out_targets_true // (num_samples, output_dims) +) { + int sample_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (sample_idx >= num_samples) return; + + curandState state; + curand_init( + *((unsigned long long*)seed) + sample_idx + num_samples, // Offset seed + 0, 0, &state + ); + + // Generate input sample + for (int i = 0; i < input_dims; ++i) { + out_inputs[sample_idx * input_dims + i] = curand_uniform(&state) * 2.0f - 1.0f; + } + + // Generate targets + for (int out_dim = 0; out_dim < output_dims; ++out_dim) { + float f_val = 0.0f; + for (int k_idx = 0; k_idx < k_rff; ++k_idx) { + float wx_sum = 0.0f; + for (int in_dim = 0; in_dim < input_dims; ++in_dim) { + float w = w_params[out_dim * k_rff * input_dims + k_idx * input_dims + in_dim]; + float x = out_inputs[sample_idx * input_dims + in_dim]; + wx_sum += w * x; + } + float b = b_params[out_dim * k_rff + k_idx]; + float a = a_params[out_dim * k_rff + k_idx]; + f_val += a * cosf(wx_sum + b); + } + f_val *= scaling_factor; + + float z_noise1; + float noise = noise_std * box_muller(&state, z_noise1); + + out_targets_true[sample_idx * output_dims + out_dim] = f_val; + out_targets_noisy[sample_idx * output_dims + out_dim] = f_val + noise; + } +} \ No newline at end of file diff --git a/tig-challenges/src/nn_training.rs b/tig-challenges/src/nn_training.rs new file mode 100644 index 0000000..1fb7703 --- /dev/null +++ b/tig-challenges/src/nn_training.rs @@ -0,0 +1,648 @@ +use anyhow::{anyhow, Result}; +use cudarc::{ + cublas::CudaBlas, + cudnn::Cudnn, + driver::{CudaModule, CudaSlice, CudaStream, CudaView, LaunchConfig, PushKernelArg}, + runtime::sys::cudaDeviceProp, +}; +use rand::{prelude::*, rngs::StdRng}; +use serde::{Deserialize, Serialize}; +use serde_json::{from_value, Map, Value}; +use std::{any::Any, sync::Arc}; + +use crate::nn::MLP; + +const THREADS_PER_BLOCK: u32 = 1024; + +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct Difficulty { + pub num_hidden_layers: usize, + pub accuracy_factor: u32, +} + +impl From> for Difficulty { + fn from(arr: Vec) -> Self { + Self { + num_hidden_layers: arr[0] as usize, + accuracy_factor: arr[1] as u32, + } + } +} +impl Into> for Difficulty { + fn into(self) -> Vec { + vec![self.num_hidden_layers as i32, self.accuracy_factor as i32] + } +} + +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct Solution { + pub weights: Vec>>, + pub biases: Vec>, + pub epochs_used: usize, + pub bn_weights: Vec>, + pub bn_biases: Vec>, + pub 
bn_running_means: Vec>, + pub bn_running_vars: Vec>, +} + +impl TryFrom> for Solution { + type Error = serde_json::Error; + fn try_from(v: Map) -> Result { + from_value(Value::Object(v)) + } +} + +pub struct Dataset { + pub inputs: CudaSlice, + pub targets_noisy: CudaSlice, + pub targets_true_f: CudaSlice, + + pub train_size: usize, + pub validation_size: usize, + pub test_size: usize, + + pub input_dims: usize, + pub output_dims: usize, +} + +impl Dataset { + pub fn train_inputs(&self) -> CudaView { + self.inputs.slice(0..self.train_size * self.input_dims) + } + pub fn train_targets_noisy(&self) -> CudaView { + self.targets_noisy + .slice(0..self.train_size * self.output_dims) + } + pub fn train_targets_true_f(&self) -> CudaView { + self.targets_true_f + .slice(0..self.train_size * self.output_dims) + } + pub fn validation_inputs(&self) -> CudaView { + self.inputs.slice( + self.train_size * self.input_dims + ..(self.train_size + self.validation_size) * self.input_dims, + ) + } + pub fn validation_targets_noisy(&self) -> CudaView { + self.targets_noisy.slice( + self.train_size * self.output_dims + ..(self.train_size + self.validation_size) * self.output_dims, + ) + } + pub fn validation_targets_true_f(&self) -> CudaView { + self.targets_true_f.slice( + self.train_size * self.output_dims + ..(self.train_size + self.validation_size) * self.output_dims, + ) + } + pub fn test_inputs(&self) -> CudaView { + self.inputs.slice( + (self.train_size + self.validation_size) * self.input_dims + ..(self.train_size + self.validation_size + self.test_size) * self.input_dims, + ) + } + pub fn test_targets_noisy(&self) -> CudaView { + self.targets_noisy.slice( + (self.train_size + self.validation_size) * self.output_dims + ..(self.train_size + self.validation_size + self.test_size) * self.output_dims, + ) + } + pub fn test_targets_true_f(&self) -> CudaView { + self.targets_true_f.slice( + (self.train_size + self.validation_size) * self.output_dims + ..(self.train_size + self.validation_size + self.test_size) * self.output_dims, + ) + } +} + +pub struct Challenge { + pub seed: [u8; 32], + pub difficulty: Difficulty, + pub hidden_layers_dims: usize, + pub batch_size: usize, + pub max_epochs: usize, + pub patience: usize, + pub min_loss_delta: f32, + pub num_frozen_layers: usize, + pub dataset: Dataset, +} + +impl Challenge { + pub fn generate_instance( + seed: &[u8; 32], + difficulty: &Difficulty, + module: Arc, + stream: Arc, + _prop: &cudaDeviceProp, + ) -> Result { + const K_RFF: usize = 128; + const RFF_AMPLITUDE_PER_FUNC: f32 = 1.0; + const RFF_LENGTHSCALE_PER_INPUT_DIM: f32 = 0.3; + const NOISE_STD: f32 = 0.2; + const INPUT_DIMS: usize = 1; + const OUTPUT_DIMS: usize = 2; + const TRAIN_SIZE: usize = 1000; + const VALIDATION_SIZE: usize = 200; + const TEST_SIZE: usize = 250; + let scaling_factor = RFF_AMPLITUDE_PER_FUNC * (2.0 / K_RFF as f32).sqrt(); + + let d_seed = stream.memcpy_stod(seed)?; + + // Allocate memory for RFF params + let mut a_params = stream.alloc_zeros::(OUTPUT_DIMS * K_RFF)?; + let mut b_params = stream.alloc_zeros::(OUTPUT_DIMS * K_RFF)?; + let mut w_params = stream.alloc_zeros::(OUTPUT_DIMS * K_RFF * INPUT_DIMS)?; + + // Generate RFF params + let generate_rff_params_kernel = module.load_function("generate_rff_params")?; + let n = (OUTPUT_DIMS * K_RFF) as u32; + let grid_dim = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + let cfg = LaunchConfig { + grid_dim: (grid_dim, 1, 1), + block_dim: (THREADS_PER_BLOCK, 1, 1), + shared_mem_bytes: 0, + }; + unsafe { + stream + 
.launch_builder(&generate_rff_params_kernel) + .arg(&d_seed) + .arg(&(OUTPUT_DIMS as i32)) + .arg(&(INPUT_DIMS as i32)) + .arg(&(K_RFF as i32)) + .arg(&RFF_LENGTHSCALE_PER_INPUT_DIM) + .arg(&mut a_params) + .arg(&mut b_params) + .arg(&mut w_params) + .launch(cfg)?; + } + + // Generate datasets + let generate_dataset_kernel = module.load_function("generate_dataset")?; + + // Training data + let total_samples = TRAIN_SIZE + VALIDATION_SIZE + TEST_SIZE; + let mut inputs = stream.alloc_zeros::(total_samples * INPUT_DIMS)?; + let mut targets_noisy = stream.alloc_zeros::(total_samples * OUTPUT_DIMS)?; + let mut targets_true_f = stream.alloc_zeros::(total_samples * OUTPUT_DIMS)?; + + let grid_dim = (total_samples as u32 + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + let cfg_train = LaunchConfig { + grid_dim: (grid_dim, 1, 1), + block_dim: (THREADS_PER_BLOCK, 1, 1), + shared_mem_bytes: 0, + }; + unsafe { + stream + .launch_builder(&generate_dataset_kernel) + .arg(&d_seed) + .arg(&(total_samples as i32)) + .arg(&(INPUT_DIMS as i32)) + .arg(&(OUTPUT_DIMS as i32)) + .arg(&K_RFF) + .arg(&scaling_factor) + .arg(&NOISE_STD) + .arg(&a_params) + .arg(&b_params) + .arg(&w_params) + .arg(&mut inputs) + .arg(&mut targets_noisy) + .arg(&mut targets_true_f) + .launch(cfg_train)?; + } + + stream.synchronize()?; + + Ok(Self { + seed: *seed, + difficulty: difficulty.clone(), + hidden_layers_dims: 256, + batch_size: 128, + max_epochs: 1000, + patience: 50, + min_loss_delta: 1e-7, + num_frozen_layers: 2, + dataset: Dataset { + inputs, + targets_noisy, + targets_true_f, + train_size: TRAIN_SIZE, + validation_size: VALIDATION_SIZE, + test_size: TEST_SIZE, + input_dims: INPUT_DIMS, + output_dims: OUTPUT_DIMS, + }, + }) + } + + pub fn verify_solution( + &self, + solution: &Solution, + module: Arc, + stream: Arc, + _prop: &cudaDeviceProp, + ) -> Result<()> { + let cublas = CudaBlas::new(stream.clone())?; + let cudnn = Cudnn::new(stream.clone())?; + + let mut model = MLP::new(&self.layer_dims(), self.num_frozen_layers, stream.clone())?; + + load_solution(&mut model, solution, stream.clone())?; + + let (output, _) = model.forward( + &self.dataset.test_inputs(), + false, + stream.clone(), + module.clone(), + &cublas, + &cudnn, + )?; + let (loss, _) = model.loss_and_grad( + &output, + &&self.dataset.test_targets_noisy(), + stream.clone(), + module.clone(), + )?; + + let avg_model_loss_on_test = stream.memcpy_dtov(&loss)?[0]; + + // Calculate baseline error epsilon_star_squared + let alpha = 4.0 - self.difficulty.accuracy_factor as f32 / 1000.0; + + let y_h = stream.memcpy_dtov(&self.dataset.test_targets_noisy())?; + let f_h = stream.memcpy_dtov(&self.dataset.test_targets_true_f())?; + stream.synchronize()?; + + let sum_sq_diff_true_vs_noisy: f32 = y_h + .iter() + .zip(f_h.iter()) + .map(|(y, f)| (*y - *f).powi(2)) + .sum(); + + let epsilon_star_squared = + (alpha / self.dataset.test_size as f32) * sum_sq_diff_true_vs_noisy; + + if avg_model_loss_on_test <= epsilon_star_squared { + Ok(()) + } else { + Err(anyhow!( + "Model test loss ({:.4e}) exceeds target baseline epsilon_star_squared ({:.4e})", + avg_model_loss_on_test, + epsilon_star_squared + )) + } + } + + pub fn layer_dims(&self) -> Vec { + let mut layer_dims = vec![self.hidden_layers_dims; self.difficulty.num_hidden_layers]; + layer_dims.insert(0, self.dataset.input_dims); + layer_dims.push(self.dataset.output_dims); + layer_dims + } +} + +pub trait CudaOptimizerState: Any + Send + Sync { + fn as_any(&self) -> &dyn Any; + fn as_any_mut(&mut self) -> &mut dyn Any; + 
fn box_clone(&self) -> Box; +} + +impl Clone for Box { + fn clone(&self) -> Self { + self.box_clone() + } +} + +/// Function type for initializing optimizer state +pub type CudaOptimizerInitStateFn = fn( + seed: &[u8; 32], + param_sizes: &[usize], // Sizes of all parameter tensors + stream: Arc, +) -> Result>; + +/// Function type for querying optimizer at specific parameters (like parameter prediction) +pub type CudaOptimizerQueryAtParamsFn = fn( + optimizer_state: &dyn CudaOptimizerState, + model_params: &[CudaSlice], + gradients: Option<&[CudaSlice]>, + epoch: usize, + train_loss: Option, + val_loss: Option, + stream: Arc, + module: Arc, +) -> Result>>>; + +/// Function type for optimizer step (computes parameter updates) +pub type CudaOptimizerStepFn = fn( + optimizer_state: &mut dyn CudaOptimizerState, + gradients: &[CudaSlice], + epoch: usize, + train_loss: Option, + val_loss: Option, + stream: Arc, + module: Arc, +) -> Result>>; + +pub fn training_loop( + challenge: &Challenge, + module: Arc, + stream: Arc, + prop: &cudaDeviceProp, + optimizer_init_state: CudaOptimizerInitStateFn, + optimizer_query_at_params: CudaOptimizerQueryAtParamsFn, + optimizer_step: CudaOptimizerStepFn, +) -> Result<(Solution, Vec, Vec)> { + let Challenge { + batch_size, + max_epochs, + min_loss_delta, + patience, + dataset: + Dataset { + train_size, + validation_size, + input_dims, + output_dims, + .. + }, + .. + } = *challenge; + let cublas = CudaBlas::new(stream.clone())?; + let cudnn = Cudnn::new(stream.clone())?; + + let mut model = MLP::new( + &challenge.layer_dims(), + challenge.num_frozen_layers, + stream.clone(), + )?; + model.init_weights(&challenge.seed, stream.clone(), module.clone())?; + + // Initialize optimizer + let param_sizes = model.get_parameter_sizes(); + let mut optimizer_state = optimizer_init_state( + challenge.seed, + ¶m_sizes, + stream.clone(), + module.clone(), + prop, + )?; + + let mut lowest_loss = f32::INFINITY; + let mut _best_epoch = 0; + let mut epochs_no_improvement = 0; + let mut best_model_solution: Option = None; + let mut prev_train_loss = None; + let mut prev_validation_loss = None; + let mut train_losses = Vec::with_capacity(max_epochs); + let mut validation_losses = Vec::with_capacity(max_epochs); + + let num_train_batches = (train_size + batch_size - 1) / batch_size; + let num_val_batches = (validation_size + batch_size - 1) / batch_size; + + // Initialize RNG for shuffling + let mut rng = StdRng::from_seed(challenge.seed); + + // Copy training data to host for shuffled batch creation + let train_inputs = stream.memcpy_dtov(&challenge.dataset.train_inputs())?; + let train_targets = stream.memcpy_dtov(&challenge.dataset.train_targets_noisy())?; + let validation_inputs_d = challenge.dataset.validation_inputs(); + let validation_targets_d = challenge.dataset.validation_targets_noisy(); + stream.synchronize()?; + + for epoch in 0..max_epochs { + // --- Shuffle training data indices each epoch --- + let mut train_indices: Vec = (0..train_size).collect(); + train_indices.shuffle(&mut rng); + + // --- Training Phase --- + let mut epoch_train_loss_sum = 0.0; + for i in 0..num_train_batches { + let batch_start_idx = i * batch_size; + let current_batch_size = (train_size - batch_start_idx).min(batch_size); + if current_batch_size == 0 { + continue; + } + + model.zero_grad(stream.clone(), module.clone())?; + + // Create shuffled batch data + let mut input_batch_data = vec![0.0f32; current_batch_size * input_dims]; + let mut target_batch_data = vec![0.0f32; current_batch_size * 
output_dims]; + + // Gather shuffled batch data + for batch_offset in 0..current_batch_size { + let shuffled_sample_idx = train_indices[batch_start_idx + batch_offset]; + + // Copy input data for this sample + let input_start = shuffled_sample_idx * input_dims; + let batch_input_start = batch_offset * input_dims; + for d in 0..input_dims { + input_batch_data[batch_input_start + d] = train_inputs[input_start + d]; + } + + // Copy target data for this sample + let target_start = shuffled_sample_idx * output_dims; + let batch_target_start = batch_offset * output_dims; + for d in 0..output_dims { + target_batch_data[batch_target_start + d] = train_targets[target_start + d]; + } + } + + // Upload shuffled batch to GPU + let mut d_input_batch = stream.alloc_zeros::(current_batch_size * input_dims)?; + let mut d_target_batch = stream.alloc_zeros::(current_batch_size * output_dims)?; + stream.memcpy_htod(&input_batch_data, &mut d_input_batch)?; + stream.memcpy_htod(&target_batch_data, &mut d_target_batch)?; + + // Query optimizer for parameter modifications before forward pass + let model_params = model.extract_parameters(stream.clone())?; + let original_params = if let Some(modified_params) = optimizer_query_at_params( + optimizer_state.as_ref(), + &model_params, + None, + epoch, + prev_train_loss, + prev_validation_loss, + stream.clone(), + module.clone(), + prop, + )? { + let backup = model_params.clone(); + model.set_parameters(&modified_params, stream.clone(), module.clone())?; + Some(backup) + } else { + None + }; + + let (output, caches) = model.forward( + &d_input_batch, + true, + stream.clone(), + module.clone(), + &cublas, + &cudnn, + )?; + let (loss, grad) = model.loss_and_grad( + &output, + &d_target_batch.as_view(), + stream.clone(), + module.clone(), + )?; + model.backward( + &grad, + &caches, + false, + stream.clone(), + module.clone(), + &cublas, + &cudnn, + )?; + + // Restore original parameters if they were modified + if let Some(params_to_restore) = original_params { + model.set_parameters(¶ms_to_restore, stream.clone(), module.clone())?; + } + + // Get gradients and apply optimizer step + let gradients = model.extract_gradients(stream.clone())?; + let param_updates = optimizer_step( + optimizer_state.as_mut(), + &gradients, + epoch, + prev_train_loss, + prev_validation_loss, + stream.clone(), + module.clone(), + prop, + )?; + + model.apply_optimizer_updates(¶m_updates, stream.clone(), module.clone())?; + + let batch_loss = stream.memcpy_dtov(&loss)?[0]; + epoch_train_loss_sum += batch_loss * current_batch_size as f32; + } + stream.synchronize()?; + + let avg_train_loss = epoch_train_loss_sum / train_size as f32; + prev_train_loss = Some(avg_train_loss); + train_losses.push(avg_train_loss); + + // --- Validation Phase --- + let mut epoch_val_loss_sum = 0.0; + if validation_size > 0 { + for i in 0..num_val_batches { + let batch_start = i * batch_size; + let current_batch_size = (validation_size - batch_start).min(batch_size); + if current_batch_size == 0 { + continue; + } + + let d_input_batch = validation_inputs_d.slice( + batch_start * input_dims..(batch_start + current_batch_size) * input_dims, + ); + let d_target_batch = validation_targets_d.slice( + batch_start * output_dims..(batch_start + current_batch_size) * output_dims, + ); + + let (output, _) = model.forward( + &d_input_batch, + false, + stream.clone(), + module.clone(), + &cublas, + &cudnn, + )?; + let (loss, _) = model.loss_and_grad( + &output, + &d_target_batch, + stream.clone(), + module.clone(), + )?; + + let mut 
batch_loss_h = vec![0.0; 1]; + stream.memcpy_dtoh(&loss, &mut batch_loss_h)?; + epoch_val_loss_sum += batch_loss_h[0] * current_batch_size as f32; + } + } + stream.synchronize()?; + + let avg_val_loss = if validation_size > 0 { + epoch_val_loss_sum / validation_size as f32 + } else { + avg_train_loss + }; + prev_validation_loss = Some(avg_val_loss); + validation_losses.push(avg_val_loss); + + // --- Early Stopping --- + if avg_val_loss < lowest_loss - min_loss_delta { + lowest_loss = avg_val_loss; + _best_epoch = epoch; + best_model_solution = Some(to_solution(&model, epoch + 1, stream.clone())?); + epochs_no_improvement = 0; + } else { + epochs_no_improvement += 1; + if epochs_no_improvement >= patience { + break; + } + } + } + + stream.synchronize()?; + + let solution = best_model_solution.ok_or_else(|| anyhow!("No valid solution found during training. Validation loss may have been NaN or never improved."))?; + Ok((solution, train_losses, validation_losses)) +} + +pub fn load_solution(mlp: &mut MLP, solution: &Solution, stream: Arc) -> Result<()> { + for (i, layer) in mlp.lin.iter_mut().enumerate() { + let w_flat: Vec = solution.weights[i].iter().flatten().cloned().collect(); + stream.memcpy_htod(&w_flat, &mut layer.weight)?; + + stream.memcpy_htod(&solution.biases[i], &mut layer.bias)?; + } + for (i, bn) in mlp.bns.iter_mut().enumerate() { + stream.memcpy_htod(&solution.bn_weights[i], &mut bn.weight)?; + stream.memcpy_htod(&solution.bn_biases[i], &mut bn.bias)?; + stream.memcpy_htod(&solution.bn_running_means[i], &mut bn.running_mean)?; + stream.memcpy_htod(&solution.bn_running_vars[i], &mut bn.running_var)?; + } + stream.synchronize()?; + Ok(()) +} + +pub fn to_solution(mlp: &MLP, epochs_used: usize, stream: Arc) -> Result { + stream.synchronize()?; + let mut weights = Vec::new(); + let mut biases = Vec::new(); + for layer in &mlp.lin { + let mut w_h = vec![0.0; layer.weight.len()]; + stream.memcpy_dtoh(&layer.weight, &mut w_h)?; + let mut b_h = vec![0.0; layer.bias.len()]; + stream.memcpy_dtoh(&layer.bias, &mut b_h)?; + + weights.push(w_h.chunks(layer.in_features).map(|c| c.to_vec()).collect()); + biases.push(b_h); + } + + let mut bn_weights = Vec::new(); + let mut bn_biases = Vec::new(); + let mut bn_running_means = Vec::new(); + let mut bn_running_vars = Vec::new(); + + for bn in &mlp.bns { + bn_weights.push(stream.memcpy_dtov(&bn.weight)?); + bn_biases.push(stream.memcpy_dtov(&bn.bias)?); + bn_running_means.push(stream.memcpy_dtov(&bn.running_mean)?); + bn_running_vars.push(stream.memcpy_dtov(&bn.running_var)?); + } + + Ok(Solution { + weights, + biases, + epochs_used, + bn_weights, + bn_biases, + bn_running_means, + bn_running_vars, + }) +} diff --git a/tig-runtime/Cargo.toml b/tig-runtime/Cargo.toml index 3802846..341d9e3 100644 --- a/tig-runtime/Cargo.toml +++ b/tig-runtime/Cargo.toml @@ -32,3 +32,5 @@ c004 = ["cuda", "tig-challenges/c004"] vector_search = ["c004"] c005 = ["cuda", "tig-challenges/c005"] hypergraph = ["c005"] +c006 = ["cuda", "tig-challenges/c006"] +nn_training = ["c006"] diff --git a/tig-runtime/src/main.rs b/tig-runtime/src/main.rs index 22ea326..533cd2e 100644 --- a/tig-runtime/src/main.rs +++ b/tig-runtime/src/main.rs @@ -296,6 +296,12 @@ pub fn compute_solution( #[cfg(feature = "c005")] dispatch_challenge!(c005, gpu) } + "c006" => { + #[cfg(not(feature = "c006"))] + panic!("tig-runtime was not compiled with '--features c006'"); + #[cfg(feature = "c006")] + dispatch_challenge!(c006, gpu) + } _ => panic!("Unsupported challenge"), } }; diff --git 
a/tig-verifier/Cargo.toml b/tig-verifier/Cargo.toml index c2f19c3..684ed13 100644 --- a/tig-verifier/Cargo.toml +++ b/tig-verifier/Cargo.toml @@ -30,3 +30,5 @@ c004 = ["cuda", "tig-challenges/c004"] vector_search = ["c004"] c005 = ["cuda", "tig-challenges/c005"] hypergraph = ["c005"] +c006 = ["cuda", "tig-challenges/c006"] +nn_training = ["c006"] diff --git a/tig-verifier/src/main.rs b/tig-verifier/src/main.rs index 3ed9154..5bf2867 100644 --- a/tig-verifier/src/main.rs +++ b/tig-verifier/src/main.rs @@ -187,6 +187,12 @@ pub fn verify_solution( #[cfg(feature = "c005")] dispatch_challenge!(c005, gpu) } + "c006" => { + #[cfg(not(feature = "c006"))] + panic!("tig-verifier was not compiled with '--features c006'"); + #[cfg(feature = "c006")] + dispatch_challenge!(c006, gpu) + } _ => panic!("Unsupported challenge"), }
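For reference, a few small host-side Rust sketches of the logic this patch adds; helper names and the std-only RNG below are illustrative stand-ins, not code from the patch. First, the ceil-division used before every 1D kernel launch (the parameter-copy launches in set_parameters, generate_rff_params, generate_dataset): enough blocks of THREADS_PER_BLOCK threads to cover n elements.

const THREADS_PER_BLOCK: u32 = 1024;

/// grid_dim for a 1D launch covering `n` elements, as computed before each LaunchConfig.
fn grid_dim_1d(n: u32) -> u32 {
    (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK
}

fn main() {
    assert_eq!(grid_dim_1d(256), 1);  // one partially filled block
    assert_eq!(grid_dim_1d(1024), 1);
    assert_eq!(grid_dim_1d(1450), 2); // total_samples = 1000 + 200 + 250
}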
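Next, what generate_rff_params and generate_dataset compute per output dimension: Box-Muller normal draws for the amplitudes a_k and frequencies w_k (scaled by 1/lengthscale^2), a uniform phase b_k in [0, 2*pi), and the target f(x) = s * sum_k a_k * cos(w_k . x + b_k) with s = amplitude * sqrt(2 / K_RFF). The tiny LCG below stands in for curand and will not reproduce the kernels' exact random streams.

use std::f32::consts::PI;

/// Tiny deterministic uniform generator in (0, 1], standing in for curand_uniform.
fn next_uniform(state: &mut u64) -> f32 {
    // LCG step (Knuth's constants); top 24 bits give a uniform in (0, 1].
    *state = state.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407);
    ((*state >> 40) as f32 + 1.0) / (1u64 << 24) as f32
}

/// Box-Muller transform: two uniforms -> one standard normal draw,
/// mirroring the box_muller device helper in nn_training.cu.
fn box_muller(state: &mut u64) -> f32 {
    let u1 = next_uniform(state); // already > 0, so ln is finite
    let u2 = next_uniform(state);
    (-2.0 * u1.ln()).sqrt() * (2.0 * PI * u2).cos()
}

/// One output dimension of the RFF target: f(x) = s * sum_k a_k * cos(w_k . x + b_k),
/// as evaluated in generate_dataset's inner loop.
fn rff_target(a: &[f32], b: &[f32], w: &[Vec<f32>], x: &[f32], scaling: f32) -> f32 {
    let mut f = 0.0;
    for k in 0..a.len() {
        let wx: f32 = w[k].iter().zip(x).map(|(wi, xi)| wi * xi).sum();
        f += a[k] * (wx + b[k]).cos();
    }
    scaling * f
}

fn main() {
    // Constants mirror generate_instance: K_RFF = 128, INPUT_DIMS = 1, lengthscale = 0.3.
    let (k_rff, input_dims) = (128usize, 1usize);
    let lengthscale: f32 = 0.3;
    let scaling = (2.0 / k_rff as f32).sqrt(); // RFF_AMPLITUDE_PER_FUNC = 1.0
    let mut state = 42u64;
    let a: Vec<f32> = (0..k_rff).map(|_| box_muller(&mut state)).collect();
    let b: Vec<f32> = (0..k_rff).map(|_| next_uniform(&mut state) * 2.0 * PI).collect();
    let w: Vec<Vec<f32>> = (0..k_rff)
        .map(|_| {
            (0..input_dims)
                .map(|_| box_muller(&mut state) / (lengthscale * lengthscale))
                .collect()
        })
        .collect();
    println!("f(0.25) = {}", rff_target(&a, &b, &w, &[0.25], scaling));
}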
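The acceptance rule in verify_solution, restated on host slices: alpha = 4.0 - accuracy_factor / 1000, and the model's mean squared test loss must not exceed epsilon_star_squared = (alpha / test_size) * sum((y - f)^2) over the noisy test targets y and the true function values f.

/// Acceptance test used by verify_solution, on host vectors of flattened targets.
fn passes(test_loss: f32, accuracy_factor: u32, test_size: usize, y_noisy: &[f32], f_true: &[f32]) -> bool {
    let alpha = 4.0 - accuracy_factor as f32 / 1000.0;
    let sum_sq: f32 = y_noisy.iter().zip(f_true).map(|(y, f)| (y - f).powi(2)).sum();
    let epsilon_star_squared = (alpha / test_size as f32) * sum_sq;
    test_loss <= epsilon_star_squared
}

fn main() {
    // Two test samples, OUTPUT_DIMS = 2, so four flattened target values each.
    let y_noisy = [0.1, -0.2, 0.9, 1.1];
    let f_true = [0.0, 0.0, 1.0, 1.0];
    // sum_sq = 0.01 + 0.04 + 0.01 + 0.01 = 0.07; alpha = 3.0 at accuracy_factor = 1000,
    // so epsilon_star_squared = 3.0 * 0.07 / 2 = 0.105.
    assert!(passes(0.10, 1000, 2, &y_noisy, &f_true));
    assert!(!passes(0.11, 1000, 2, &y_noisy, &f_true));
}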
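A hypothetical, minimal implementation of the CudaOptimizerState trait that training_loop's optimizer callbacks are built around. The trait is re-declared locally so the sketch compiles on its own, and SgdState / lr are made-up names.

use std::any::Any;

// Local re-declaration of the trait from nn_training.rs, for a self-contained sketch.
trait CudaOptimizerState: Any + Send + Sync {
    fn as_any(&self) -> &dyn Any;
    fn as_any_mut(&mut self) -> &mut dyn Any;
    fn box_clone(&self) -> Box<dyn CudaOptimizerState>;
}

/// Hypothetical state for a plain SGD optimizer: nothing but a learning rate.
#[derive(Clone)]
struct SgdState {
    lr: f32,
}

impl CudaOptimizerState for SgdState {
    fn as_any(&self) -> &dyn Any {
        self
    }
    fn as_any_mut(&mut self) -> &mut dyn Any {
        self
    }
    fn box_clone(&self) -> Box<dyn CudaOptimizerState> {
        Box::new(self.clone())
    }
}

fn main() {
    let state: Box<dyn CudaOptimizerState> = Box::new(SgdState { lr: 1e-3 });
    let copy = state.box_clone();
    let sgd = copy.as_any().downcast_ref::<SgdState>().unwrap();
    println!("cloned state, lr = {}", sgd.lr);
}

A concrete optimizer would then downcast through as_any_mut inside its CudaOptimizerStepFn and return one update buffer per parameter tensor, which training_loop applies via apply_optimizer_updates.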
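The shuffled mini-batch assembly in training_loop, pulled out as a standalone host-side function: rows are gathered by shuffled index into contiguous buffers before being uploaded with memcpy_htod.

/// Gather the rows named by `indices` from row-major input/target arrays into
/// contiguous batch buffers, as the training loop does each iteration.
fn gather_batch(
    indices: &[usize],
    inputs: &[f32],   // (train_size, input_dims), row-major
    targets: &[f32],  // (train_size, output_dims), row-major
    input_dims: usize,
    output_dims: usize,
) -> (Vec<f32>, Vec<f32>) {
    let mut x = Vec::with_capacity(indices.len() * input_dims);
    let mut y = Vec::with_capacity(indices.len() * output_dims);
    for &i in indices {
        x.extend_from_slice(&inputs[i * input_dims..(i + 1) * input_dims]);
        y.extend_from_slice(&targets[i * output_dims..(i + 1) * output_dims]);
    }
    (x, y)
}

fn main() {
    let inputs = vec![0.0, 1.0, 2.0, 3.0];                       // 4 samples, input_dims = 1
    let targets = vec![0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5];  // output_dims = 2
    let (x, y) = gather_batch(&[2, 0], &inputs, &targets, 1, 2);
    assert_eq!(x, vec![2.0, 0.0]);
    assert_eq!(y, vec![2.0, 2.5, 0.0, 0.5]);
}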
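Finally, the early-stopping rule applied at the end of each epoch, restated as a small struct (EarlyStopper is an illustrative name): an epoch counts as an improvement only if it beats the lowest validation loss by more than min_loss_delta, and training stops after `patience` consecutive epochs without improvement; the returned solution is whatever to_solution captured at the last improvement.

/// Early-stopping bookkeeping matching the loop in training_loop.
struct EarlyStopper {
    lowest_loss: f32,
    min_loss_delta: f32,
    patience: usize,
    epochs_no_improvement: usize,
}

impl EarlyStopper {
    fn new(min_loss_delta: f32, patience: usize) -> Self {
        Self { lowest_loss: f32::INFINITY, min_loss_delta, patience, epochs_no_improvement: 0 }
    }

    /// Returns (is_new_best, should_stop) for this epoch's validation loss.
    fn update(&mut self, val_loss: f32) -> (bool, bool) {
        if val_loss < self.lowest_loss - self.min_loss_delta {
            self.lowest_loss = val_loss;
            self.epochs_no_improvement = 0;
            (true, false)
        } else {
            self.epochs_no_improvement += 1;
            (false, self.epochs_no_improvement >= self.patience)
        }
    }
}

fn main() {
    let mut stop = EarlyStopper::new(1e-7, 2);
    assert_eq!(stop.update(0.5), (true, false));
    assert_eq!(stop.update(0.5), (false, false)); // no improvement beyond the delta
    assert_eq!(stop.update(0.51), (false, true)); // patience exhausted
}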