diff --git a/.github/workflows/build_algorithm.yml b/.github/workflows/build_algorithm.yml index 30004d3..14725d0 100644 --- a/.github/workflows/build_algorithm.yml +++ b/.github/workflows/build_algorithm.yml @@ -8,16 +8,13 @@ on: - 'knapsack/*' - 'vector_search/*' - 'hypergraph/*' + - 'nn_training/*' - 'test/satisfiability/*' - 'test/vehicle_routing/*' - 'test/knapsack/*' - 'test/vector_search/*' - 'test/hypergraph/*' - - 'dev/satisfiability/*' - - 'dev/vehicle_routing/*' - - 'dev/knapsack/*' - - 'dev/vector_search/*' - - 'dev/hypergraph/*' + - 'test/nn_training/*' jobs: init: diff --git a/scripts/test_algorithm b/scripts/test_algorithm index 0576753..3024056 100644 --- a/scripts/test_algorithm +++ b/scripts/test_algorithm @@ -95,7 +95,7 @@ f"""Library not found at {so_path}: "knapsack": "c003", "vector_search": "c004", "hypergraph": "c005", - "optimiser": "c006", + "nn_training": "c006", } challenge_id = challenge_ids[CHALLENGE] diff --git a/tig-algorithms/Cargo.toml b/tig-algorithms/Cargo.toml index f8d1d22..248c18e 100644 --- a/tig-algorithms/Cargo.toml +++ b/tig-algorithms/Cargo.toml @@ -23,14 +23,15 @@ tig-challenges = { path = "../tig-challenges" } crate-type = ["cdylib", "rlib"] [features] -cuda = ["cudarc"] c001 = ["tig-challenges/c001"] satisfiability = ["c001"] c002 = ["tig-challenges/c002"] vehicle_routing = ["c002"] c003 = ["tig-challenges/c003"] knapsack = ["c003"] -c004 = ["cuda", "tig-challenges/c004"] +c004 = ["cudarc", "tig-challenges/c004"] vector_search = ["c004"] -c005 = ["cuda", "tig-challenges/c005"] +c005 = ["cudarc", "tig-challenges/c005"] hypergraph = ["c005"] +c006 = ["cudarc", "tig-challenges/c006"] +optimizer = ["c006"] diff --git a/tig-algorithms/src/lib.rs b/tig-algorithms/src/lib.rs index 67a1f54..9e60adb 100644 --- a/tig-algorithms/src/lib.rs +++ b/tig-algorithms/src/lib.rs @@ -31,3 +31,7 @@ pub use vector_search as c004; pub mod hypergraph; #[cfg(feature = "c005")] pub use hypergraph as c005; +#[cfg(feature = "c006")] +pub mod nn_training; +#[cfg(feature = "c006")] +pub use nn_training as c006; diff --git a/tig-algorithms/src/nn_training/mod.rs b/tig-algorithms/src/nn_training/mod.rs new file mode 100644 index 0000000..78868ab --- /dev/null +++ b/tig-algorithms/src/nn_training/mod.rs @@ -0,0 +1,1997 @@ +// c006_a001 + +// c006_a002 + +// c006_a003 + +// c006_a004 + +// c006_a005 + +// c006_a006 + +// c006_a007 + +// c006_a008 + +// c006_a009 + +// c006_a010 + +// c006_a011 + +// c006_a012 + +// c006_a013 + +// c006_a014 + +// c006_a015 + +// c006_a016 + +// c006_a017 + +// c006_a018 + +// c006_a019 + +// c006_a020 + +// c006_a021 + +// c006_a022 + +// c006_a023 + +// c006_a024 + +// c006_a025 + +// c006_a026 + +// c006_a027 + +// c006_a028 + +// c006_a029 + +// c006_a030 + +// c006_a031 + +// c006_a032 + +// c006_a033 + +// c006_a034 + +// c006_a035 + +// c006_a036 + +// c006_a037 + +// c006_a038 + +// c006_a039 + +// c006_a040 + +// c006_a041 + +// c006_a042 + +// c006_a043 + +// c006_a044 + +// c006_a045 + +// c006_a046 + +// c006_a047 + +// c006_a048 + +// c006_a049 + +// c006_a050 + +// c006_a051 + +// c006_a052 + +// c006_a053 + +// c006_a054 + +// c006_a055 + +// c006_a056 + +// c006_a057 + +// c006_a058 + +// c006_a059 + +// c006_a060 + +// c006_a061 + +// c006_a062 + +// c006_a063 + +// c006_a064 + +// c006_a065 + +// c006_a066 + +// c006_a067 + +// c006_a068 + +// c006_a069 + +// c006_a070 + +// c006_a071 + +// c006_a072 + +// c006_a073 + +// c006_a074 + +// c006_a075 + +// c006_a076 + +// c006_a077 + +// c006_a078 + +// c006_a079 + +// c006_a080 
+ +// c006_a081 + +// c006_a082 + +// c006_a083 + +// c006_a084 + +// c006_a085 + +// c006_a086 + +// c006_a087 + +// c006_a088 + +// c006_a089 + +// c006_a090 + +// c006_a091 + +// c006_a092 + +// c006_a093 + +// c006_a094 + +// c006_a095 + +// c006_a096 + +// c006_a097 + +// c006_a098 + +// c006_a099 + +// c006_a100 + +// c006_a101 + +// c006_a102 + +// c006_a103 + +// c006_a104 + +// c006_a105 + +// c006_a106 + +// c006_a107 + +// c006_a108 + +// c006_a109 + +// c006_a110 + +// c006_a111 + +// c006_a112 + +// c006_a113 + +// c006_a114 + +// c006_a115 + +// c006_a116 + +// c006_a117 + +// c006_a118 + +// c006_a119 + +// c006_a120 + +// c006_a121 + +// c006_a122 + +// c006_a123 + +// c006_a124 + +// c006_a125 + +// c006_a126 + +// c006_a127 + +// c006_a128 + +// c006_a129 + +// c006_a130 + +// c006_a131 + +// c006_a132 + +// c006_a133 + +// c006_a134 + +// c006_a135 + +// c006_a136 + +// c006_a137 + +// c006_a138 + +// c006_a139 + +// c006_a140 + +// c006_a141 + +// c006_a142 + +// c006_a143 + +// c006_a144 + +// c006_a145 + +// c006_a146 + +// c006_a147 + +// c006_a148 + +// c006_a149 + +// c006_a150 + +// c006_a151 + +// c006_a152 + +// c006_a153 + +// c006_a154 + +// c006_a155 + +// c006_a156 + +// c006_a157 + +// c006_a158 + +// c006_a159 + +// c006_a160 + +// c006_a161 + +// c006_a162 + +// c006_a163 + +// c006_a164 + +// c006_a165 + +// c006_a166 + +// c006_a167 + +// c006_a168 + +// c006_a169 + +// c006_a170 + +// c006_a171 + +// c006_a172 + +// c006_a173 + +// c006_a174 + +// c006_a175 + +// c006_a176 + +// c006_a177 + +// c006_a178 + +// c006_a179 + +// c006_a180 + +// c006_a181 + +// c006_a182 + +// c006_a183 + +// c006_a184 + +// c006_a185 + +// c006_a186 + +// c006_a187 + +// c006_a188 + +// c006_a189 + +// c006_a190 + +// c006_a191 + +// c006_a192 + +// c006_a193 + +// c006_a194 + +// c006_a195 + +// c006_a196 + +// c006_a197 + +// c006_a198 + +// c006_a199 + +// c006_a200 + +// c006_a201 + +// c006_a202 + +// c006_a203 + +// c006_a204 + +// c006_a205 + +// c006_a206 + +// c006_a207 + +// c006_a208 + +// c006_a209 + +// c006_a210 + +// c006_a211 + +// c006_a212 + +// c006_a213 + +// c006_a214 + +// c006_a215 + +// c006_a216 + +// c006_a217 + +// c006_a218 + +// c006_a219 + +// c006_a220 + +// c006_a221 + +// c006_a222 + +// c006_a223 + +// c006_a224 + +// c006_a225 + +// c006_a226 + +// c006_a227 + +// c006_a228 + +// c006_a229 + +// c006_a230 + +// c006_a231 + +// c006_a232 + +// c006_a233 + +// c006_a234 + +// c006_a235 + +// c006_a236 + +// c006_a237 + +// c006_a238 + +// c006_a239 + +// c006_a240 + +// c006_a241 + +// c006_a242 + +// c006_a243 + +// c006_a244 + +// c006_a245 + +// c006_a246 + +// c006_a247 + +// c006_a248 + +// c006_a249 + +// c006_a250 + +// c006_a251 + +// c006_a252 + +// c006_a253 + +// c006_a254 + +// c006_a255 + +// c006_a256 + +// c006_a257 + +// c006_a258 + +// c006_a259 + +// c006_a260 + +// c006_a261 + +// c006_a262 + +// c006_a263 + +// c006_a264 + +// c006_a265 + +// c006_a266 + +// c006_a267 + +// c006_a268 + +// c006_a269 + +// c006_a270 + +// c006_a271 + +// c006_a272 + +// c006_a273 + +// c006_a274 + +// c006_a275 + +// c006_a276 + +// c006_a277 + +// c006_a278 + +// c006_a279 + +// c006_a280 + +// c006_a281 + +// c006_a282 + +// c006_a283 + +// c006_a284 + +// c006_a285 + +// c006_a286 + +// c006_a287 + +// c006_a288 + +// c006_a289 + +// c006_a290 + +// c006_a291 + +// c006_a292 + +// c006_a293 + +// c006_a294 + +// c006_a295 + +// c006_a296 + +// c006_a297 + +// c006_a298 + +// c006_a299 + +// c006_a300 + +// c006_a301 + +// c006_a302 + 
+// c006_a303 + +// c006_a304 + +// c006_a305 + +// c006_a306 + +// c006_a307 + +// c006_a308 + +// c006_a309 + +// c006_a310 + +// c006_a311 + +// c006_a312 + +// c006_a313 + +// c006_a314 + +// c006_a315 + +// c006_a316 + +// c006_a317 + +// c006_a318 + +// c006_a319 + +// c006_a320 + +// c006_a321 + +// c006_a322 + +// c006_a323 + +// c006_a324 + +// c006_a325 + +// c006_a326 + +// c006_a327 + +// c006_a328 + +// c006_a329 + +// c006_a330 + +// c006_a331 + +// c006_a332 + +// c006_a333 + +// c006_a334 + +// c006_a335 + +// c006_a336 + +// c006_a337 + +// c006_a338 + +// c006_a339 + +// c006_a340 + +// c006_a341 + +// c006_a342 + +// c006_a343 + +// c006_a344 + +// c006_a345 + +// c006_a346 + +// c006_a347 + +// c006_a348 + +// c006_a349 + +// c006_a350 + +// c006_a351 + +// c006_a352 + +// c006_a353 + +// c006_a354 + +// c006_a355 + +// c006_a356 + +// c006_a357 + +// c006_a358 + +// c006_a359 + +// c006_a360 + +// c006_a361 + +// c006_a362 + +// c006_a363 + +// c006_a364 + +// c006_a365 + +// c006_a366 + +// c006_a367 + +// c006_a368 + +// c006_a369 + +// c006_a370 + +// c006_a371 + +// c006_a372 + +// c006_a373 + +// c006_a374 + +// c006_a375 + +// c006_a376 + +// c006_a377 + +// c006_a378 + +// c006_a379 + +// c006_a380 + +// c006_a381 + +// c006_a382 + +// c006_a383 + +// c006_a384 + +// c006_a385 + +// c006_a386 + +// c006_a387 + +// c006_a388 + +// c006_a389 + +// c006_a390 + +// c006_a391 + +// c006_a392 + +// c006_a393 + +// c006_a394 + +// c006_a395 + +// c006_a396 + +// c006_a397 + +// c006_a398 + +// c006_a399 + +// c006_a400 + +// c006_a401 + +// c006_a402 + +// c006_a403 + +// c006_a404 + +// c006_a405 + +// c006_a406 + +// c006_a407 + +// c006_a408 + +// c006_a409 + +// c006_a410 + +// c006_a411 + +// c006_a412 + +// c006_a413 + +// c006_a414 + +// c006_a415 + +// c006_a416 + +// c006_a417 + +// c006_a418 + +// c006_a419 + +// c006_a420 + +// c006_a421 + +// c006_a422 + +// c006_a423 + +// c006_a424 + +// c006_a425 + +// c006_a426 + +// c006_a427 + +// c006_a428 + +// c006_a429 + +// c006_a430 + +// c006_a431 + +// c006_a432 + +// c006_a433 + +// c006_a434 + +// c006_a435 + +// c006_a436 + +// c006_a437 + +// c006_a438 + +// c006_a439 + +// c006_a440 + +// c006_a441 + +// c006_a442 + +// c006_a443 + +// c006_a444 + +// c006_a445 + +// c006_a446 + +// c006_a447 + +// c006_a448 + +// c006_a449 + +// c006_a450 + +// c006_a451 + +// c006_a452 + +// c006_a453 + +// c006_a454 + +// c006_a455 + +// c006_a456 + +// c006_a457 + +// c006_a458 + +// c006_a459 + +// c006_a460 + +// c006_a461 + +// c006_a462 + +// c006_a463 + +// c006_a464 + +// c006_a465 + +// c006_a466 + +// c006_a467 + +// c006_a468 + +// c006_a469 + +// c006_a470 + +// c006_a471 + +// c006_a472 + +// c006_a473 + +// c006_a474 + +// c006_a475 + +// c006_a476 + +// c006_a477 + +// c006_a478 + +// c006_a479 + +// c006_a480 + +// c006_a481 + +// c006_a482 + +// c006_a483 + +// c006_a484 + +// c006_a485 + +// c006_a486 + +// c006_a487 + +// c006_a488 + +// c006_a489 + +// c006_a490 + +// c006_a491 + +// c006_a492 + +// c006_a493 + +// c006_a494 + +// c006_a495 + +// c006_a496 + +// c006_a497 + +// c006_a498 + +// c006_a499 + +// c006_a500 + +// c006_a501 + +// c006_a502 + +// c006_a503 + +// c006_a504 + +// c006_a505 + +// c006_a506 + +// c006_a507 + +// c006_a508 + +// c006_a509 + +// c006_a510 + +// c006_a511 + +// c006_a512 + +// c006_a513 + +// c006_a514 + +// c006_a515 + +// c006_a516 + +// c006_a517 + +// c006_a518 + +// c006_a519 + +// c006_a520 + +// c006_a521 + +// c006_a522 + +// c006_a523 + +// c006_a524 + 
+// c006_a525 + +// c006_a526 + +// c006_a527 + +// c006_a528 + +// c006_a529 + +// c006_a530 + +// c006_a531 + +// c006_a532 + +// c006_a533 + +// c006_a534 + +// c006_a535 + +// c006_a536 + +// c006_a537 + +// c006_a538 + +// c006_a539 + +// c006_a540 + +// c006_a541 + +// c006_a542 + +// c006_a543 + +// c006_a544 + +// c006_a545 + +// c006_a546 + +// c006_a547 + +// c006_a548 + +// c006_a549 + +// c006_a550 + +// c006_a551 + +// c006_a552 + +// c006_a553 + +// c006_a554 + +// c006_a555 + +// c006_a556 + +// c006_a557 + +// c006_a558 + +// c006_a559 + +// c006_a560 + +// c006_a561 + +// c006_a562 + +// c006_a563 + +// c006_a564 + +// c006_a565 + +// c006_a566 + +// c006_a567 + +// c006_a568 + +// c006_a569 + +// c006_a570 + +// c006_a571 + +// c006_a572 + +// c006_a573 + +// c006_a574 + +// c006_a575 + +// c006_a576 + +// c006_a577 + +// c006_a578 + +// c006_a579 + +// c006_a580 + +// c006_a581 + +// c006_a582 + +// c006_a583 + +// c006_a584 + +// c006_a585 + +// c006_a586 + +// c006_a587 + +// c006_a588 + +// c006_a589 + +// c006_a590 + +// c006_a591 + +// c006_a592 + +// c006_a593 + +// c006_a594 + +// c006_a595 + +// c006_a596 + +// c006_a597 + +// c006_a598 + +// c006_a599 + +// c006_a600 + +// c006_a601 + +// c006_a602 + +// c006_a603 + +// c006_a604 + +// c006_a605 + +// c006_a606 + +// c006_a607 + +// c006_a608 + +// c006_a609 + +// c006_a610 + +// c006_a611 + +// c006_a612 + +// c006_a613 + +// c006_a614 + +// c006_a615 + +// c006_a616 + +// c006_a617 + +// c006_a618 + +// c006_a619 + +// c006_a620 + +// c006_a621 + +// c006_a622 + +// c006_a623 + +// c006_a624 + +// c006_a625 + +// c006_a626 + +// c006_a627 + +// c006_a628 + +// c006_a629 + +// c006_a630 + +// c006_a631 + +// c006_a632 + +// c006_a633 + +// c006_a634 + +// c006_a635 + +// c006_a636 + +// c006_a637 + +// c006_a638 + +// c006_a639 + +// c006_a640 + +// c006_a641 + +// c006_a642 + +// c006_a643 + +// c006_a644 + +// c006_a645 + +// c006_a646 + +// c006_a647 + +// c006_a648 + +// c006_a649 + +// c006_a650 + +// c006_a651 + +// c006_a652 + +// c006_a653 + +// c006_a654 + +// c006_a655 + +// c006_a656 + +// c006_a657 + +// c006_a658 + +// c006_a659 + +// c006_a660 + +// c006_a661 + +// c006_a662 + +// c006_a663 + +// c006_a664 + +// c006_a665 + +// c006_a666 + +// c006_a667 + +// c006_a668 + +// c006_a669 + +// c006_a670 + +// c006_a671 + +// c006_a672 + +// c006_a673 + +// c006_a674 + +// c006_a675 + +// c006_a676 + +// c006_a677 + +// c006_a678 + +// c006_a679 + +// c006_a680 + +// c006_a681 + +// c006_a682 + +// c006_a683 + +// c006_a684 + +// c006_a685 + +// c006_a686 + +// c006_a687 + +// c006_a688 + +// c006_a689 + +// c006_a690 + +// c006_a691 + +// c006_a692 + +// c006_a693 + +// c006_a694 + +// c006_a695 + +// c006_a696 + +// c006_a697 + +// c006_a698 + +// c006_a699 + +// c006_a700 + +// c006_a701 + +// c006_a702 + +// c006_a703 + +// c006_a704 + +// c006_a705 + +// c006_a706 + +// c006_a707 + +// c006_a708 + +// c006_a709 + +// c006_a710 + +// c006_a711 + +// c006_a712 + +// c006_a713 + +// c006_a714 + +// c006_a715 + +// c006_a716 + +// c006_a717 + +// c006_a718 + +// c006_a719 + +// c006_a720 + +// c006_a721 + +// c006_a722 + +// c006_a723 + +// c006_a724 + +// c006_a725 + +// c006_a726 + +// c006_a727 + +// c006_a728 + +// c006_a729 + +// c006_a730 + +// c006_a731 + +// c006_a732 + +// c006_a733 + +// c006_a734 + +// c006_a735 + +// c006_a736 + +// c006_a737 + +// c006_a738 + +// c006_a739 + +// c006_a740 + +// c006_a741 + +// c006_a742 + +// c006_a743 + +// c006_a744 + +// c006_a745 + +// c006_a746 + 
+// c006_a747 + +// c006_a748 + +// c006_a749 + +// c006_a750 + +// c006_a751 + +// c006_a752 + +// c006_a753 + +// c006_a754 + +// c006_a755 + +// c006_a756 + +// c006_a757 + +// c006_a758 + +// c006_a759 + +// c006_a760 + +// c006_a761 + +// c006_a762 + +// c006_a763 + +// c006_a764 + +// c006_a765 + +// c006_a766 + +// c006_a767 + +// c006_a768 + +// c006_a769 + +// c006_a770 + +// c006_a771 + +// c006_a772 + +// c006_a773 + +// c006_a774 + +// c006_a775 + +// c006_a776 + +// c006_a777 + +// c006_a778 + +// c006_a779 + +// c006_a780 + +// c006_a781 + +// c006_a782 + +// c006_a783 + +// c006_a784 + +// c006_a785 + +// c006_a786 + +// c006_a787 + +// c006_a788 + +// c006_a789 + +// c006_a790 + +// c006_a791 + +// c006_a792 + +// c006_a793 + +// c006_a794 + +// c006_a795 + +// c006_a796 + +// c006_a797 + +// c006_a798 + +// c006_a799 + +// c006_a800 + +// c006_a801 + +// c006_a802 + +// c006_a803 + +// c006_a804 + +// c006_a805 + +// c006_a806 + +// c006_a807 + +// c006_a808 + +// c006_a809 + +// c006_a810 + +// c006_a811 + +// c006_a812 + +// c006_a813 + +// c006_a814 + +// c006_a815 + +// c006_a816 + +// c006_a817 + +// c006_a818 + +// c006_a819 + +// c006_a820 + +// c006_a821 + +// c006_a822 + +// c006_a823 + +// c006_a824 + +// c006_a825 + +// c006_a826 + +// c006_a827 + +// c006_a828 + +// c006_a829 + +// c006_a830 + +// c006_a831 + +// c006_a832 + +// c006_a833 + +// c006_a834 + +// c006_a835 + +// c006_a836 + +// c006_a837 + +// c006_a838 + +// c006_a839 + +// c006_a840 + +// c006_a841 + +// c006_a842 + +// c006_a843 + +// c006_a844 + +// c006_a845 + +// c006_a846 + +// c006_a847 + +// c006_a848 + +// c006_a849 + +// c006_a850 + +// c006_a851 + +// c006_a852 + +// c006_a853 + +// c006_a854 + +// c006_a855 + +// c006_a856 + +// c006_a857 + +// c006_a858 + +// c006_a859 + +// c006_a860 + +// c006_a861 + +// c006_a862 + +// c006_a863 + +// c006_a864 + +// c006_a865 + +// c006_a866 + +// c006_a867 + +// c006_a868 + +// c006_a869 + +// c006_a870 + +// c006_a871 + +// c006_a872 + +// c006_a873 + +// c006_a874 + +// c006_a875 + +// c006_a876 + +// c006_a877 + +// c006_a878 + +// c006_a879 + +// c006_a880 + +// c006_a881 + +// c006_a882 + +// c006_a883 + +// c006_a884 + +// c006_a885 + +// c006_a886 + +// c006_a887 + +// c006_a888 + +// c006_a889 + +// c006_a890 + +// c006_a891 + +// c006_a892 + +// c006_a893 + +// c006_a894 + +// c006_a895 + +// c006_a896 + +// c006_a897 + +// c006_a898 + +// c006_a899 + +// c006_a900 + +// c006_a901 + +// c006_a902 + +// c006_a903 + +// c006_a904 + +// c006_a905 + +// c006_a906 + +// c006_a907 + +// c006_a908 + +// c006_a909 + +// c006_a910 + +// c006_a911 + +// c006_a912 + +// c006_a913 + +// c006_a914 + +// c006_a915 + +// c006_a916 + +// c006_a917 + +// c006_a918 + +// c006_a919 + +// c006_a920 + +// c006_a921 + +// c006_a922 + +// c006_a923 + +// c006_a924 + +// c006_a925 + +// c006_a926 + +// c006_a927 + +// c006_a928 + +// c006_a929 + +// c006_a930 + +// c006_a931 + +// c006_a932 + +// c006_a933 + +// c006_a934 + +// c006_a935 + +// c006_a936 + +// c006_a937 + +// c006_a938 + +// c006_a939 + +// c006_a940 + +// c006_a941 + +// c006_a942 + +// c006_a943 + +// c006_a944 + +// c006_a945 + +// c006_a946 + +// c006_a947 + +// c006_a948 + +// c006_a949 + +// c006_a950 + +// c006_a951 + +// c006_a952 + +// c006_a953 + +// c006_a954 + +// c006_a955 + +// c006_a956 + +// c006_a957 + +// c006_a958 + +// c006_a959 + +// c006_a960 + +// c006_a961 + +// c006_a962 + +// c006_a963 + +// c006_a964 + +// c006_a965 + +// c006_a966 + +// c006_a967 + +// c006_a968 + 
+// c006_a969 + +// c006_a970 + +// c006_a971 + +// c006_a972 + +// c006_a973 + +// c006_a974 + +// c006_a975 + +// c006_a976 + +// c006_a977 + +// c006_a978 + +// c006_a979 + +// c006_a980 + +// c006_a981 + +// c006_a982 + +// c006_a983 + +// c006_a984 + +// c006_a985 + +// c006_a986 + +// c006_a987 + +// c006_a988 + +// c006_a989 + +// c006_a990 + +// c006_a991 + +// c006_a992 + +// c006_a993 + +// c006_a994 + +// c006_a995 + +// c006_a996 + +// c006_a997 + +// c006_a998 + +// c006_a999 diff --git a/tig-binary/Cargo.toml b/tig-binary/Cargo.toml index 008ebbc..417d3b7 100644 --- a/tig-binary/Cargo.toml +++ b/tig-binary/Cargo.toml @@ -30,3 +30,5 @@ c004 = ["cuda", "tig-algorithms/c004", "tig-challenges/c004"] vector_search = ["c004"] c005 = ["cuda", "tig-algorithms/c005", "tig-challenges/c005"] hypergraph = ["c005"] +c006 = ["cuda", "tig-algorithms/c006", "tig-challenges/c006"] +nn_training = ["c006"] diff --git a/tig-binary/scripts/build_algorithm b/tig-binary/scripts/build_algorithm index cb6323f..8b38e33 100644 --- a/tig-binary/scripts/build_algorithm +++ b/tig-binary/scripts/build_algorithm @@ -41,8 +41,13 @@ case "$CHALLENGE" in build_so $ALGORITHM build_ptx $ALGORITHM ;; + nn_training) + echo "Building ALGORITHM '$ALGORITHM' for CHALLENGE 'nn_training'" + build_so $ALGORITHM + build_ptx $ALGORITHM --extra-cu-files tig-challenges/src/nn/kernels.cu + ;; *) - echo "Error: Invalid CHALLENGE value. Must be one of: satisfiability, knapsack, vehicle_routing, vector_search, hypergraph" + echo "Error: Invalid CHALLENGE value. Must be one of: satisfiability, knapsack, vehicle_routing, vector_search, hypergraph, nn_training" exit 1 ;; esac \ No newline at end of file diff --git a/tig-binary/scripts/build_ptx b/tig-binary/scripts/build_ptx index ed88b6a..4ff5bec 100644 --- a/tig-binary/scripts/build_ptx +++ b/tig-binary/scripts/build_ptx @@ -237,6 +237,7 @@ $NORMAL_EXIT: def main(): parser = argparse.ArgumentParser(description='Compile PTX with injected runtime signature') parser.add_argument('algorithm', help='Algorithm name') + parser.add_argument('--extra-cu-files', nargs='*', default=[], help='Additional .cu files to include in the compilation') args = parser.parse_args() @@ -272,6 +273,11 @@ def main(): code = f.read() + "\n" with open(challenge_cu, 'r') as f: code += f.read() + "\n" + for extra_cu in args.extra_cu_files: + if not os.path.exists(extra_cu): + raise FileNotFoundError(f"Extra .cu file does not exist: {extra_cu}") + with open(extra_cu, 'r') as f: + code += f.read() + "\n" kernel_regex = r'(?:extern\s+"C"\s+__global__|__device__)\s+\w+\s+(?P\w+)\s*\(' kernels_to_ignore = [match.group('func') for match in re.finditer(kernel_regex, code)] with open(algorithm_cu, 'r') as f: diff --git a/tig-challenges/Cargo.toml b/tig-challenges/Cargo.toml index 7f42495..55ac0db 100644 --- a/tig-challenges/Cargo.toml +++ b/tig-challenges/Cargo.toml @@ -22,14 +22,15 @@ serde_json = { version = "1.0.113" } statrs = { version = "0.18.0" } [features] -cuda = ["cudarc"] c001 = [] satisfiability = ["c001"] c002 = [] vehicle_routing = ["c002"] c003 = [] knapsack = ["c003"] -c004 = ["cuda"] +c004 = ["cudarc"] vector_search = ["c004"] -c005 = ["cuda"] +c005 = ["cudarc"] hypergraph = ["c005"] +c006 = ["cudarc", "cudarc/cublas", "cudarc/cudnn"] +nn_training = ["c006"] diff --git a/tig-challenges/src/lib.rs b/tig-challenges/src/lib.rs index c900e80..c908d46 100644 --- a/tig-challenges/src/lib.rs +++ b/tig-challenges/src/lib.rs @@ -20,3 +20,9 @@ pub use vector_search as c004; pub mod hypergraph; #[cfg(feature = 
"c005")] pub use hypergraph as c005; +#[cfg(feature = "c006")] +pub(crate) mod nn; +#[cfg(feature = "c006")] +pub mod nn_training; +#[cfg(feature = "c006")] +pub use nn_training as c006; diff --git a/tig-challenges/src/nn/batch_norm.rs b/tig-challenges/src/nn/batch_norm.rs new file mode 100644 index 0000000..0bbb5cd --- /dev/null +++ b/tig-challenges/src/nn/batch_norm.rs @@ -0,0 +1,338 @@ +use super::CudnnTensorDescriptor; +use anyhow::Result; +use cudarc::{ + cudnn::{result::CudnnError, sys::*, Cudnn}, + driver::{ + CudaModule, CudaSlice, CudaStream, DevicePtr, DevicePtrMut, LaunchConfig, PushKernelArg, + }, +}; +use std::sync::Arc; + +const THREADS_PER_BLOCK: u32 = 1024; + +pub struct BatchNorm1d { + num_features: usize, + momentum: f64, + eps: f64, + pub weight: CudaSlice, + pub bias: CudaSlice, + pub running_mean: CudaSlice, + pub running_var: CudaSlice, + pub requires_grad: bool, + pub weight_grad: Option>, + pub bias_grad: Option>, + // cuDNN specific cache for backward pass + saved_mean: CudaSlice, + saved_inv_variance: CudaSlice, +} + +impl BatchNorm1d { + pub fn new( + num_features: usize, + momentum: f64, + eps: f64, + requires_grad: bool, + stream: Arc, + ) -> Result { + let weight = stream.memcpy_stod(&vec![1.0; num_features])?; // Init with ones (scale) + let bias = stream.alloc_zeros::(num_features)?; + let running_mean = stream.alloc_zeros::(num_features)?; + let running_var = stream.memcpy_stod(&vec![1.0; num_features])?; // Init with ones + + let (weight_grad, bias_grad) = if requires_grad { + ( + Some(stream.alloc_zeros::(num_features)?), + Some(stream.alloc_zeros::(num_features)?), + ) + } else { + (None, None) + }; + + // These are populated by forward pass for use in backward pass + let saved_mean = stream.alloc_zeros::(num_features)?; + let saved_inv_variance = stream.alloc_zeros::(num_features)?; + + Ok(Self { + num_features, + momentum, + eps, + weight, + bias, + running_mean, + running_var, + requires_grad, + weight_grad, + bias_grad, + saved_mean, + saved_inv_variance, + }) + } + + pub fn forward>( + &mut self, + input: &I, + training: bool, + stream: Arc, + cudnn: &Cudnn, + ) -> Result> { + let batch_size = input.len() / self.num_features; + let mut output = stream.alloc_zeros::(input.len())?; + + // For 1D batch norm, set up tensors as (N, C, 1, 1) but use SPATIAL mode + let mut x_desc = CudnnTensorDescriptor::new()?; + x_desc.set_4d( + cudnnTensorFormat_t::CUDNN_TENSOR_NCHW, + cudnnDataType_t::CUDNN_DATA_FLOAT, + batch_size as i32, + self.num_features as i32, + 1, + 1, + )?; + let mut y_desc = CudnnTensorDescriptor::new()?; + y_desc.set_4d( + cudnnTensorFormat_t::CUDNN_TENSOR_NCHW, + cudnnDataType_t::CUDNN_DATA_FLOAT, + batch_size as i32, + self.num_features as i32, + 1, + 1, + )?; + let mut derived_bn_desc = CudnnTensorDescriptor::new()?; + derived_bn_desc.set_4d( + cudnnTensorFormat_t::CUDNN_TENSOR_NCHW, + cudnnDataType_t::CUDNN_DATA_FLOAT, + 1, + self.num_features as i32, + 1, + 1, + )?; + + let alpha = 1.0f32; + let beta = 0.0f32; + let alpha_ptr = &alpha as *const f32 as *const std::ffi::c_void; + let beta_ptr = &beta as *const f32 as *const std::ffi::c_void; + + // Use SPATIAL mode for 1D batch normalization + let mode = cudnnBatchNormMode_t::CUDNN_BATCHNORM_SPATIAL; + + let status = if training { + unsafe { + cudnnBatchNormalizationForwardTraining( + cudnn.handle, + mode, + alpha_ptr, + beta_ptr, + *x_desc, + input.device_ptr(&stream).0 as *const _, + *y_desc, + output.device_ptr_mut(&stream).0 as *mut _, + *derived_bn_desc, + 
self.weight.device_ptr(&stream).0 as *const _, + self.bias.device_ptr(&stream).0 as *const _, + self.momentum, + self.running_mean.device_ptr_mut(&stream).0 as *mut _, + self.running_var.device_ptr_mut(&stream).0 as *mut _, + self.eps, + self.saved_mean.device_ptr_mut(&stream).0 as *mut _, + self.saved_inv_variance.device_ptr_mut(&stream).0 as *mut _, + ) + } + } else { + unsafe { + cudnnBatchNormalizationForwardInference( + cudnn.handle, + mode, + alpha_ptr, + beta_ptr, + *x_desc, + input.device_ptr(&stream).0 as *const _, + *y_desc, + output.device_ptr_mut(&stream).0 as *mut _, + *derived_bn_desc, + self.weight.device_ptr(&stream).0 as *const _, + self.bias.device_ptr(&stream).0 as *const _, + self.running_mean.device_ptr(&stream).0 as *const _, + self.running_var.device_ptr(&stream).0 as *const _, + self.eps, + ) + } + }; + + // Debug: Check saved_mean and saved_inv_variance after forward pass if training + if training { + let mut saved_mean_sample = vec![0.0f32; 5.min(self.saved_mean.len())]; + let mut saved_inv_variance_sample = vec![0.0f32; 5.min(self.saved_inv_variance.len())]; + stream.memcpy_dtoh( + &self.saved_mean.slice(0..5.min(self.saved_mean.len())), + &mut saved_mean_sample, + )?; + stream.memcpy_dtoh( + &self + .saved_inv_variance + .slice(0..5.min(self.saved_inv_variance.len())), + &mut saved_inv_variance_sample, + )?; + stream.synchronize()?; + } + + if status == cudnnStatus_t::CUDNN_STATUS_SUCCESS { + Ok(output) + } else { + Err(CudnnError(status).into()) + } + } + + pub fn backward( + &mut self, + input: &CudaSlice, + grad_output: &CudaSlice, + should_accumulate_gradients: bool, + stream: Arc, + cudnn: &Cudnn, + _module: Arc, + ) -> Result> { + let batch_size = input.len() / self.num_features; + let mut grad_input = stream.alloc_zeros::(input.len())?; + + // Set up tensor descriptors (same as forward pass) + let mut x_desc = CudnnTensorDescriptor::new()?; + x_desc.set_4d( + cudnnTensorFormat_t::CUDNN_TENSOR_NCHW, + cudnnDataType_t::CUDNN_DATA_FLOAT, + batch_size as i32, + self.num_features as i32, + 1, + 1, + )?; + + let mut dy_desc = CudnnTensorDescriptor::new()?; + dy_desc.set_4d( + cudnnTensorFormat_t::CUDNN_TENSOR_NCHW, + cudnnDataType_t::CUDNN_DATA_FLOAT, + batch_size as i32, + self.num_features as i32, + 1, + 1, + )?; + + let mut dx_desc = CudnnTensorDescriptor::new()?; + dx_desc.set_4d( + cudnnTensorFormat_t::CUDNN_TENSOR_NCHW, + cudnnDataType_t::CUDNN_DATA_FLOAT, + batch_size as i32, + self.num_features as i32, + 1, + 1, + )?; + + let mut derived_bn_desc = CudnnTensorDescriptor::new()?; + derived_bn_desc.set_4d( + cudnnTensorFormat_t::CUDNN_TENSOR_NCHW, + cudnnDataType_t::CUDNN_DATA_FLOAT, + 1, + self.num_features as i32, + 1, + 1, + )?; + + let alpha_data = 1.0f32; + let beta_data = 0.0f32; + let alpha_param = 1.0f32; + let beta_param = if self.requires_grad && should_accumulate_gradients { + 1.0f32 + } else { + 0.0f32 + }; // Accumulate if trainable + + let alpha_data_ptr = &alpha_data as *const f32 as *const std::ffi::c_void; + let beta_data_ptr = &beta_data as *const f32 as *const std::ffi::c_void; + let alpha_param_ptr = &alpha_param as *const f32 as *const std::ffi::c_void; + let beta_param_ptr = &beta_param as *const f32 as *const std::ffi::c_void; + + // Use SPATIAL mode (same as forward) + let mode = cudnnBatchNormMode_t::CUDNN_BATCHNORM_SPATIAL; + + let (mut temp_wg, mut temp_bg); // Must live long enough + + let (wg, bg) = if self.requires_grad { + ( + self.weight_grad.as_mut().unwrap(), + self.bias_grad.as_mut().unwrap(), + ) + } else { + // Use 
temporary buffers if grads are not required for this layer + temp_wg = Some(stream.alloc_zeros::(self.num_features)?); + temp_bg = Some(stream.alloc_zeros::(self.num_features)?); + (temp_wg.as_mut().unwrap(), temp_bg.as_mut().unwrap()) + }; + + let status = unsafe { + cudnnBatchNormalizationBackward( + cudnn.handle, + mode, + alpha_data_ptr, // alphaDataDiff + beta_data_ptr, // betaDataDiff + alpha_param_ptr, // alphaParamDiff + beta_param_ptr, // betaParamDiff (use 1.0 to accumulate, 0.0 to overwrite) + *x_desc, // xDesc + input.device_ptr(&stream).0 as *const _, // x + *dy_desc, // dyDesc + grad_output.device_ptr(&stream).0 as *const _, // dy + *dx_desc, // dxDesc + grad_input.device_ptr_mut(&stream).0 as *mut _, // dx + *derived_bn_desc, // dBnScaleBiasDesc + self.weight.device_ptr(&stream).0 as *const _, // bnScale + wg.device_ptr_mut(&stream).0 as *mut _, // dBnScaleResult (weight gradients) + bg.device_ptr_mut(&stream).0 as *mut _, // dBnBiasResult (bias gradients) + self.eps, // epsilon + self.saved_mean.device_ptr(&stream).0 as *const _, // savedMean + self.saved_inv_variance.device_ptr(&stream).0 as *const _, // savedInvVariance + ) + }; + + if status != cudnnStatus_t::CUDNN_STATUS_SUCCESS { + return Err(CudnnError(status).into()); + } + + Ok(grad_input) + } + + pub fn zero_grad(&mut self, stream: Arc, module: Arc) -> Result<()> { + let zero_kernel = module.load_function("zero_out")?; + if let Some(wg) = self.weight_grad.as_mut() { + let n = wg.len() as u32; + let grid_dim = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + let cfg = LaunchConfig { + grid_dim: (grid_dim, 1, 1), + block_dim: (THREADS_PER_BLOCK, 1, 1), + shared_mem_bytes: 0, + }; + let n_i32 = n as i32; + unsafe { + stream + .launch_builder(&zero_kernel) + .arg(wg) + .arg(&n_i32) + .launch(cfg)?; + }; + } + if let Some(bg) = self.bias_grad.as_mut() { + let n = bg.len() as u32; + let grid_dim = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + let cfg = LaunchConfig { + grid_dim: (grid_dim, 1, 1), + block_dim: (THREADS_PER_BLOCK, 1, 1), + shared_mem_bytes: 0, + }; + let n_i32 = n as i32; + unsafe { + stream + .launch_builder(&zero_kernel) + .arg(bg) + .arg(&n_i32) + .launch(cfg)?; + }; + } + Ok(()) + } +} diff --git a/tig-challenges/src/nn/kernels.cu b/tig-challenges/src/nn/kernels.cu new file mode 100644 index 0000000..4578664 --- /dev/null +++ b/tig-challenges/src/nn/kernels.cu @@ -0,0 +1,157 @@ +#include +#include +#include +#include +#include + + +__device__ float relu(float x) { + return fmaxf(x, 0.0f); +} + +extern "C" __global__ void init_linear_layer( + unsigned long long seed, + int out_features, + int in_features, + float* weights, // (out_features, in_features) + float* biases // (out_features) +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= out_features * in_features) return; + + curandState state; + curand_init(seed + idx, 0, 0, &state); + + float fan_in = (float)in_features; + float fan_out = (float)out_features; + float limit = sqrtf(2.0f / (fan_in + fan_out)) * 0.5f; + + weights[idx] = curand_uniform(&state) * 2.0f * limit - limit; + + // Initialize biases to zero (can be done by one thread or memset) + if (idx < out_features) { + biases[idx] = 0.0f; + } +} + +extern "C" __global__ void zero_out(float* data, int n) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + data[idx] = 0.0f; + } +} + +extern "C" __global__ void add_bias_forward(float* output, const float* bias, int batch_size, int features) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if 
(idx >= batch_size * features) return; + + int feature_idx = idx % features; + output[idx] += bias[feature_idx]; +} + +extern "C" __global__ void activation_forward(float* data, int n) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n) return; + data[idx] = relu(data[idx]); +} + +extern "C" __global__ void loss_mse( + const float* output, // (batch_size, out_features) + const float* target, // (batch_size, out_features) + int batch_size, + int out_features, + float* grad_loss, // (batch_size, out_features) + float* total_loss_out // single element +) { + extern __shared__ float s_loss[]; // size = blockDim.x + int tid = threadIdx.x; + int idx = blockIdx.x * blockDim.x + tid; + + float element_loss_sum = 0.0f; + int n = batch_size * out_features; + + for (int i = idx; i < n; i += gridDim.x * blockDim.x) { + float diff = output[i] - target[i]; + grad_loss[i] = 2.0f * diff;// / batch_size; + element_loss_sum += diff * diff; + } + s_loss[tid] = element_loss_sum; + __syncthreads(); + + // Reduction in shared memory + for (int s = blockDim.x / 2; s > 0; s >>= 1) { + if (tid < s) { + s_loss[tid] += s_loss[tid + s]; + } + __syncthreads(); + } + + if (tid == 0) { + atomicAdd(total_loss_out, s_loss[0] / n); + } +} + +extern "C" __global__ void activation_backward( + const float* grad_in, + const float* pre_act_vals, // Input to the activation function from forward pass + int n, + float* grad_out +) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) { + // If pre-activation value was positive, gradient passes through. Otherwise, it's zero. + grad_out[i] = pre_act_vals[i] > 0.0f ? grad_in[i] : 0.0f; + } +} + +extern "C" __global__ void backward_bias( + const float* grad_output, // (batch_size, out_features) + float* bias_grad, // (out_features) + int batch_size, + int out_features +) { + extern __shared__ float s_grad_sum[]; // size = out_features + int feature_idx = blockIdx.x; // Each block responsible for one feature + + if (feature_idx >= out_features) return; + + float sum = 0.0f; + for(int i = threadIdx.x; i < batch_size; i += blockDim.x) { + sum += grad_output[i * out_features + feature_idx]; + } + s_grad_sum[threadIdx.x] = sum; + __syncthreads(); + + for (int s = blockDim.x / 2; s > 0; s >>= 1) { + if (threadIdx.x < s) { + s_grad_sum[threadIdx.x] += s_grad_sum[threadIdx.x + s]; + } + __syncthreads(); + } + + if(threadIdx.x == 0) { + atomicAdd(&bias_grad[feature_idx], s_grad_sum[0]); + } +} + +extern "C" __global__ void apply_parameter_updates_direct( + float* params, + const float* updates, + int n +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + params[idx] += updates[idx]; // Direct addition (updates already scaled) + } +} + +extern "C" __global__ void copy_tensor( + float* dst, + const float* src, + int n +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + dst[idx] = src[idx]; + } +} \ No newline at end of file diff --git a/tig-challenges/src/nn/linear.rs b/tig-challenges/src/nn/linear.rs new file mode 100644 index 0000000..6fb01aa --- /dev/null +++ b/tig-challenges/src/nn/linear.rs @@ -0,0 +1,251 @@ +use anyhow::Result; +use cudarc::{ + cublas::{sys::cublasOperation_t, CudaBlas, Gemm, GemmConfig}, + driver::{CudaModule, CudaSlice, CudaStream, DevicePtr, LaunchConfig, PushKernelArg}, +}; +use std::sync::Arc; + +const THREADS_PER_BLOCK: u32 = 1024; + +pub struct Linear { + pub in_features: usize, + pub out_features: usize, + pub weight: CudaSlice, + pub bias: CudaSlice, + pub requires_grad: bool, + pub 
weight_grad: Option>, + pub bias_grad: Option>, +} + +impl Linear { + pub fn new( + in_features: usize, + out_features: usize, + requires_grad: bool, + stream: Arc, + ) -> Result { + let weight = stream.alloc_zeros::(out_features * in_features)?; + let bias = stream.alloc_zeros::(out_features)?; + let (weight_grad, bias_grad) = if requires_grad { + ( + Some(stream.alloc_zeros::(out_features * in_features)?), + Some(stream.alloc_zeros::(out_features)?), + ) + } else { + (None, None) + }; + Ok(Self { + in_features, + out_features, + weight, + bias, + requires_grad, + weight_grad, + bias_grad, + }) + } + + pub fn init_weights( + &mut self, + seed: [u8; 32], + stream: Arc, + module: Arc, + ) -> Result<()> { + let kernel = module.load_function("init_linear_layer")?; + let n = (self.out_features * self.in_features) as u32; + let grid_dim = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + let cfg = LaunchConfig { + grid_dim: (grid_dim, 1, 1), + block_dim: (THREADS_PER_BLOCK, 1, 1), + shared_mem_bytes: 0, + }; + let d_seed = stream.memcpy_stod(&seed)?; + let out_f = self.out_features as i32; + let in_f = self.in_features as i32; + unsafe { + stream + .launch_builder(&kernel) + .arg(&d_seed) + .arg(&out_f) + .arg(&in_f) + .arg(&mut self.weight) + .arg(&mut self.bias) + .launch(cfg)? + }; + Ok(()) + } + + pub fn forward>( + &self, + input_batch: &I, + stream: Arc, + module: Arc, + cublas: &CudaBlas, + ) -> Result> { + let batch_size = input_batch.len() / self.in_features; + let mut output_batch = stream.alloc_zeros::(batch_size * self.out_features)?; + + let gemm_config = GemmConfig { + transa: cublasOperation_t::CUBLAS_OP_N, // Don't transpose input + transb: cublasOperation_t::CUBLAS_OP_N, // Don't transpose weight + m: self.out_features as i32, // Changed: rows of output + n: batch_size as i32, // Changed: cols of output + k: self.in_features as i32, // Same: inner dimension + alpha: 1.0f32, + lda: self.out_features as i32, // Changed: leading dim of weight + ldb: self.in_features as i32, // Changed: leading dim of input + beta: 0.0f32, + ldc: self.out_features as i32, // Changed: leading dim of output + }; + + unsafe { + cublas.gemm(gemm_config, &self.weight, input_batch, &mut output_batch)?; + } + + let kernel = module.load_function("add_bias_forward")?; + let n = (batch_size * self.out_features) as u32; + let grid_dim = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + let cfg = LaunchConfig { + grid_dim: (grid_dim, 1, 1), + block_dim: (THREADS_PER_BLOCK, 1, 1), + shared_mem_bytes: 0, + }; + let bs = batch_size as i32; + let of = self.out_features as i32; + unsafe { + stream + .launch_builder(&kernel) + .arg(&mut output_batch) + .arg(&self.bias) + .arg(&bs) + .arg(&of) + .launch(cfg)? + }; + Ok(output_batch) + } + + pub fn backward( + &mut self, + input_from_cache: &CudaSlice, + grad_output_batch: &CudaSlice, + should_accumulate_gradients: bool, + stream: Arc, + module: Arc, + cublas: &CudaBlas, + ) -> Result> { + let batch_size = input_from_cache.len() / self.in_features; + + if self.requires_grad { + let wg = self.weight_grad.as_mut().unwrap(); + let bg = self.bias_grad.as_mut().unwrap(); + + // Correctly computes dW = d(Y^T) * X^T for column-major layout. 
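+            // (Reminder, for context: cuBLAS GEMM is column-major, and a contiguous row-major
+            // (rows, cols) buffer is byte-identical to a column-major (cols, rows) matrix. That
+            // is why the transpose flags and leading dimensions in these configs look swapped
+            // relative to the textbook formulas.)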
+ // dW(out,in) = d(Y^T)(out,batch) * X(in,batch)^T + let gemm_config_wg = GemmConfig { + transa: cublasOperation_t::CUBLAS_OP_N, + transb: cublasOperation_t::CUBLAS_OP_T, + m: self.out_features as i32, + n: self.in_features as i32, + k: batch_size as i32, + alpha: 1.0f32, + lda: self.out_features as i32, + ldb: self.in_features as i32, + beta: if should_accumulate_gradients { + 1.0f32 + } else { + 0.0f32 + }, + ldc: self.out_features as i32, + }; + unsafe { + cublas.gemm(gemm_config_wg, grad_output_batch, input_from_cache, wg)?; + } + + let kernel = module.load_function("backward_bias")?; + let threads_per_block = 256u32; + let grid_dim = (self.out_features as u32, 1, 1); + let cfg = LaunchConfig { + grid_dim: grid_dim, + block_dim: (threads_per_block, 1, 1), + shared_mem_bytes: threads_per_block * 4, + }; + let bs = batch_size as i32; + let of = self.out_features as i32; + unsafe { + stream + .launch_builder(&kernel) + .arg(grad_output_batch) + .arg(bg) + .arg(&bs) + .arg(&of) + .launch(cfg)? + }; + } + + let mut grad_input_batch = stream.alloc_zeros::(batch_size * self.in_features)?; + + // Correctly computes dX = W^T * d(Y^T) for column-major layout. + // dX(in,batch) = W(out,in)^T * d(Y^T)(out,batch) + let gemm_config_d_input = GemmConfig { + transa: cublasOperation_t::CUBLAS_OP_T, + transb: cublasOperation_t::CUBLAS_OP_N, + m: self.in_features as i32, + n: batch_size as i32, + k: self.out_features as i32, + alpha: 1.0f32, + lda: self.out_features as i32, + ldb: self.out_features as i32, + beta: 0.0f32, + ldc: self.in_features as i32, + }; + unsafe { + cublas.gemm( + gemm_config_d_input, + &self.weight, + grad_output_batch, + &mut grad_input_batch, + )?; + } + + Ok(grad_input_batch) + } + + pub fn zero_grad(&mut self, stream: Arc, module: Arc) -> Result<()> { + let zero_kernel = module.load_function("zero_out")?; + if let Some(wg) = self.weight_grad.as_mut() { + let n = wg.len() as u32; + let grid_dim = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + let cfg = LaunchConfig { + grid_dim: (grid_dim, 1, 1), + block_dim: (THREADS_PER_BLOCK, 1, 1), + shared_mem_bytes: 0, + }; + let n_i32 = n as i32; + unsafe { + stream + .launch_builder(&zero_kernel) + .arg(wg) + .arg(&n_i32) + .launch(cfg)?; + }; + } + if let Some(bg) = self.bias_grad.as_mut() { + let n = bg.len() as u32; + let grid_dim = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + let cfg = LaunchConfig { + grid_dim: (grid_dim, 1, 1), + block_dim: (THREADS_PER_BLOCK, 1, 1), + shared_mem_bytes: 0, + }; + let n_i32 = n as i32; + unsafe { + stream + .launch_builder(&zero_kernel) + .arg(bg) + .arg(&n_i32) + .launch(cfg)?; + }; + } + Ok(()) + } +} diff --git a/tig-challenges/src/nn/mlp.rs b/tig-challenges/src/nn/mlp.rs new file mode 100644 index 0000000..d36acf1 --- /dev/null +++ b/tig-challenges/src/nn/mlp.rs @@ -0,0 +1,516 @@ +use super::{BatchNorm1d, Linear}; +use anyhow::Result; +use cudarc::{ + cublas::CudaBlas, + cudnn::Cudnn, + driver::{ + CudaModule, CudaSlice, CudaStream, CudaView, DevicePtr, DeviceRepr, LaunchConfig, + PushKernelArg, + }, +}; +use rand::{prelude::*, rngs::StdRng}; +use std::sync::Arc; + +const THREADS_PER_BLOCK: u32 = 1024; + +pub struct MLP { + pub lin: Vec, + pub bns: Vec, + pub layer_cnt: usize, +} + +#[derive(Clone)] +pub struct ForwardCache { + pub input: CudaSlice, + pub linear_output: CudaSlice, + pub activated_output: Option>, + pub bn_input: Option>, +} + +impl MLP { + pub fn new( + layer_sizes: &[usize], + frozen_layers: usize, + stream: Arc, + ) -> Result { + let layer_cnt = layer_sizes.len() - 
1; + let mut lin = Vec::with_capacity(layer_cnt); + let mut bns = Vec::with_capacity(layer_cnt - 1); + + for l in 0..layer_cnt { + let requires_grad = l < layer_cnt.saturating_sub(frozen_layers); + lin.push(Linear::new( + layer_sizes[l], + layer_sizes[l + 1], + requires_grad, + stream.clone(), + )?); + if l < layer_cnt - 1 { + bns.push(BatchNorm1d::new( + layer_sizes[l + 1], + 0.1, + 1e-5, + requires_grad, + stream.clone(), + )?); + } + } + Ok(Self { + lin, + bns, + layer_cnt, + }) + } + + pub fn init_weights( + &mut self, + seed: [u8; 32], + stream: Arc, + module: Arc, + ) -> Result<()> { + let mut rng = StdRng::from_seed(seed); + for layer in &mut self.lin { + layer.init_weights(rng.gen(), stream.clone(), module.clone())?; + } + Ok(()) + } + + pub fn zero_grad(&mut self, stream: Arc, module: Arc) -> Result<()> { + for layer in &mut self.lin { + layer.zero_grad(stream.clone(), module.clone())?; + } + for bn in &mut self.bns { + bn.zero_grad(stream.clone(), module.clone())?; + } + Ok(()) + } + + pub fn forward>( + &mut self, + input: &I, + training: bool, + stream: Arc, + module: Arc, + cublas: &CudaBlas, + cudnn: &Cudnn, + ) -> Result<(CudaSlice, Vec>)> { + let mut x = stream.alloc_zeros::(input.len())?; + stream.memcpy_dtod(input, &mut x)?; + + let mut caches = Vec::with_capacity(self.layer_cnt); + + for l in 0..self.layer_cnt { + let input_cache = x.clone(); + let linear_output = + self.lin[l].forward(&input_cache, stream.clone(), module.clone(), cublas)?; + + if l < self.layer_cnt - 1 { + let mut activated = linear_output.clone(); + let act_fwd_kernel = module.load_function("activation_forward")?; + let n = activated.len() as u32; + let grid_dim = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + let cfg = LaunchConfig { + grid_dim: (grid_dim, 1, 1), + block_dim: (THREADS_PER_BLOCK, 1, 1), + shared_mem_bytes: 0, + }; + let n_i32 = n as i32; + unsafe { + stream + .launch_builder(&act_fwd_kernel) + .arg(&mut activated) + .arg(&n_i32) + .launch(cfg)? + }; + + let bn_input_cache = activated.clone(); + let bn_output = self.bns[l].forward(&activated, training, stream.clone(), cudnn)?; + + caches.push(ForwardCache { + input: input_cache, + linear_output, + activated_output: Some(activated), + bn_input: Some(bn_input_cache), + }); + x = bn_output; + } else { + caches.push(ForwardCache { + input: input_cache, + linear_output: linear_output.clone(), + activated_output: None, + bn_input: None, + }); + x = linear_output; + } + } + Ok((x, caches)) + } + + pub fn backward( + &mut self, + grad: &CudaSlice, + forward_caches: &[ForwardCache], + should_accumulate_gradients: bool, + stream: Arc, + module: Arc, + cublas: &CudaBlas, + cudnn: &Cudnn, + ) -> Result<()> { + let mut current_grad = grad.clone(); + + for i in (0..self.lin.len()).rev() { + let mut grad_to_pass_to_linear = current_grad.clone(); + + // For intermediate layers, backpropagate through BN and Activation in reverse order. + if i < self.bns.len() { + // Step 1: Backpropagate through BatchNorm. + // The input to the BN's forward pass was the *activated* output of the linear layer. + let bn_input = forward_caches[i].activated_output.as_ref().unwrap(); + let grad_after_bn = self.bns[i].backward( + bn_input, + ¤t_grad, + should_accumulate_gradients, + stream.clone(), + cudnn, + module.clone(), + )?; + + // Step 2: Backpropagate through Activation. + // The input to the activation's forward pass was the direct output of the linear layer. 
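+                // ReLU backward: d/dx relu(x) is 1 for x > 0 and 0 otherwise, so the incoming
+                // gradient is passed through only where the cached pre-activation was positive
+                // (this is exactly what the activation_backward kernel does).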
+ let pre_activation_values = &forward_caches[i].linear_output; + let mut grad_after_activation = stream.alloc_zeros::(grad_after_bn.len())?; + + let kernel = module.load_function("activation_backward")?; + let cfg = LaunchConfig { + grid_dim: ((grad_after_bn.len() as u32 + 255) / 256, 1, 1), + block_dim: (256, 1, 1), + shared_mem_bytes: 0, + }; + + unsafe { + stream + .launch_builder(&kernel) + .arg(&grad_after_bn) + .arg(pre_activation_values) + .arg(&(grad_after_bn.len() as i32)) + .arg(&mut grad_after_activation) + .launch(cfg)?; + } + + grad_to_pass_to_linear = grad_after_activation; + } + + // Step 3: Backpropagate through the linear layer. + // The input to the linear layer's forward pass is stored in the cache for this layer. + let input_to_linear = &forward_caches[i].input; + + let grad_after_linear = self.lin[i].backward( + input_to_linear, + &grad_to_pass_to_linear, + should_accumulate_gradients, + stream.clone(), + module.clone(), + cublas, + )?; + current_grad = grad_after_linear; + } + Ok(()) + } + + pub fn loss_and_grad( + &self, + output: &CudaSlice, + target: &CudaView<'_, f32>, + stream: Arc, + module: Arc, + ) -> Result<(CudaSlice, CudaSlice)> { + let mut grad = stream.alloc_zeros::(output.len())?; + let mut loss = stream.alloc_zeros::(1)?; + let loss_kernel = module.load_function("loss_mse")?; + + let total_elements = output.len() as u32; + let threads_per_block = 256u32; + let grid_dim = (total_elements + threads_per_block - 1) / threads_per_block; + + let cfg = LaunchConfig { + grid_dim: (grid_dim, 1, 1), + block_dim: (threads_per_block, 1, 1), + shared_mem_bytes: threads_per_block * 4, + }; + + let batch_size = (output.len() / self.lin.last().unwrap().out_features) as i32; + let out_features = self.lin.last().unwrap().out_features as i32; + unsafe { + stream + .launch_builder(&loss_kernel) + .arg(output) + .arg(target) + .arg(&batch_size) + .arg(&out_features) + .arg(&mut grad) + .arg(&mut loss) + .launch(cfg)?; + } + Ok((loss, grad)) + } + + /// Extract all model parameters into a flat vector of CudaSlices + pub fn extract_parameters(&self, _stream: Arc) -> Result>> { + let mut params = Vec::new(); + + // Linear layer parameters + for layer in &self.lin { + params.push(layer.weight.clone()); + params.push(layer.bias.clone()); + } + + // BatchNorm parameters + for bn in &self.bns { + params.push(bn.weight.clone()); + params.push(bn.bias.clone()); + params.push(bn.running_mean.clone()); + params.push(bn.running_var.clone()); + } + + Ok(params) + } + + /// Extract all model gradients into a flat vector of CudaSlices + pub fn extract_gradients(&self, stream: Arc) -> Result>> { + let mut grads = Vec::new(); + + // Linear layer gradients + for layer in &self.lin { + if layer.requires_grad { + grads.push(layer.weight_grad.as_ref().unwrap().clone()); + grads.push(layer.bias_grad.as_ref().unwrap().clone()); + } else { + // Create zero tensors for non-trainable parameters + grads.push(stream.alloc_zeros::(layer.weight.len())?); + grads.push(stream.alloc_zeros::(layer.bias.len())?); + } + } + + // BatchNorm gradients + for bn in &self.bns { + if bn.requires_grad { + grads.push(bn.weight_grad.as_ref().unwrap().clone()); + grads.push(bn.bias_grad.as_ref().unwrap().clone()); + } else { + grads.push(stream.alloc_zeros::(bn.weight.len())?); + grads.push(stream.alloc_zeros::(bn.bias.len())?); + } + // No gradients for running_mean and running_var + grads.push(stream.alloc_zeros::(bn.running_mean.len())?); + grads.push(stream.alloc_zeros::(bn.running_var.len())?); + } + + Ok(grads) + 
} + + /// Get parameter sizes for optimizer initialization + pub fn get_parameter_sizes(&self) -> Vec { + let mut sizes = Vec::new(); + + for layer in &self.lin { + sizes.push(layer.weight.len()); + sizes.push(layer.bias.len()); + } + + for bn in &self.bns { + sizes.push(bn.weight.len()); + sizes.push(bn.bias.len()); + sizes.push(bn.running_mean.len()); + sizes.push(bn.running_var.len()); + } + + sizes + } + + /// Apply parameter updates from optimizer + pub fn apply_optimizer_updates( + &mut self, + updates: &[CudaSlice], + stream: Arc, + module: Arc, + ) -> Result<()> { + let kernel = module.load_function("apply_parameter_updates_direct")?; + let mut update_idx = 0; + + // Apply to linear layers + for layer in &mut self.lin { + if layer.requires_grad { + // Weight update + let n = layer.weight.len() as u32; + let grid_dim = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + let cfg = LaunchConfig { + grid_dim: (grid_dim, 1, 1), + block_dim: (THREADS_PER_BLOCK, 1, 1), + shared_mem_bytes: 0, + }; + let n_i32 = n as i32; + unsafe { + stream + .launch_builder(&kernel) + .arg(&mut layer.weight) + .arg(&updates[update_idx]) + .arg(&n_i32) + .launch(cfg)?; + } + } + update_idx += 1; + + if layer.requires_grad { + // Bias update + let n = layer.bias.len() as u32; + let grid_dim = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + let cfg = LaunchConfig { + grid_dim: (grid_dim, 1, 1), + block_dim: (THREADS_PER_BLOCK, 1, 1), + shared_mem_bytes: 0, + }; + let n_i32 = n as i32; + unsafe { + stream + .launch_builder(&kernel) + .arg(&mut layer.bias) + .arg(&updates[update_idx]) + .arg(&n_i32) + .launch(cfg)?; + } + } + update_idx += 1; + } + + // Apply to BatchNorm layers + for bn in &mut self.bns { + if bn.requires_grad { + // Weight update + let n = bn.weight.len() as u32; + let grid_dim = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + let cfg = LaunchConfig { + grid_dim: (grid_dim, 1, 1), + block_dim: (THREADS_PER_BLOCK, 1, 1), + shared_mem_bytes: 0, + }; + let n_i32 = n as i32; + unsafe { + stream + .launch_builder(&kernel) + .arg(&mut bn.weight) + .arg(&updates[update_idx]) + .arg(&n_i32) + .launch(cfg)?; + } + } + update_idx += 1; + + if bn.requires_grad { + // Bias update + let n = bn.bias.len() as u32; + let grid_dim = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + let cfg = LaunchConfig { + grid_dim: (grid_dim, 1, 1), + block_dim: (THREADS_PER_BLOCK, 1, 1), + shared_mem_bytes: 0, + }; + let n_i32 = n as i32; + unsafe { + stream + .launch_builder(&kernel) + .arg(&mut bn.bias) + .arg(&updates[update_idx]) + .arg(&n_i32) + .launch(cfg)?; + } + } + update_idx += 1; + + // Skip running_mean and running_var (they're not trainable) + update_idx += 2; + } + + Ok(()) + } + + /// Set model parameters from a vector of CudaSlices + pub fn set_parameters( + &mut self, + params: &[CudaSlice], + stream: Arc, + module: Arc, + ) -> Result<()> { + let copy_kernel = module.load_function("copy_tensor")?; + let mut param_idx = 0; + + for layer in &mut self.lin { + // Copy weights + let n = layer.weight.len() as u32; + let grid_dim = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + let cfg = LaunchConfig { + grid_dim: (grid_dim, 1, 1), + block_dim: (THREADS_PER_BLOCK, 1, 1), + shared_mem_bytes: 0, + }; + let n_i32 = n as i32; + unsafe { + stream + .launch_builder(©_kernel) + .arg(&mut layer.weight) + .arg(¶ms[param_idx]) + .arg(&n_i32) + .launch(cfg)?; + } + param_idx += 1; + + // Copy biases + let n = layer.bias.len() as u32; + let grid_dim = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + let cfg = 
LaunchConfig { + grid_dim: (grid_dim, 1, 1), + block_dim: (THREADS_PER_BLOCK, 1, 1), + shared_mem_bytes: 0, + }; + let n_i32 = n as i32; + unsafe { + stream + .launch_builder(©_kernel) + .arg(&mut layer.bias) + .arg(¶ms[param_idx]) + .arg(&n_i32) + .launch(cfg)?; + } + param_idx += 1; + } + + for bn in &mut self.bns { + // Copy BN parameters + for target in [ + &mut bn.weight, + &mut bn.bias, + &mut bn.running_mean, + &mut bn.running_var, + ] { + let n = target.len() as u32; + let grid_dim = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + let cfg = LaunchConfig { + grid_dim: (grid_dim, 1, 1), + block_dim: (THREADS_PER_BLOCK, 1, 1), + shared_mem_bytes: 0, + }; + let n_i32 = n as i32; + unsafe { + stream + .launch_builder(©_kernel) + .arg(target) + .arg(¶ms[param_idx]) + .arg(&n_i32) + .launch(cfg)?; + } + param_idx += 1; + } + } + + Ok(()) + } +} diff --git a/tig-challenges/src/nn/mod.rs b/tig-challenges/src/nn/mod.rs new file mode 100644 index 0000000..abdd018 --- /dev/null +++ b/tig-challenges/src/nn/mod.rs @@ -0,0 +1,8 @@ +mod batch_norm; +pub use batch_norm::*; +mod linear; +pub use linear::*; +mod mlp; +pub use mlp::*; +mod tensor; +pub use tensor::*; diff --git a/tig-challenges/src/nn/tensor.rs b/tig-challenges/src/nn/tensor.rs new file mode 100644 index 0000000..cb8f32c --- /dev/null +++ b/tig-challenges/src/nn/tensor.rs @@ -0,0 +1,49 @@ +use anyhow::Result; +use cudarc::cudnn::{self, result::CudnnError}; +use std::ops::Deref; + +pub struct CudnnTensorDescriptor(cudnn::sys::cudnnTensorDescriptor_t); + +impl CudnnTensorDescriptor { + pub fn new() -> Result { + let mut desc = std::ptr::null_mut(); + unsafe { + match cudnn::sys::cudnnCreateTensorDescriptor(&mut desc) { + cudnn::sys::cudnnStatus_t::CUDNN_STATUS_SUCCESS => Ok(Self(desc)), + e => Err(CudnnError(e)), + } + } + } + + pub fn set_4d( + &mut self, + format: cudnn::sys::cudnnTensorFormat_t, + data_type: cudnn::sys::cudnnDataType_t, + n: i32, + c: i32, + h: i32, + w: i32, + ) -> Result<(), CudnnError> { + unsafe { + match cudnn::sys::cudnnSetTensor4dDescriptor(self.0, format, data_type, n, c, h, w) { + cudnn::sys::cudnnStatus_t::CUDNN_STATUS_SUCCESS => Ok(()), + e => Err(CudnnError(e)), + } + } + } +} + +impl Deref for CudnnTensorDescriptor { + type Target = cudnn::sys::cudnnTensorDescriptor_t; + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl Drop for CudnnTensorDescriptor { + fn drop(&mut self) { + unsafe { + cudnn::sys::cudnnDestroyTensorDescriptor(self.0); + } + } +} diff --git a/tig-challenges/src/nn_training.cu b/tig-challenges/src/nn_training.cu new file mode 100644 index 0000000..f02bddc --- /dev/null +++ b/tig-challenges/src/nn_training.cu @@ -0,0 +1,115 @@ +#include +#include +#include +#include +#include + +#ifndef M_PI +#define M_PI 3.14159265358979323846f +#endif + +__device__ float box_muller(curandState* state, float& z1) { + float u1, u2; + // Prevent u1 from being 0 to avoid log(0) -> -inf + do { + u1 = curand_uniform(state); + } while (u1 == 0.0f); + u2 = curand_uniform(state); + + float mag = sqrtf(-2.0f * logf(u1)); + float z0 = mag * cosf(2.0f * M_PI * u2); + z1 = mag * sinf(2.0f * M_PI * u2); + return z0; +} + +extern "C" __global__ void generate_rff_params( + unsigned char* seed, + int output_dims, + int input_dims, + int k_rff, + float lengthscale, + float* a_params, // (output_dims, k_rff) + float* b_params, // (output_dims, k_rff) + float* w_params // (output_dims, k_rff, input_dims) +) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= output_dims * k_rff) return; + + 
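+    // Each thread owns one (output_dim, k) random Fourier feature: it draws an amplitude
+    // a ~ N(0, 1) via box_muller, a phase b ~ U(0, 2*pi), and one frequency entry per input
+    // dimension scaled by lengthscale_inv_sq. generate_dataset then evaluates the target as
+    // f(x) = scaling_factor * sum_k a_k * cos(dot(w_k, x) + b_k), plus Gaussian noise for the
+    // noisy targets.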
curandState state; + curand_init( + *((unsigned long long*)seed) + idx, + 0, 0, &state + ); + + int out_dim = idx / k_rff; + int k_idx = idx % k_rff; + + // Box-muller generates two samples, cache one + float z1; + a_params[idx] = box_muller(&state, z1); + // Note: this is not perfectly efficient, could be improved by having half threads write z1 + + b_params[idx] = curand_uniform(&state) * 2.0f * M_PI; + + float lengthscale_inv_sq = 1.0f / (lengthscale * lengthscale); + + for(int in_dim = 0; in_dim < input_dims; ++in_dim) { + int w_idx = out_dim * k_rff * input_dims + k_idx * input_dims + in_dim; + float z_w1; + w_params[w_idx] = lengthscale_inv_sq * box_muller(&state, z_w1); + } +} + + +extern "C" __global__ void generate_dataset( + unsigned char* seed, + int num_samples, + int input_dims, + int output_dims, + int k_rff, + float scaling_factor, + float noise_std, + const float* a_params, + const float* b_params, + const float* w_params, + float* out_inputs, // (num_samples, input_dims) + float* out_targets_noisy, // (num_samples, output_dims) + float* out_targets_true // (num_samples, output_dims) +) { + int sample_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (sample_idx >= num_samples) return; + + curandState state; + curand_init( + *((unsigned long long*)seed) + sample_idx + num_samples, // Offset seed + 0, 0, &state + ); + + // Generate input sample + for (int i = 0; i < input_dims; ++i) { + out_inputs[sample_idx * input_dims + i] = curand_uniform(&state) * 2.0f - 1.0f; + } + + // Generate targets + for (int out_dim = 0; out_dim < output_dims; ++out_dim) { + float f_val = 0.0f; + for (int k_idx = 0; k_idx < k_rff; ++k_idx) { + float wx_sum = 0.0f; + for (int in_dim = 0; in_dim < input_dims; ++in_dim) { + float w = w_params[out_dim * k_rff * input_dims + k_idx * input_dims + in_dim]; + float x = out_inputs[sample_idx * input_dims + in_dim]; + wx_sum += w * x; + } + float b = b_params[out_dim * k_rff + k_idx]; + float a = a_params[out_dim * k_rff + k_idx]; + f_val += a * cosf(wx_sum + b); + } + f_val *= scaling_factor; + + float z_noise1; + float noise = noise_std * box_muller(&state, z_noise1); + + out_targets_true[sample_idx * output_dims + out_dim] = f_val; + out_targets_noisy[sample_idx * output_dims + out_dim] = f_val + noise; + } +} \ No newline at end of file diff --git a/tig-challenges/src/nn_training.rs b/tig-challenges/src/nn_training.rs new file mode 100644 index 0000000..1fb7703 --- /dev/null +++ b/tig-challenges/src/nn_training.rs @@ -0,0 +1,648 @@ +use anyhow::{anyhow, Result}; +use cudarc::{ + cublas::CudaBlas, + cudnn::Cudnn, + driver::{CudaModule, CudaSlice, CudaStream, CudaView, LaunchConfig, PushKernelArg}, + runtime::sys::cudaDeviceProp, +}; +use rand::{prelude::*, rngs::StdRng}; +use serde::{Deserialize, Serialize}; +use serde_json::{from_value, Map, Value}; +use std::{any::Any, sync::Arc}; + +use crate::nn::MLP; + +const THREADS_PER_BLOCK: u32 = 1024; + +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct Difficulty { + pub num_hidden_layers: usize, + pub accuracy_factor: u32, +} + +impl From> for Difficulty { + fn from(arr: Vec) -> Self { + Self { + num_hidden_layers: arr[0] as usize, + accuracy_factor: arr[1] as u32, + } + } +} +impl Into> for Difficulty { + fn into(self) -> Vec { + vec![self.num_hidden_layers as i32, self.accuracy_factor as i32] + } +} + +#[derive(Serialize, Deserialize, Debug, Clone)] +pub struct Solution { + pub weights: Vec>>, + pub biases: Vec>, + pub epochs_used: usize, + pub bn_weights: Vec>, + pub bn_biases: Vec>, + pub 
bn_running_means: Vec>, + pub bn_running_vars: Vec>, +} + +impl TryFrom> for Solution { + type Error = serde_json::Error; + fn try_from(v: Map) -> Result { + from_value(Value::Object(v)) + } +} + +pub struct Dataset { + pub inputs: CudaSlice, + pub targets_noisy: CudaSlice, + pub targets_true_f: CudaSlice, + + pub train_size: usize, + pub validation_size: usize, + pub test_size: usize, + + pub input_dims: usize, + pub output_dims: usize, +} + +impl Dataset { + pub fn train_inputs(&self) -> CudaView { + self.inputs.slice(0..self.train_size * self.input_dims) + } + pub fn train_targets_noisy(&self) -> CudaView { + self.targets_noisy + .slice(0..self.train_size * self.output_dims) + } + pub fn train_targets_true_f(&self) -> CudaView { + self.targets_true_f + .slice(0..self.train_size * self.output_dims) + } + pub fn validation_inputs(&self) -> CudaView { + self.inputs.slice( + self.train_size * self.input_dims + ..(self.train_size + self.validation_size) * self.input_dims, + ) + } + pub fn validation_targets_noisy(&self) -> CudaView { + self.targets_noisy.slice( + self.train_size * self.output_dims + ..(self.train_size + self.validation_size) * self.output_dims, + ) + } + pub fn validation_targets_true_f(&self) -> CudaView { + self.targets_true_f.slice( + self.train_size * self.output_dims + ..(self.train_size + self.validation_size) * self.output_dims, + ) + } + pub fn test_inputs(&self) -> CudaView { + self.inputs.slice( + (self.train_size + self.validation_size) * self.input_dims + ..(self.train_size + self.validation_size + self.test_size) * self.input_dims, + ) + } + pub fn test_targets_noisy(&self) -> CudaView { + self.targets_noisy.slice( + (self.train_size + self.validation_size) * self.output_dims + ..(self.train_size + self.validation_size + self.test_size) * self.output_dims, + ) + } + pub fn test_targets_true_f(&self) -> CudaView { + self.targets_true_f.slice( + (self.train_size + self.validation_size) * self.output_dims + ..(self.train_size + self.validation_size + self.test_size) * self.output_dims, + ) + } +} + +pub struct Challenge { + pub seed: [u8; 32], + pub difficulty: Difficulty, + pub hidden_layers_dims: usize, + pub batch_size: usize, + pub max_epochs: usize, + pub patience: usize, + pub min_loss_delta: f32, + pub num_frozen_layers: usize, + pub dataset: Dataset, +} + +impl Challenge { + pub fn generate_instance( + seed: &[u8; 32], + difficulty: &Difficulty, + module: Arc, + stream: Arc, + _prop: &cudaDeviceProp, + ) -> Result { + const K_RFF: usize = 128; + const RFF_AMPLITUDE_PER_FUNC: f32 = 1.0; + const RFF_LENGTHSCALE_PER_INPUT_DIM: f32 = 0.3; + const NOISE_STD: f32 = 0.2; + const INPUT_DIMS: usize = 1; + const OUTPUT_DIMS: usize = 2; + const TRAIN_SIZE: usize = 1000; + const VALIDATION_SIZE: usize = 200; + const TEST_SIZE: usize = 250; + let scaling_factor = RFF_AMPLITUDE_PER_FUNC * (2.0 / K_RFF as f32).sqrt(); + + let d_seed = stream.memcpy_stod(seed)?; + + // Allocate memory for RFF params + let mut a_params = stream.alloc_zeros::(OUTPUT_DIMS * K_RFF)?; + let mut b_params = stream.alloc_zeros::(OUTPUT_DIMS * K_RFF)?; + let mut w_params = stream.alloc_zeros::(OUTPUT_DIMS * K_RFF * INPUT_DIMS)?; + + // Generate RFF params + let generate_rff_params_kernel = module.load_function("generate_rff_params")?; + let n = (OUTPUT_DIMS * K_RFF) as u32; + let grid_dim = (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + let cfg = LaunchConfig { + grid_dim: (grid_dim, 1, 1), + block_dim: (THREADS_PER_BLOCK, 1, 1), + shared_mem_bytes: 0, + }; + unsafe { + stream + 
.launch_builder(&generate_rff_params_kernel) + .arg(&d_seed) + .arg(&(OUTPUT_DIMS as i32)) + .arg(&(INPUT_DIMS as i32)) + .arg(&(K_RFF as i32)) + .arg(&RFF_LENGTHSCALE_PER_INPUT_DIM) + .arg(&mut a_params) + .arg(&mut b_params) + .arg(&mut w_params) + .launch(cfg)?; + } + + // Generate datasets + let generate_dataset_kernel = module.load_function("generate_dataset")?; + + // Training data + let total_samples = TRAIN_SIZE + VALIDATION_SIZE + TEST_SIZE; + let mut inputs = stream.alloc_zeros::(total_samples * INPUT_DIMS)?; + let mut targets_noisy = stream.alloc_zeros::(total_samples * OUTPUT_DIMS)?; + let mut targets_true_f = stream.alloc_zeros::(total_samples * OUTPUT_DIMS)?; + + let grid_dim = (total_samples as u32 + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK; + let cfg_train = LaunchConfig { + grid_dim: (grid_dim, 1, 1), + block_dim: (THREADS_PER_BLOCK, 1, 1), + shared_mem_bytes: 0, + }; + unsafe { + stream + .launch_builder(&generate_dataset_kernel) + .arg(&d_seed) + .arg(&(total_samples as i32)) + .arg(&(INPUT_DIMS as i32)) + .arg(&(OUTPUT_DIMS as i32)) + .arg(&K_RFF) + .arg(&scaling_factor) + .arg(&NOISE_STD) + .arg(&a_params) + .arg(&b_params) + .arg(&w_params) + .arg(&mut inputs) + .arg(&mut targets_noisy) + .arg(&mut targets_true_f) + .launch(cfg_train)?; + } + + stream.synchronize()?; + + Ok(Self { + seed: *seed, + difficulty: difficulty.clone(), + hidden_layers_dims: 256, + batch_size: 128, + max_epochs: 1000, + patience: 50, + min_loss_delta: 1e-7, + num_frozen_layers: 2, + dataset: Dataset { + inputs, + targets_noisy, + targets_true_f, + train_size: TRAIN_SIZE, + validation_size: VALIDATION_SIZE, + test_size: TEST_SIZE, + input_dims: INPUT_DIMS, + output_dims: OUTPUT_DIMS, + }, + }) + } + + pub fn verify_solution( + &self, + solution: &Solution, + module: Arc, + stream: Arc, + _prop: &cudaDeviceProp, + ) -> Result<()> { + let cublas = CudaBlas::new(stream.clone())?; + let cudnn = Cudnn::new(stream.clone())?; + + let mut model = MLP::new(&self.layer_dims(), self.num_frozen_layers, stream.clone())?; + + load_solution(&mut model, solution, stream.clone())?; + + let (output, _) = model.forward( + &self.dataset.test_inputs(), + false, + stream.clone(), + module.clone(), + &cublas, + &cudnn, + )?; + let (loss, _) = model.loss_and_grad( + &output, + &&self.dataset.test_targets_noisy(), + stream.clone(), + module.clone(), + )?; + + let avg_model_loss_on_test = stream.memcpy_dtov(&loss)?[0]; + + // Calculate baseline error epsilon_star_squared + let alpha = 4.0 - self.difficulty.accuracy_factor as f32 / 1000.0; + + let y_h = stream.memcpy_dtov(&self.dataset.test_targets_noisy())?; + let f_h = stream.memcpy_dtov(&self.dataset.test_targets_true_f())?; + stream.synchronize()?; + + let sum_sq_diff_true_vs_noisy: f32 = y_h + .iter() + .zip(f_h.iter()) + .map(|(y, f)| (*y - *f).powi(2)) + .sum(); + + let epsilon_star_squared = + (alpha / self.dataset.test_size as f32) * sum_sq_diff_true_vs_noisy; + + if avg_model_loss_on_test <= epsilon_star_squared { + Ok(()) + } else { + Err(anyhow!( + "Model test loss ({:.4e}) exceeds target baseline epsilon_star_squared ({:.4e})", + avg_model_loss_on_test, + epsilon_star_squared + )) + } + } + + pub fn layer_dims(&self) -> Vec { + let mut layer_dims = vec![self.hidden_layers_dims; self.difficulty.num_hidden_layers]; + layer_dims.insert(0, self.dataset.input_dims); + layer_dims.push(self.dataset.output_dims); + layer_dims + } +} + +pub trait CudaOptimizerState: Any + Send + Sync { + fn as_any(&self) -> &dyn Any; + fn as_any_mut(&mut self) -> &mut dyn Any; + 
fn box_clone(&self) -> Box; +} + +impl Clone for Box { + fn clone(&self) -> Self { + self.box_clone() + } +} + +/// Function type for initializing optimizer state +pub type CudaOptimizerInitStateFn = fn( + seed: &[u8; 32], + param_sizes: &[usize], // Sizes of all parameter tensors + stream: Arc, +) -> Result>; + +/// Function type for querying optimizer at specific parameters (like parameter prediction) +pub type CudaOptimizerQueryAtParamsFn = fn( + optimizer_state: &dyn CudaOptimizerState, + model_params: &[CudaSlice], + gradients: Option<&[CudaSlice]>, + epoch: usize, + train_loss: Option, + val_loss: Option, + stream: Arc, + module: Arc, +) -> Result>>>; + +/// Function type for optimizer step (computes parameter updates) +pub type CudaOptimizerStepFn = fn( + optimizer_state: &mut dyn CudaOptimizerState, + gradients: &[CudaSlice], + epoch: usize, + train_loss: Option, + val_loss: Option, + stream: Arc, + module: Arc, +) -> Result>>; + +pub fn training_loop( + challenge: &Challenge, + module: Arc, + stream: Arc, + prop: &cudaDeviceProp, + optimizer_init_state: CudaOptimizerInitStateFn, + optimizer_query_at_params: CudaOptimizerQueryAtParamsFn, + optimizer_step: CudaOptimizerStepFn, +) -> Result<(Solution, Vec, Vec)> { + let Challenge { + batch_size, + max_epochs, + min_loss_delta, + patience, + dataset: + Dataset { + train_size, + validation_size, + input_dims, + output_dims, + .. + }, + .. + } = *challenge; + let cublas = CudaBlas::new(stream.clone())?; + let cudnn = Cudnn::new(stream.clone())?; + + let mut model = MLP::new( + &challenge.layer_dims(), + challenge.num_frozen_layers, + stream.clone(), + )?; + model.init_weights(&challenge.seed, stream.clone(), module.clone())?; + + // Initialize optimizer + let param_sizes = model.get_parameter_sizes(); + let mut optimizer_state = optimizer_init_state( + challenge.seed, + ¶m_sizes, + stream.clone(), + module.clone(), + prop, + )?; + + let mut lowest_loss = f32::INFINITY; + let mut _best_epoch = 0; + let mut epochs_no_improvement = 0; + let mut best_model_solution: Option = None; + let mut prev_train_loss = None; + let mut prev_validation_loss = None; + let mut train_losses = Vec::with_capacity(max_epochs); + let mut validation_losses = Vec::with_capacity(max_epochs); + + let num_train_batches = (train_size + batch_size - 1) / batch_size; + let num_val_batches = (validation_size + batch_size - 1) / batch_size; + + // Initialize RNG for shuffling + let mut rng = StdRng::from_seed(challenge.seed); + + // Copy training data to host for shuffled batch creation + let train_inputs = stream.memcpy_dtov(&challenge.dataset.train_inputs())?; + let train_targets = stream.memcpy_dtov(&challenge.dataset.train_targets_noisy())?; + let validation_inputs_d = challenge.dataset.validation_inputs(); + let validation_targets_d = challenge.dataset.validation_targets_noisy(); + stream.synchronize()?; + + for epoch in 0..max_epochs { + // --- Shuffle training data indices each epoch --- + let mut train_indices: Vec = (0..train_size).collect(); + train_indices.shuffle(&mut rng); + + // --- Training Phase --- + let mut epoch_train_loss_sum = 0.0; + for i in 0..num_train_batches { + let batch_start_idx = i * batch_size; + let current_batch_size = (train_size - batch_start_idx).min(batch_size); + if current_batch_size == 0 { + continue; + } + + model.zero_grad(stream.clone(), module.clone())?; + + // Create shuffled batch data + let mut input_batch_data = vec![0.0f32; current_batch_size * input_dims]; + let mut target_batch_data = vec![0.0f32; current_batch_size * 
output_dims]; + + // Gather shuffled batch data + for batch_offset in 0..current_batch_size { + let shuffled_sample_idx = train_indices[batch_start_idx + batch_offset]; + + // Copy input data for this sample + let input_start = shuffled_sample_idx * input_dims; + let batch_input_start = batch_offset * input_dims; + for d in 0..input_dims { + input_batch_data[batch_input_start + d] = train_inputs[input_start + d]; + } + + // Copy target data for this sample + let target_start = shuffled_sample_idx * output_dims; + let batch_target_start = batch_offset * output_dims; + for d in 0..output_dims { + target_batch_data[batch_target_start + d] = train_targets[target_start + d]; + } + } + + // Upload shuffled batch to GPU + let mut d_input_batch = stream.alloc_zeros::(current_batch_size * input_dims)?; + let mut d_target_batch = stream.alloc_zeros::(current_batch_size * output_dims)?; + stream.memcpy_htod(&input_batch_data, &mut d_input_batch)?; + stream.memcpy_htod(&target_batch_data, &mut d_target_batch)?; + + // Query optimizer for parameter modifications before forward pass + let model_params = model.extract_parameters(stream.clone())?; + let original_params = if let Some(modified_params) = optimizer_query_at_params( + optimizer_state.as_ref(), + &model_params, + None, + epoch, + prev_train_loss, + prev_validation_loss, + stream.clone(), + module.clone(), + prop, + )? { + let backup = model_params.clone(); + model.set_parameters(&modified_params, stream.clone(), module.clone())?; + Some(backup) + } else { + None + }; + + let (output, caches) = model.forward( + &d_input_batch, + true, + stream.clone(), + module.clone(), + &cublas, + &cudnn, + )?; + let (loss, grad) = model.loss_and_grad( + &output, + &d_target_batch.as_view(), + stream.clone(), + module.clone(), + )?; + model.backward( + &grad, + &caches, + false, + stream.clone(), + module.clone(), + &cublas, + &cudnn, + )?; + + // Restore original parameters if they were modified + if let Some(params_to_restore) = original_params { + model.set_parameters(¶ms_to_restore, stream.clone(), module.clone())?; + } + + // Get gradients and apply optimizer step + let gradients = model.extract_gradients(stream.clone())?; + let param_updates = optimizer_step( + optimizer_state.as_mut(), + &gradients, + epoch, + prev_train_loss, + prev_validation_loss, + stream.clone(), + module.clone(), + prop, + )?; + + model.apply_optimizer_updates(¶m_updates, stream.clone(), module.clone())?; + + let batch_loss = stream.memcpy_dtov(&loss)?[0]; + epoch_train_loss_sum += batch_loss * current_batch_size as f32; + } + stream.synchronize()?; + + let avg_train_loss = epoch_train_loss_sum / train_size as f32; + prev_train_loss = Some(avg_train_loss); + train_losses.push(avg_train_loss); + + // --- Validation Phase --- + let mut epoch_val_loss_sum = 0.0; + if validation_size > 0 { + for i in 0..num_val_batches { + let batch_start = i * batch_size; + let current_batch_size = (validation_size - batch_start).min(batch_size); + if current_batch_size == 0 { + continue; + } + + let d_input_batch = validation_inputs_d.slice( + batch_start * input_dims..(batch_start + current_batch_size) * input_dims, + ); + let d_target_batch = validation_targets_d.slice( + batch_start * output_dims..(batch_start + current_batch_size) * output_dims, + ); + + let (output, _) = model.forward( + &d_input_batch, + false, + stream.clone(), + module.clone(), + &cublas, + &cudnn, + )?; + let (loss, _) = model.loss_and_grad( + &output, + &d_target_batch, + stream.clone(), + module.clone(), + )?; + + let mut 
batch_loss_h = vec![0.0; 1]; + stream.memcpy_dtoh(&loss, &mut batch_loss_h)?; + epoch_val_loss_sum += batch_loss_h[0] * current_batch_size as f32; + } + } + stream.synchronize()?; + + let avg_val_loss = if validation_size > 0 { + epoch_val_loss_sum / validation_size as f32 + } else { + avg_train_loss + }; + prev_validation_loss = Some(avg_val_loss); + validation_losses.push(avg_val_loss); + + // --- Early Stopping --- + if avg_val_loss < lowest_loss - min_loss_delta { + lowest_loss = avg_val_loss; + _best_epoch = epoch; + best_model_solution = Some(to_solution(&model, epoch + 1, stream.clone())?); + epochs_no_improvement = 0; + } else { + epochs_no_improvement += 1; + if epochs_no_improvement >= patience { + break; + } + } + } + + stream.synchronize()?; + + let solution = best_model_solution.ok_or_else(|| anyhow!("No valid solution found during training. Validation loss may have been NaN or never improved."))?; + Ok((solution, train_losses, validation_losses)) +} + +pub fn load_solution(mlp: &mut MLP, solution: &Solution, stream: Arc) -> Result<()> { + for (i, layer) in mlp.lin.iter_mut().enumerate() { + let w_flat: Vec = solution.weights[i].iter().flatten().cloned().collect(); + stream.memcpy_htod(&w_flat, &mut layer.weight)?; + + stream.memcpy_htod(&solution.biases[i], &mut layer.bias)?; + } + for (i, bn) in mlp.bns.iter_mut().enumerate() { + stream.memcpy_htod(&solution.bn_weights[i], &mut bn.weight)?; + stream.memcpy_htod(&solution.bn_biases[i], &mut bn.bias)?; + stream.memcpy_htod(&solution.bn_running_means[i], &mut bn.running_mean)?; + stream.memcpy_htod(&solution.bn_running_vars[i], &mut bn.running_var)?; + } + stream.synchronize()?; + Ok(()) +} + +pub fn to_solution(mlp: &MLP, epochs_used: usize, stream: Arc) -> Result { + stream.synchronize()?; + let mut weights = Vec::new(); + let mut biases = Vec::new(); + for layer in &mlp.lin { + let mut w_h = vec![0.0; layer.weight.len()]; + stream.memcpy_dtoh(&layer.weight, &mut w_h)?; + let mut b_h = vec![0.0; layer.bias.len()]; + stream.memcpy_dtoh(&layer.bias, &mut b_h)?; + + weights.push(w_h.chunks(layer.in_features).map(|c| c.to_vec()).collect()); + biases.push(b_h); + } + + let mut bn_weights = Vec::new(); + let mut bn_biases = Vec::new(); + let mut bn_running_means = Vec::new(); + let mut bn_running_vars = Vec::new(); + + for bn in &mlp.bns { + bn_weights.push(stream.memcpy_dtov(&bn.weight)?); + bn_biases.push(stream.memcpy_dtov(&bn.bias)?); + bn_running_means.push(stream.memcpy_dtov(&bn.running_mean)?); + bn_running_vars.push(stream.memcpy_dtov(&bn.running_var)?); + } + + Ok(Solution { + weights, + biases, + epochs_used, + bn_weights, + bn_biases, + bn_running_means, + bn_running_vars, + }) +} diff --git a/tig-runtime/Cargo.toml b/tig-runtime/Cargo.toml index 3802846..341d9e3 100644 --- a/tig-runtime/Cargo.toml +++ b/tig-runtime/Cargo.toml @@ -32,3 +32,5 @@ c004 = ["cuda", "tig-challenges/c004"] vector_search = ["c004"] c005 = ["cuda", "tig-challenges/c005"] hypergraph = ["c005"] +c006 = ["cuda", "tig-challenges/c006"] +nn_training = ["c006"] diff --git a/tig-runtime/src/main.rs b/tig-runtime/src/main.rs index 22ea326..533cd2e 100644 --- a/tig-runtime/src/main.rs +++ b/tig-runtime/src/main.rs @@ -296,6 +296,12 @@ pub fn compute_solution( #[cfg(feature = "c005")] dispatch_challenge!(c005, gpu) } + "c006" => { + #[cfg(not(feature = "c006"))] + panic!("tig-runtime was not compiled with '--features c006'"); + #[cfg(feature = "c006")] + dispatch_challenge!(c006, gpu) + } _ => panic!("Unsupported challenge"), } }; diff --git 
a/tig-verifier/Cargo.toml b/tig-verifier/Cargo.toml index c2f19c3..684ed13 100644 --- a/tig-verifier/Cargo.toml +++ b/tig-verifier/Cargo.toml @@ -30,3 +30,5 @@ c004 = ["cuda", "tig-challenges/c004"] vector_search = ["c004"] c005 = ["cuda", "tig-challenges/c005"] hypergraph = ["c005"] +c006 = ["cuda", "tig-challenges/c006"] +nn_training = ["c006"] diff --git a/tig-verifier/src/main.rs b/tig-verifier/src/main.rs index 3ed9154..5bf2867 100644 --- a/tig-verifier/src/main.rs +++ b/tig-verifier/src/main.rs @@ -187,6 +187,12 @@ pub fn verify_solution( #[cfg(feature = "c005")] dispatch_challenge!(c005, gpu) } + "c006" => { + #[cfg(not(feature = "c006"))] + panic!("tig-verifier was not compiled with '--features c006'"); + #[cfg(feature = "c006")] + dispatch_challenge!(c006, gpu) + } _ => panic!("Unsupported challenge"), }
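For reference, a few small host-side Rust sketches of the logic this patch adds; helper names and the std-only RNG below are illustrative stand-ins, not code from the patch. First, the ceil-division used before every 1D kernel launch (the parameter-copy launches in set_parameters, generate_rff_params, generate_dataset): enough blocks of THREADS_PER_BLOCK threads to cover n elements.

const THREADS_PER_BLOCK: u32 = 1024;

/// grid_dim for a 1D launch covering `n` elements, as computed before each LaunchConfig.
fn grid_dim_1d(n: u32) -> u32 {
    (n + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK
}

fn main() {
    assert_eq!(grid_dim_1d(256), 1);  // one partially filled block
    assert_eq!(grid_dim_1d(1024), 1);
    assert_eq!(grid_dim_1d(1450), 2); // total_samples = 1000 + 200 + 250
}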
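Next, what generate_rff_params and generate_dataset compute per output dimension: Box-Muller normal draws for the amplitudes a_k and frequencies w_k (scaled by 1/lengthscale^2), a uniform phase b_k in [0, 2*pi), and the target f(x) = s * sum_k a_k * cos(w_k . x + b_k) with s = amplitude * sqrt(2 / K_RFF). The tiny LCG below stands in for curand and will not reproduce the kernels' exact random streams.

use std::f32::consts::PI;

/// Tiny deterministic uniform generator in (0, 1], standing in for curand_uniform.
fn next_uniform(state: &mut u64) -> f32 {
    // LCG step (Knuth's constants); top 24 bits give a uniform in (0, 1].
    *state = state.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407);
    ((*state >> 40) as f32 + 1.0) / (1u64 << 24) as f32
}

/// Box-Muller transform: two uniforms -> one standard normal draw,
/// mirroring the box_muller device helper in nn_training.cu.
fn box_muller(state: &mut u64) -> f32 {
    let u1 = next_uniform(state); // already > 0, so ln is finite
    let u2 = next_uniform(state);
    (-2.0 * u1.ln()).sqrt() * (2.0 * PI * u2).cos()
}

/// One output dimension of the RFF target: f(x) = s * sum_k a_k * cos(w_k . x + b_k),
/// as evaluated in generate_dataset's inner loop.
fn rff_target(a: &[f32], b: &[f32], w: &[Vec<f32>], x: &[f32], scaling: f32) -> f32 {
    let mut f = 0.0;
    for k in 0..a.len() {
        let wx: f32 = w[k].iter().zip(x).map(|(wi, xi)| wi * xi).sum();
        f += a[k] * (wx + b[k]).cos();
    }
    scaling * f
}

fn main() {
    // Constants mirror generate_instance: K_RFF = 128, INPUT_DIMS = 1, lengthscale = 0.3.
    let (k_rff, input_dims) = (128usize, 1usize);
    let lengthscale: f32 = 0.3;
    let scaling = (2.0 / k_rff as f32).sqrt(); // RFF_AMPLITUDE_PER_FUNC = 1.0
    let mut state = 42u64;
    let a: Vec<f32> = (0..k_rff).map(|_| box_muller(&mut state)).collect();
    let b: Vec<f32> = (0..k_rff).map(|_| next_uniform(&mut state) * 2.0 * PI).collect();
    let w: Vec<Vec<f32>> = (0..k_rff)
        .map(|_| {
            (0..input_dims)
                .map(|_| box_muller(&mut state) / (lengthscale * lengthscale))
                .collect()
        })
        .collect();
    println!("f(0.25) = {}", rff_target(&a, &b, &w, &[0.25], scaling));
}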
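The acceptance rule in verify_solution, restated on host slices: alpha = 4.0 - accuracy_factor / 1000, and the model's mean squared test loss must not exceed epsilon_star_squared = (alpha / test_size) * sum((y - f)^2) over the noisy test targets y and the true function values f.

/// Acceptance test used by verify_solution, on host vectors of flattened targets.
fn passes(test_loss: f32, accuracy_factor: u32, test_size: usize, y_noisy: &[f32], f_true: &[f32]) -> bool {
    let alpha = 4.0 - accuracy_factor as f32 / 1000.0;
    let sum_sq: f32 = y_noisy.iter().zip(f_true).map(|(y, f)| (y - f).powi(2)).sum();
    let epsilon_star_squared = (alpha / test_size as f32) * sum_sq;
    test_loss <= epsilon_star_squared
}

fn main() {
    // Two test samples, OUTPUT_DIMS = 2, so four flattened target values each.
    let y_noisy = [0.1, -0.2, 0.9, 1.1];
    let f_true = [0.0, 0.0, 1.0, 1.0];
    // sum_sq = 0.01 + 0.04 + 0.01 + 0.01 = 0.07; alpha = 3.0 at accuracy_factor = 1000,
    // so epsilon_star_squared = 3.0 * 0.07 / 2 = 0.105.
    assert!(passes(0.10, 1000, 2, &y_noisy, &f_true));
    assert!(!passes(0.11, 1000, 2, &y_noisy, &f_true));
}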
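A hypothetical, minimal implementation of the CudaOptimizerState trait that training_loop's optimizer callbacks are built around. The trait is re-declared locally so the sketch compiles on its own, and SgdState / lr are made-up names.

use std::any::Any;

// Local re-declaration of the trait from nn_training.rs, for a self-contained sketch.
trait CudaOptimizerState: Any + Send + Sync {
    fn as_any(&self) -> &dyn Any;
    fn as_any_mut(&mut self) -> &mut dyn Any;
    fn box_clone(&self) -> Box<dyn CudaOptimizerState>;
}

/// Hypothetical state for a plain SGD optimizer: nothing but a learning rate.
#[derive(Clone)]
struct SgdState {
    lr: f32,
}

impl CudaOptimizerState for SgdState {
    fn as_any(&self) -> &dyn Any {
        self
    }
    fn as_any_mut(&mut self) -> &mut dyn Any {
        self
    }
    fn box_clone(&self) -> Box<dyn CudaOptimizerState> {
        Box::new(self.clone())
    }
}

fn main() {
    let state: Box<dyn CudaOptimizerState> = Box::new(SgdState { lr: 1e-3 });
    let copy = state.box_clone();
    let sgd = copy.as_any().downcast_ref::<SgdState>().unwrap();
    println!("cloned state, lr = {}", sgd.lr);
}

A concrete optimizer would then downcast through as_any_mut inside its CudaOptimizerStepFn and return one update buffer per parameter tensor, which training_loop applies via apply_optimizer_updates.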
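The shuffled mini-batch assembly in training_loop, pulled out as a standalone host-side function: rows are gathered by shuffled index into contiguous buffers before being uploaded with memcpy_htod.

/// Gather the rows named by `indices` from row-major input/target arrays into
/// contiguous batch buffers, as the training loop does each iteration.
fn gather_batch(
    indices: &[usize],
    inputs: &[f32],   // (train_size, input_dims), row-major
    targets: &[f32],  // (train_size, output_dims), row-major
    input_dims: usize,
    output_dims: usize,
) -> (Vec<f32>, Vec<f32>) {
    let mut x = Vec::with_capacity(indices.len() * input_dims);
    let mut y = Vec::with_capacity(indices.len() * output_dims);
    for &i in indices {
        x.extend_from_slice(&inputs[i * input_dims..(i + 1) * input_dims]);
        y.extend_from_slice(&targets[i * output_dims..(i + 1) * output_dims]);
    }
    (x, y)
}

fn main() {
    let inputs = vec![0.0, 1.0, 2.0, 3.0];                       // 4 samples, input_dims = 1
    let targets = vec![0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5];  // output_dims = 2
    let (x, y) = gather_batch(&[2, 0], &inputs, &targets, 1, 2);
    assert_eq!(x, vec![2.0, 0.0]);
    assert_eq!(y, vec![2.0, 2.5, 0.0, 0.5]);
}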
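Finally, the early-stopping rule applied at the end of each epoch, restated as a small struct (EarlyStopper is an illustrative name): an epoch counts as an improvement only if it beats the lowest validation loss by more than min_loss_delta, and training stops after `patience` consecutive epochs without improvement; the returned solution is whatever to_solution captured at the last improvement.

/// Early-stopping bookkeeping matching the loop in training_loop.
struct EarlyStopper {
    lowest_loss: f32,
    min_loss_delta: f32,
    patience: usize,
    epochs_no_improvement: usize,
}

impl EarlyStopper {
    fn new(min_loss_delta: f32, patience: usize) -> Self {
        Self { lowest_loss: f32::INFINITY, min_loss_delta, patience, epochs_no_improvement: 0 }
    }

    /// Returns (is_new_best, should_stop) for this epoch's validation loss.
    fn update(&mut self, val_loss: f32) -> (bool, bool) {
        if val_loss < self.lowest_loss - self.min_loss_delta {
            self.lowest_loss = val_loss;
            self.epochs_no_improvement = 0;
            (true, false)
        } else {
            self.epochs_no_improvement += 1;
            (false, self.epochs_no_improvement >= self.patience)
        }
    }
}

fn main() {
    let mut stop = EarlyStopper::new(1e-7, 2);
    assert_eq!(stop.update(0.5), (true, false));
    assert_eq!(stop.update(0.5), (false, false)); // no improvement beyond the delta
    assert_eq!(stop.update(0.51), (false, true)); // patience exhausted
}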