// Defaults ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ #let dark-black = rgb( 0, 0, 0) #let lite-black = rgb( 16, 16, 16) #let dark-grey = rgb( 64, 64, 64) #let mid-grey = rgb(128, 128, 128) #let lite-grey = rgb(192, 192, 192) #let dark-white = rgb(240, 240, 240) #let blue-white = rgb(240, 248, 255) #let lite-white = rgb(255, 255, 255) #let pure-red = rgb(224, 16, 0) #let dark-red = rgb(255, 64, 16) #let lite-red = rgb(255, 80, 0) #let dark-blue = rgb( 0, 160, 224) #let mid-blue = rgb( 48, 176, 255) // or (0, 160, 224) or (80, 192, 255) or (80, 160, 255) #let lite-blue = rgb( 80, 192, 255) #let dark-orange = rgb(208, 128, 48) // or rgb(240, 144, 64) #let lite-orange = rgb(240, 144, 64) #set page( width: 16cm, height: 9cm, margin: ( x: 0pt, y: 0pt, ) ) #set par( leading: 8pt, spacing: 16pt, justify: false, linebreaks: "optimized" ) #set text( font: "Iosevka Signalis", size: 12pt, tracking: 0.125pt, fallback: false, fill: dark-white ) #set smartquote(enabled: false) #show heading: set text( font: "Roadgeek 2014 Series D", size: 18pt, spacing: 66.67%, fill: dark-red ) #show heading: set block( below: 15pt ) #show raw: set text( font: "Iosevka Signalis Mono", tracking: 0pt ) #show raw.where(block: false): set text(1.00em / 0.8) #show raw.where(block: true ): set text(0.75em / 0.8) #show raw.where(block: true ): set par(leading: 5pt) #set raw( theme: "signalis.tmTheme" ) #show smallcaps: sc => text( font: "Iosevka Signalis Md Ex", size: 0.75em, tracking: 0.0625em, spacing: 100% - 0.0625em, upper(sc) ) #let flipped(s) = { s.clusters().map(c => box(scale(x: -100%, c))).join() } #show raw.where(lang: "demo"): set text(1.0em / 0.8) #show raw.where(lang: "demo"): r => { show regex("@b\d"): it => text(dark-blue, it.text.slice(2)) show regex("@r\d"): it => text(lite-red, it.text.slice(2)) r } #set math.mat(delim: "[") #set math.vec(delim: "[") #show math.equation: set text(font: "Fira Math") // Blocks ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ #let slide(stroke: dark-red, body) = page( fill: lite-black, background: image("img/background.svg") )[ #box( width: 100%, height: 100%, stroke: ( top: 10pt + stroke, rest: 1pt + stroke, ), outset: ( top: -5.0pt, rest: -0.5pt, ), inset: ( top: 20pt, rest: 12pt, ), body ) ] #let rubox(body) = box[ #box( stroke: ( top: 10pt + dark-red, rest: 1pt + dark-red, ), outset: ( top: -5.0pt, rest: -0.5pt, ), inset: ( top: 16pt, bottom: 8pt, x: 8pt, ), body ) ] #let fill-left-label-box( background: lite-white, foreground: dark-black, width: auto, heading, body ) = layout(size => [ #block( width: if width == auto { measure( width: size.width, block(inset: (x: 8pt, y: 8pt), body) ).width } else { width }, below: 0pt, fill: background, outset: ( x: 0pt, y: 0pt, ), inset: ( x: 2pt, y: 2pt, ), text( font: "Roadgeek 2014 Series F", size: 10pt, tracking: -0.25pt, spacing: 50%, fill: foreground, heading ) ) #v(-1pt) #block( width: width, above: 0pt, stroke: 1pt + background, outset: ( x: -0.5pt, y: -0.5pt, ), inset: ( x: 8pt, y: 8pt, ), body ) ]) #let fill-ctr-label-box( background: lite-white, foreground: dark-black, width: auto, heading, body ) = layout(size => [ #block( width: if width == auto { measure( width: size.width, block(inset: (x: 8pt, y: 8pt), body) ).width } else { width }, below: 2pt, fill: background, outset: ( x: 0pt, y: 0pt, ), inset: ( x: 2pt, y: 3pt, ), align( center, text( font: "Roadgeek 2014 Series F", size: 10pt, tracking: -0.25pt, spacing: 50%, fill: foreground, heading ) ) ) #block( width: width, above: 2pt, stroke: 1pt + background, outset: ( x: -0.5pt, y: -0.5pt, ), inset: ( x: 8pt, y: 8pt, ), body ) ]) #let outline-ctr-label-box( background: lite-white, width: auto, heading, body ) = layout(size => [ #block( width: if width == auto { measure( width: size.width, block(inset: (x: 8pt, y: 8pt), body) ).width } else { width }, below: 2pt, stroke: 1pt + background, outset: ( x: -0.5pt, y: -0.5pt, ), inset: ( x: 2pt, y: 4pt, ), align( center, text( font: "Roadgeek 2014 Series F", size: 10pt, tracking: -0.25pt, spacing: 50%, fill: background, heading ) ) ) #block( width: width, above: 2pt, stroke: 1pt + background, outset: ( x: -0.5pt, y: -0.5pt, ), inset: ( x: 8pt, y: 8pt, ), body ) ]) #let ofllbox(width: auto, heading, body) = fill-left-label-box( background: dark-orange, foreground: lite-black, width: width, heading, body ) #let wfllbox(width: auto, heading, body) = fill-left-label-box( background: dark-white, foreground: dark-black, width: width, heading, body ) #let rfllbox(width: auto, heading, body) = fill-left-label-box( background: dark-red, foreground: lite-black, width: width, heading, body ) #let wfclbox(width: auto, heading, body) = fill-ctr-label-box( background: dark-white, foreground: dark-black, width: width, heading, body ) #let rfclbox(width: auto, heading, body) = fill-ctr-label-box( background: dark-red, foreground: dark-black, width: width, heading, body ) #let woclbox(width: auto, heading, body) = outline-ctr-label-box( background: dark-white, width: width, heading, body ) #let roclbox(width: auto, heading, body) = outline-ctr-label-box( background: dark-red, width: width, heading, body ) #let rlabel(width: auto, body) = box( width: width, stroke: 1pt + dark-red, outset: ( x: -0.5pt, y: 1.0pt, ), inset: ( x: 2.5pt, y: 1.5pt, ), text( font: "Roadgeek 2014 Series F", size: 10pt, tracking: -0.25pt, spacing: 50%, fill: dark-red, body ) ) #let wlabel(width: auto, body) = box( width: width, fill: dark-white, outset: ( x: 0.0pt, y: 1.5pt, ), inset: ( x: 2.5pt, y: 1.5pt, ), text( font: "Roadgeek 2014 Series F", size: 10pt, tracking: -0.25pt, spacing: 50%, fill: dark-black, body ) ) #let note(body) = text(fill: mid-grey, body) #let term(body) = text(fill: mid-blue, body) #let hi(body) = text(fill: lite-red, body) #let rust(body) = text(fill: lite-orange, body) #let warning = text(fill: lite-red)[ \[#text( font: "Roadgeek 2014 Series C", size: 1.25em, "ACHTUNG" )\] ] #set list(marker: term[›], spacing: 8pt) #show terms: it => { let max-width = 0em for child in it.children { if max-width < measure(child.term).width { max-width = measure(child.term).width } } for child in it.children { grid( columns: 2, // gutter: 8pt, box( width: max-width + 8pt, fill: dark-white, outset: (y: 1.5pt), inset: (y: 1.5pt), align( center, text( font: "Roadgeek 2014 Series F", size: 10pt, tracking: -0.25pt, spacing: 50%, fill: dark-black, child.term ) ) ), box( stroke: (left: 1pt + dark-white), outset: (y: 1.5pt, left: 0.5pt), inset: (left: 6pt), child.description ) ) } } #let fringe(body) = [ #place(horizon+center, dx: -1.6pt)[#text(fill: rgb( 16, 16, 255))[#body]] #place(horizon+center, dx: +1.6pt)[#text(fill: rgb(240, 0, 0))[#body]] #place(horizon+center, dx: -1.2pt)[#text(fill: rgb( 16, 16, 255))[#body]] #place(horizon+center, dx: +1.2pt)[#text(fill: rgb(240, 0, 0))[#body]] #place(horizon+center, dx: -0.8pt)[#text(fill: rgb( 0, 224, 255))[#body]] #place(horizon+center, dx: +0.8pt)[#text(fill: rgb(255, 240, 0))[#body]] #place(horizon+center, dx: -0.4pt)[#text(fill: rgb( 0, 224, 255))[#body]] #place(horizon+center, dx: +0.4pt)[#text(fill: rgb(255, 240, 0))[#body]] #place(horizon+center, )[#text(fill: rgb(255, 255, 255))[#body]] ] // Slides ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ #page(fill: pure-red)[ #set par(leading: 5pt, spacing: 0pt) #set text( font: "Univers LT Std", size: 18pt, weight: 400, stretch: 50%, tracking: 0.5pt, fill: lite-white ) #align(horizon + center)[ #text(font: "Noto Serif CJK SC", size: 10pt, weight: 600, lang: "zh")[〇] #v(12pt) #text(size: 32pt)[ { #text(font: "HanWangMingBlack", size: 28pt, weight: 700, lang: "zh")[零] } ] #v(18pt) THIS SLIDE \ INTENTIONALLY \ LEFT BLANK. ] ] // Title - - - - - - - - - - - - - - - - #page( fill: lite-black, background: image("img/background.svg") )[ #set par(leading: 8pt, spacing: 8pt) #align(horizon + center)[ #v(24pt) #box( stroke: ( top: 1pt + lite-white, bottom: 1pt + lite-white, rest: none ), outset: ( x: 0pt, y: 12pt, ) )[ #set par(leading: 10pt) #text( font: "Roadgeek 2014 Series B", size: 36pt, fill: blue-white, )[ WRITING A TOP-10 \ CHESS ENGINE IN RUST ] ] #v(31pt) #stack(dir: ltr, spacing: 48pt)[ #image("img/csmr-large.png", height: 17pt) ][ #image("img/krar-large.png", height: 17pt) ] ] ] // Authors - - - - - - - - - - - - - - - #slide[ = UNIT DESIGNATIONS #align( horizon + center, block(align(left + top)[ / COSMO: Author of Viridithas (#smallcaps[spcc] rank 9) \ #note[https://cosmo.tardis.ac] / KORA: Author of Expositor (_defunct_)\ #note[https://typ.dev] ]) ) ] // Content - - - - - - - - - - - - - - - #slide[ = PRELIMINARY DEFINITIONS A #smallcaps[chess engine] is a program that, given the state of a game of chess, attempts to both evaluäte that state and recommend a sequence of moves. #let sq_size = 1.7em #let r(x) = text(fill: dark-red, x) #let b(x) = text(fill: dark-blue, x) #let f(x) = text(fill: dark-black, x) #let l(x) = text(fill: rgb(110, 110, 110), x) #let os = 1.7em - 1pt #let box-l(x) = box(stroke: 1pt + mid-grey, height: os, width: os, x) #let box-d(x) = box(stroke: 1pt + lite-grey, height: os, width: os, x) #align( center + horizon, grid( columns: 2, gutter: 16pt, align(center + horizon)[ #set text(1em * 0.75, font: "Iosevka Signalis Mono") #grid( inset: (x: 3pt, y: 3pt), columns: (sq_size,) * 8, rows: sq_size, align: center + horizon, fill: (x, y) => { let c = if calc.odd(x + y) { "#131313" } else { "#303030" } rgb(c) }, stroke: (x, y) => { if x == 0 { ( left: 0.7pt + dark-black) } if x == 7 { ( right: 0.7pt + dark-black) } if y == 0 { ( top: 0.7pt + dark-black) } if y == 7 { (bottom: 0.7pt + dark-black) } }, // @typstyle off ..( b("R"),b(" "),b("B"),b("Q"),b(" "),b("B"),f(" "),b("R"), b("P"),b("P"),b("P"),b(" "),b(" "),b("K"),b("P"),b("P"), f(" "),b(" "),b("N"),b(" "),f(" "),b(" "),b(" "),b(" "), b(" "),f(" "),b(" "),b("N"),b("P"),b(" "),b(" "),b(" "), r(" "),r(" "),r("B"),r(" "),r(" "),r(" "),r(" "),r(" "), r(" "),f(" "),r(" "),f(" "),r(" "),r("Q"),r(" "),r(" "), r("P"),r("P"),r("P"),r("P"),f(" "),r("P"),r("P"),r("P"), r("R"),r("N"),r("B"),r(" "),r("K"),f(" "),r(" "),r("R"), ) ) ], align(center + horizon)[ ⟨#h(2pt)King to e6, #r[+0.44] in White’s favor#h(2pt)⟩ ], ) ) ] #slide[ = THE COMPONENTS OF AN ENGINE #v(8pt) / STATE: How is the chessboard represented in the program? / SEARCH: Constructing and pruning trees of future possibilities_!_ / EVALUATION: How do we estimate the value of a chess position? ] #slide[ = STATE REPRESENTATION One must be able to - represent the present state of the game - query legal moves - apply transitions caused by moves - determine when & how a game has ended #v(8pt) #note[This list is highly non-exhaustive.] ] #slide[ = TYPES We use #term[enum]s to represent colors, pieces, ranks, files... #box(inset: (left: 12pt))[ ```rust #[derive(...)] #[repr(u8)] enum PieceType { Pawn = 0, Knight = 1, Bishop = 2, Rook = 3, Queen = 4, King = 5, } ``` ] ] #slide[ = TYPES We use #term[enum]s to represent colors, pieces, ranks, files... By doing this, we benefit manifold: / RECTITUDE: Precisely specifying the valid values for a type eliminates swathes of bugs. ```rust Rank::Third != File::C != PieceType::Knight ``` / CONCISION: Methods on enums are a joy to use. ```rust Square::F2.relative_to(Color::Black) == Square::F7 ``` / ALACRITY: The more constraints we have on our data, the more we can optimise. ] #slide[ = INDEXING Suppose we have a type for White _vs_ Black like so: #align(center)[ ```rust enum Color { White = 0, Black = 1, } ``` ] ] #slide[ = INDEXING Suppose we have a type for White _vs_ Black like so: #align(center)[ ```rust enum Color { White = 0, Black = 1, } ``` ] // pause There are many places where we have something like: #align(center)[ ```rust // occupancy : [u64; 2] // side : Color occupancy[side as usize] ``` ] ] #slide[ = INDEXING Typing out #term[usize] every time is annoying, and we’d like to avoid bounds-checks. Fortunately, we can implement the trait #term[std`::`ops`::`Index]#h(1pt)_!_ #box(inset: (left: 12pt))[ ```rust impl std::ops::Index for [T; 2] { type Output = T; fn index(&self, idx : Color) -> &Self::Output { unsafe { self.get_unchecked(idx as usize) } } } ``` ] With this, we can now write “occupancy[side]” without worrying \ about bounds-checks or #term[usize] casts. \ #note[#emph[Typically, LLVM can be relied upon to elide bounds-checks. \ Avoid this pattern unless profiling reveals that elision has failed.]] ] #slide[ = BITBOARDS #columns(2, gutter: 0%, [ #term[Bitboards] are unsigned 64-bit integers that represent _sets of squares_ on the chessboard. #v(8pt) #align(center)[\{A1, C1, D1\}#h(0.5em)⇌ #h(0.5em) .#h(1pt).#h(1pt).#h(1pt)00001101] #v(8pt) Conveniently, 64-bits is the size of the machine word on modern hardware. // todo(cosmo) above line can be repatterned as // set : word // board : cache-line #colbreak() #v(-8pt) #align(center)[ #set text(font: "Iosevka Signalis Mono") #let sq_size = 1.7em #grid( inset: (x: 3pt, y: 3pt), columns: (sq_size,) * 8, rows: sq_size, align: center + horizon, fill: (x, y) => rgb( if calc.odd(x + y) { "#131313" } else { "#303030" }, ), stroke: (x, y) => { if x == 0 { (left: 0.4pt + black) } if x == 7 { (right: 0.4pt + black) } if y == 0 { (top: 0.4pt + black) } if y == 7 { (bottom: 0.4pt + black) } }, // @typstyle off ..( 56,57,58,59,60,61,62,63, 48,49,50,51,52,53,54,55, 40,41,42,43,44,45,46,47, 32,33,34,35,36,37,38,39, 24,25,26,27,28,29,30,31, 16,17,18,19,20,21,22,23, 08,09,10,11,12,13,14,15, 00,01,02,03,04,05,06,07, ).map(it => if str(it).len() == 1 [0] else [ ] + str(it)) ) ] ]) ] #slide[ = BITBOARDS Typically one stores a chess position using 8 bitboards. #box(inset: (left: 12pt))[ ```rust struct Position { sides : [Bitboard; 2], // occupancy of each side pieces : [Bitboard; 6], // occupancy of each piece-type } ``` ] ] #slide[ = BITBOARDS Typically one stores a chess position using 8 bitboards. #box(inset: (left: 12pt))[ ```rust struct Position { sides : [Bitboard; 2], // occupancy of each side pieces : [Bitboard; 6], // occupancy of each piece-type } ``` ] // pause Then a query like “how many unblocked pawns are there” is merely: #box(inset: (left: 12pt))[ ```rust let pawns = sides[Color::White] & pieces[PieceType::Pawn]; let movable_pawns = pawns & !sides[Color::Black].south_one(); return pawns.count_ones(); ``` ] // What might be a pair of nested loops in a naïve implementation is just // three array accesses, a handful of bitwise operations, and popcnt. And // that might bring us on to popcnt! ] #slide[ = ITERATION Given a bitboard (a square-set), how would you efficiently loop over the squares present in the set? ] #slide[ = BIT MANIPULATION // https://tearth.dev/posts/performance-of-bit-manipulation-instructions-bmi/ / TZCNT: Count the number of trailing zeroes. / BLSR: Clear the least-significant set bit. / POPCOUNT: Count the number of set bits. #box(inset: (left: 12pt))[ ```demo tzcnt(101101@r0@r0) == @r2 blsr(10110@r100) == 0b10110@r000 popcount(@r10@r1@r10@r100) == @r4 ``` ] #smallcaps[TZCNT] and #smallcaps[BLSR] can be paired to form efficient loops over set bits. ] #slide[ = BIT MANIPULATION #smallcaps[TZCNT] and #smallcaps[BLSR] can be paired to form efficient loops over set bits. #box(inset: (left: 12pt))[ ```rust impl Iterator for SquareIter { fn next(&mut self) -> Option { if self.value == 0 { None } else { let lsb = self.value.trailing_zeros() as u8; // TZCNT self.value &= self.value - 1; // BLSR Square::new(lsb) } } } ``` ] ] #slide[ = BIT MANIPULATION Abstraction in hand, we can loop over square-sets with maximum efficiency_!_ #box(inset: (left: 12pt))[ ```rust let our_knights = board.pieces[Knight] & board.sides[side]; for src in our_knights { let moves = knight_attacks(src); for tgt in moves & !blockers { moves.push(Move::new(src, tgt)); } } ``` ] ] #slide[ = EVALUÄTION Modern chess engines use Efficiently Updatable Neural Networks #box[#move(dy: 1pt, "(")]#term[#smallcaps[#flipped("NNUE")]]#box[#move(dy: 1pt, ")")]. The input features of the network are entirely binary, therefore whenever a piece moves, we simply add or subtract columns of the embedding matrix to generate a new hidden state from an old one. #let colr(x) = text(fill: lite-red, $#x$) $ mat( dots.h.c, dots.h.c, dots.h.c, dots.h.c, dots.h.c, dots.h.c; dots.h.c, dots.h.c, dots.h.c, dots.h.c, dots.h.c, dots.h.c; dots.h.c, dots.h.c, dots.h.c, dots.h.c, dots.h.c, dots.h.c; dots.h.c, dots.h.c, dots.h.c, dots.h.c, dots.h.c, dots.h.c; ) space dot underbrace(vec(0, 1, 0, 0, 1, 0), "BOARD") = vec(a, b, c, d) space space space -> space space space mat( dots.h.c, dots.h.c, colr(e_02), dots.h.c, dots.h.c, dots.h.c; dots.h.c, dots.h.c, colr(e_12), dots.h.c, dots.h.c, dots.h.c; dots.h.c, dots.h.c, colr(e_22), dots.h.c, dots.h.c, dots.h.c; dots.h.c, dots.h.c, colr(e_32), dots.h.c, dots.h.c, dots.h.c; ) space dot underbrace(vec(0, 1, colr(1), 0, 1, 0), "BOARD") = vec(a, b, c, d) + vec(colr(e_02), colr(e_12), colr(e_22), colr(e_32)) $ ] #slide[ = OPTIMIZATION OF INFERENCE The most expensive part of the engine is a 1024 → 32 fully-connected layer. $ underbrace( mat( arrow.tl, arrow.t, arrow.tr; arrow.l, 32 × 1024, arrow.r; arrow.bl, arrow.b, arrow.br; ), "WEIGHTS" ) space dot underbrace(vec(arrow.t, , 1024, , arrow.b), "INPUT") = underbrace(vec(arrow.t, 32, arrow.b), "OUTPUT") $ ] #slide[ = OPTIMIZATION OF INFERENCE The most expensive part of the engine is a 1024 → 32 fully-connected layer. $ underbrace( mat( arrow.tl, arrow.t, arrow.tr; arrow.l, 32 × 1024, arrow.r; arrow.bl, arrow.b, arrow.br; ), "WEIGHTS" ) space dot underbrace(vec(arrow.t, , 1024, , arrow.b), "INPUT") = underbrace(vec(arrow.t, 32, arrow.b), "OUTPUT") $ / QUANTIZATION: We compress the weights and inputs to `i8`. ] #slide[ = OPTIMIZATION OF INFERENCE The most expensive part of the engine is a 1024 → 32 fully-connected layer. $ underbrace( mat( arrow.tl, arrow.t, arrow.tr; arrow.l, 32 × 1024, arrow.r; arrow.bl, arrow.b, arrow.br; ), "WEIGHTS" ) space dot underbrace(vec(arrow.t, , 1024, , arrow.b), "INPUT") = underbrace(vec(arrow.t, 32, arrow.b), "OUTPUT") $ / QUANTIZATION: We compress the weights and inputs to `i8`. / SPARSITY: We train the input activations to be mostly zero. ] #slide[ = OPTIMIZATION OF INFERENCE The most expensive part of the engine is a 1024 → 32 fully-connected layer. $ underbrace( mat( arrow.tl, arrow.t, arrow.tr; arrow.l, 32 × 1024, arrow.r; arrow.bl, arrow.b, arrow.br; ), "WEIGHTS" ) space dot underbrace(vec(arrow.t, , 1024, , arrow.b), "INPUT") = underbrace(vec(arrow.t, 32, arrow.b), "OUTPUT") $ // pause / QUANTIZATION: We compress the weights and inputs to `i8`. // pause / SPARSITY: We train the input activations to be mostly zero. // pause / SIMD: We find and index the non-zero activations in parallel. ] #slide[ = DEDUPLICATION Many copies of an engine may live and play simultaneously inside the same machine. Reading so many copies of the network causes cache pressure, \ so we map the same copy of the weights into all processes. ] #slide[ = DEDUPLICATION Many copies of an engine may live and play simultaneously inside the same machine. Reading so many copies of the network causes cache pressure, \ so we map the same copy of the weights into all processes. // pause #box(inset: (left: 12pt))[ ```rust fn weights() -> &'static Network { static MAP : OnceLock = OnceLock::new(); MAP.get_or_init(|| { if !network_path.exists() { decompress_network_into(&network_path) } unsafe { Mmap::map(network_path) } }) // transmutation and inter-process coördination elided } ``` ] ] #slide[ = THE SAFE ELIMINATION OF BOUNDS CHECKS #align( center, wfclbox("TASK")[Sort a list of moves by quality, best-to-worst.] ) #align( center, ```rust moves.sort_by_key(|m| { (exchange_winning(m), history(m)) // (bool, i32) }); ``` ) ] #slide[ = THE SAFE ELIMINATION OF BOUNDS CHECKS #align( center, wfclbox("TASK")[Sort a list of moves by quality, best-to-worst.] ) #align( center, ```rust moves.sort_by_key(|m| { (exchange_winning(m), history(m)) // (bool, i32) }); ``` ) #v(16pt) / REFINEMENT A: We usually only need the first 1–2 elements of the sorted list. ] #slide[ = THE SAFE ELIMINATION OF BOUNDS CHECKS #align( center, wfclbox("TASK")[Sort a list of moves by quality, best-to-worst.] ) #align( center, ```rust moves.sort_by_key(|m| { (exchange_winning(m), history(m)) // (bool, i32) }); ``` ) #v(16pt) / REFINEMENT A: We usually only need the first 1–2 elements of the sorted list. / REFINEMENT B: The sorting key is dominated by exchange_winning. ] #slide[ = THE SAFE ELIMINATION OF BOUNDS CHECKS #align( center, wfclbox("TASK")[Sort a list of moves by quality, best-to-worst.] ) #align( center, ```rust moves.sort_by_key(|m| { (exchange_winning(m), history(m)) // (bool, i32) }); ``` ) // pause #v(16pt) / REFINEMENT A: We usually only need the first 1–2 elements of the sorted list. // pause / REFINEMENT B: The sorting key is dominated by exchange_winning. // pause / REFINEMENT C: Evaluating exchange_winning is very costly. ] #slide[ = THE SAFE ELIMINATION OF BOUNDS CHECKS ```rust // select the maximal element each time around the loop while let Some((idx, max)) = moves.iter() .enumerate() .max_by_key(|(_, m)| history(m)) { // max is only the max-history move prior if !exchange_winning(max) { set_bad(max); continue; } emit(max); moves.swap(0, idx); // :( moves = &mut moves[1..]; } ``` // :) ] #slide[ = THE SAFE ELIMINATION OF BOUNDS CHECKS ```rust // select the maximal element each time around the loop while let Some((idx, max)) = moves.iter() .enumerate() .max_by_key(|(_, m)| history(m)) { // max is only the max-history move prior if !exchange_winning(max) { set_bad(max); continue; } emit(max); moves.swap(0, idx); // :( moves = &mut moves[1..]; } ``` // :) // pause Frustratingly, moves.swap(0, idx) emits a bounds-check for idx. ] #slide[ = THE SAFE ELIMINATION OF BOUNDS CHECKS Ideally, we'd just write through our max reference#h(2pt)—#h(2pt)we could just mem`::`swap it with element zero. #box(inset: (left: 12pt))[ ```rust while let Some(max) = moves.iter_mut().max_by_key(history) { if !exchange_winning(max) { set_bad(max); continue; } emit(max); if done() { break; } std::mem::swap(&mut moves[0], max); // ^^^^^^^^^^^^^ fails to borrow-check moves = &mut moves[1..]; } ``` ] ] #slide[ = THE SAFE ELIMINATION OF BOUNDS CHECKS Ideally, we'd just write through our max reference#h(2pt)—#h(2pt)we could just mem`::`swap it with element zero. #box(inset: (left: 12pt))[ ```rust while let Some(max) = moves.iter_mut().max_by_key(history) { if !exchange_winning(max) { set_bad(max); continue; } emit(max); if done() { break; } std::mem::swap(&mut moves[0], max); // ^^^^^^^^^^^^^ fails to borrow-check moves = &mut moves[1..]; } ``` ] // pause This doesn’t work#h(2pt)—#h(2pt)max is uniquely borrowing moves, so “\&mut moves[0]” fails to borrow-check. ] #slide[ = CELLS _A mutable memory location._ #box(inset: (left: 12pt))[ ```rust #[repr(transparent)] pub struct Cell { value : UnsafeCell, } impl Cell { pub fn new(value : T) -> Cell; pub fn get(&self) -> T where T : Copy; pub fn set(&self, value : T); // ^^^^^ a shared borrow! } ``` ] ] #slide[ = THE SLICE-CELL TRICK ```rust // &mut [T] -> &Cell<[T]> -> &[Cell] let mut moves = Cell::from_mut(&mut moves).as_slice_of_cells(); while let Some(max) = moves.iter().max_by_key(|m| history(m.get())) { if !exchange_winning(max) { set_bad(max); continue; } emit(max); if done() { break; } Cell::swap(&moves[0], max); moves = &moves[1..]; } ``` Because we can make multiple _shared_ borrows of moves, all is well in the world, and this routine emits no bounds-checks. ] #slide[ = THE TRACK MACRO A lot goes on inside a chess engine_!_ #box(inset: (left: 12pt))[ ```rust // what’s this value on average? if cheap_heuristic(...) > 42 { // how often does this fire? if expensive_heuristic(...) { return; } } ``` ] ] #slide[ = THE TRACK MACRO With #term[macros], one can trace and aggregate statistics in an _ad-hoc_ manner. ] #slide[ = THE TRACK MACRO With #term[macros], one can trace and aggregate statistics in an _ad-hoc_ manner. // pause #box(inset: (left: 12pt))[ ```rust macro_rules! track { ($name:expr; $v:expr) => {{ #[distributed_slice(TRACKED_VALUES)] static ENTRY : TrackedValue = TrackedValue::new(const { $name }); let value = $v; ENTRY.record(value as i64); value }}; ($v:expr) => {{ track!(stringify!($v); $v) }}; } ``` ] ] #slide[ = DISTRIBUTED SLICE The #term[linkme] crate provides the \#\[distributed_slice\] macro, \ which is _incredible_. #box(inset: (left: 12pt))[ ```rust #[distributed_slice] static CONSTANTS : [i32]; #[distributed_slice(CONSTANTS)] // foo.rs static A : i32 = 45; #[distributed_slice(CONSTANTS)] // bar.rs static B : i32 = 20; #[distributed_slice(CONSTANTS)] // baz.rs static C : i32 = 12; // main.rs println!("{:?}", CONSTANTS); // -> [20, 45, 12] ``` ] ] #slide[ = THE TRACK MACRO The track#emph[!] macro behaves just like the identity function_!_ #box(inset: (left: 12pt))[ ```rust if track!(cheap_heuristic(...)) > 42 { if track!(expensive_heuristic()) { return; } } ``` ] ] #page( fill: lite-black, background: image("img/background.svg"), margin: (x: 0pt, top: 0pt, bottom: 0pt), image("img/track-macro-output.svg", width: 100%), ) #slide[ = SHARED CACHE _Or the “transposition table”, in traditional parlance._ \ #note[ (_The size of the transposition table is also known as the “hash size”,_ \ _despite the fact that the data structure is not a hash map but a cache._) ] The cache is the only means by which threads communicate during search. #grid( columns: 2, gutter: 48pt, [```rust #[repr(C)] struct CacheEntry { tag : u16, move : Option, eval : i16, score : i16, depth : u8, meta : PackedMetadata, } ```], [```rust #[repr(C, align(32))] struct CacheSet { entries : [CacheEntry; 3], padding : [u8; 2], // to avoid UB } struct Cache { sets : Vec } ```] ) ] #slide[ = SHARED CACHE #grid( columns: 2, gutter: 48pt, [```rust #[repr(C)] struct CacheEntry { tag : u16, move : Option, eval : i16, score : i16, depth : u8, meta : PackedMetadata, } ```], [```rust #[repr(C, align(32))] struct CacheSet { entries : [CacheEntry; 3], padding : [u8; 2], // to avoid UB } struct Cache { sets : Vec } ```] ) ```rust fn derive_index_tag(num_sets : u64, key : u64) -> (usize, u16) ``` ] #slide[ = ACCESSING THE CACHE We’ve two options: - Guard access with locks. \ #note[#emph[The compiler can optimize our lookup into a single 256-bit load if that’s better, but the performance penalty from obtaining a lock is unacceptable] (#emph[or at least unnecessary, however slight it may be#h(2pt)—#h(2pt)hence unacceptable]).] ] #slide[ = ACCESSING THE CACHE We’ve two options: - Guard access with locks. \ #note[#emph[The compiler can optimize our lookup into a single 256-bit load if that’s better, but the performance penalty from obtaining a lock is unacceptable] (#emph[or at least unnecessary, however slight it may be#h(2pt)—#h(2pt)hence unacceptable]).] // pause #v(0pt) - Use atomics. \ #note[#emph[Four 64-bit loads, which appears to be as fast as anything else in practice.]] ] #slide[ = ACCESSING THE CACHE ATOMICALLY #box(inset: (left: 12pt))[ ```rust pub fn lookup(&self, key : u64) -> CacheEntry { let (index, tag) = derive_index_tag(self.num_sets, key); let b0 = self.sets[index].block[0].load(Relaxed); let b1 = self.sets[index].block[1].load(Relaxed); let b2 = self.sets[index].block[2].load(Relaxed); let b3 = self.sets[index].block[3].load(Relaxed); let cache_set = unsafe { transmute::<[u64; 4], CacheSet>([b0, b1, b2, b3]) }; for entry in cache_set.entries { if entry.tag == tag { return entry; } } return CacheEntry::ZERO; } ``` ] ] #slide[ = THE DUALITY OF MEM #v(16pt) #rfclbox(width: 100%, "FEARLESS CONCURRENCY")[ Rust is a great systems language because of its powerful safe primitives #note[(like Arc, Condvar, Mutex, OnceLock, and so on)]. ] ] #slide[ = THE DUALITY OF MEM #v(16pt) #rfclbox(width: 100%, "FEARLESS CONCURRENCY")[ Rust is a great systems language because of its powerful safe primitives #note[(like Arc, Condvar, Mutex, OnceLock, and so on)]. ] // pause #v(8pt) #rfclbox(width: 100%, "FEARFUL CONCURRENCY")[ Rust is _also_ a great systems language because it takes you seriously as a user and lets you create your own primitives_!_ #note[(After all, the standard library is largely implemented in ordinary Rust.)] ] ] #slide[ #align(horizon)[ #align(center, warning) #align(center, emph[ When you use #term[unsafe], the compiler will not check that \ soundness is upheld; you are responsible for doing so. ]) ] ] #page( fill: rgb(8, 8, 8), )[ #set par(leading: 5pt, spacing: 0pt) #set text( font: "Univers LT Std", size: 18pt, weight: 400, stretch: 50%, tracking: 1pt, fill: rgb(192, 192, 192), ) #fringe[ DU HAST VERSPROCHEN ] ] #page( fill: rgb(224, 16, 0), )[ #set par(leading: 5pt, spacing: 0pt) #set text( font: "Univers LT Std", size: 18pt, weight: 400, stretch: 50%, tracking: 0.5pt, fill: rgb(255, 255, 255), ) #align(horizon + center)[ REMEMBER YOUR PROMISE ] ] #slide[ #align(center + horizon)[ #wfclbox("INJUNCTION")[READ MARA’S BOOK.] ] ] #slide[ = TORN READS The widest atomic type is AtomicU64 (eight bytes), but an entry is ten bytes and a set is thirty(-two) bytes, so neither can be accessed atomically. Reading an entry requires multiple loads, and in theory, if another thread is issuing stores at the same time, part of the data we read might be from the ongoing write and part of the data might be from a previous write. #wfllbox("NOTE")[ Even if torn reads weren’t possible, since we store too few tag bits to recover the key, we have to handle entries that appear valid but are in fact invalid. #note[(There’s also the very small chance of key collisions.)] ] ] #slide[ = TORN READS We don’t have to mitigate this, amazingly enough. - It’s safe because all bit patterns are valid. - The performance\* impact is small (but enough that it’s worth countering). #note[#emph[Why might this be so? We already expect the engine to be mistaken in its evaluätion of positions, and reading an incorrect entry is simply another misevaluätion, which merely degrades the quality of analysis but does not threaten algorithmic correctness.]] We can reject the vast majority of incorrect lookups by checking the legality of the stored move. #note[#emph[This guards against torn reads, tag aliasing, and key collisions, but imperfectly#h(2pt)—#h(2pt)a move might be legal in two colliding positions.]] #align(bottom, text(size: 8pt)[ \*#h(0.25em)#note[Game-playing strength measured in Elo rating points.] ]) ] #slide[ = ATOMIC PER BYTE RFC #strong[3301] proposes an “atomic memcpy” that permits tearing. #note[ #emph[It might also provide the means to read uninitialized memory, allowing more performant implementations of data structures like sparse sets.] ] ] #slide[ = RUST THROUGH THE AGES We both started using Rust in January 2021. What’s changed since then? #grid( columns: (3em, auto), column-gutter: 10pt, row-gutter: 7pt, align: (right, auto), text(fill: dark-blue, "1.51"), [const generics · {integer}`::`unsigned_abs], text(fill: dark-blue, "1.58"), [identifiers in format strings], text(fill: dark-blue, "1.59"), [inline assembly · destructuring assignments], text(fill: dark-blue, "1.60"), [{integer}`::`abs_diff], text(fill: dark-blue, "1.65"), [let-else statements · break in labelled blocks], text(fill: dark-blue, "1.79"), [inline const expressions], text(fill: dark-blue, "1.82"), [raw pointer syntax], text(fill: dark-blue, "1.87"), [{integer}`::`is_multiple_of · {integer}`::`midpoint], text(fill: dark-blue, "1.88"), [let chains], text(fill: dark-blue, "1.92"), [Box`::`new_zeroed], // show example text(fill: dark-blue, "1.93"), [[T]`::`as_array], ) ] // Final - - - - - - - - - - - - - - - - #page(fill: dark-black)[ // rgb(192, 192, 200) #set par(leading: 7.5pt, spacing: 0pt) #set text( font: "Univers LT Std", size: 27pt, weight: 400, stretch: 50%, tracking: 1pt, fill: lite-white, // rgb(248, 248, 255) ) #fringe[ #text(font: "Noto Serif CJK SC", size: 15pt, weight: 600, lang: "zh")[△] // or □ or ☖? #v(18pt) #text(size: 48pt)[ { #box[#move(dx: 2pt, dy: 4pt)[ #text(font: "Noto Serif CJK SC", size: 42pt, weight: 700, lang: "zh")[终] // or 完? ]] } ] #v(27pt) THIS SLIDE \ INTENTIONALLY \ LEFT BLANK. ] ]