// aes/soft/fixslice32.rs

//! Fixsliced implementations of AES-128, AES-192 and AES-256 (32-bit)
//! adapted from the C implementation.
//!
//! All implementations are fully bitsliced and do not rely on any
//! Look-Up Table (LUT).
//!
//! See the paper at <https://eprint.iacr.org/2020/1123.pdf> for more details.
//!
//! # Author (original C code)
//!
//! Alexandre Adomnicai, Nanyang Technological University, Singapore
//! <alexandre.adomnicai@ntu.edu.sg>
//!
//! Originally licensed MIT. Relicensed as Apache 2.0+MIT with permission.

#![allow(clippy::unreadable_literal)]

use crate::Block;
use cipher::{
    consts::{U16, U24, U32},
    generic_array::GenericArray,
};
use core::convert::TryInto;

/// AES block batch size for this implementation
pub(crate) const FIXSLICE_BLOCKS: usize = 2;

/// AES-128 round keys
pub(crate) type FixsliceKeys128 = [u32; 88];

/// AES-192 round keys
pub(crate) type FixsliceKeys192 = [u32; 104];

/// AES-256 round keys
pub(crate) type FixsliceKeys256 = [u32; 120];
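
// Each round key occupies 8 words (one bitsliced 256-bit state covering both
// blocks), so the arrays above hold 11, 13 and 15 round keys respectively:
// 8 * 11 = 88, 8 * 13 = 104, 8 * 15 = 120.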

/// 256-bit internal state
pub(crate) type State = [u32; 8];

/// Fully bitsliced AES-128 key schedule to match the fully-fixsliced representation.
pub(crate) fn aes128_key_schedule(key: &GenericArray<u8, U16>) -> FixsliceKeys128 {
    let mut rkeys = [0u32; 88];

    bitslice(&mut rkeys[..8], key, key);

    let mut rk_off = 0;
    for rcon in 0..10 {
        memshift32(&mut rkeys, rk_off);
        rk_off += 8;

        sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]);
        sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]);

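        // Round constants x^0..x^7 in GF(2^8) are a single bit; for rcon = 8
        // and 9 the constants are 0x1b and 0x36 respectively, whose set bits
        // are added individually below.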
        if rcon < 8 {
            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon);
        } else {
            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 8);
            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 7);
            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 5);
            add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon - 4);
        }

        xor_columns(&mut rkeys, rk_off, 8, ror_distance(1, 3));
    }

    // Adjust to match fixslicing format
    #[cfg(feature = "compact")]
    {
        for i in (8..88).step_by(16) {
            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
        }
    }
    #[cfg(not(feature = "compact"))]
    {
        for i in (8..72).step_by(32) {
            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
            inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]);
            inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]);
        }
        inv_shift_rows_1(&mut rkeys[72..80]);
    }

    // Account for NOTs removed from sub_bytes
    for i in 1..11 {
        sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]);
    }

    rkeys
}

/// Fully bitsliced AES-192 key schedule to match the fully-fixsliced representation.
pub(crate) fn aes192_key_schedule(key: &GenericArray<u8, U24>) -> FixsliceKeys192 {
    let mut rkeys = [0u32; 104];
    let mut tmp = [0u32; 8];

    bitslice(&mut rkeys[..8], &key[..16], &key[..16]);
    bitslice(&mut tmp, &key[8..], &key[8..]);

    let mut rcon = 0;
    let mut rk_off = 8;

    loop {
        for i in 0..8 {
            rkeys[rk_off + i] =
                (0x0f0f0f0f & (tmp[i] >> 4)) | (0xf0f0f0f0 & (rkeys[(rk_off - 8) + i] << 4));
        }

        sub_bytes(&mut tmp);
        sub_bytes_nots(&mut tmp);

        add_round_constant_bit(&mut tmp, rcon);
        rcon += 1;

        for i in 0..8 {
            let mut ti = rkeys[rk_off + i];
            ti ^= 0x30303030 & ror(tmp[i], ror_distance(1, 1));
            ti ^= 0xc0c0c0c0 & (ti << 2);
            tmp[i] = ti;
        }
        rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp);
        rk_off += 8;

        for i in 0..8 {
            let ui = tmp[i];
            let mut ti = (0x0f0f0f0f & (rkeys[(rk_off - 16) + i] >> 4)) | (0xf0f0f0f0 & (ui << 4));
            ti ^= 0x03030303 & (ui >> 6);
            tmp[i] =
                ti ^ (0xfcfcfcfc & (ti << 2)) ^ (0xf0f0f0f0 & (ti << 4)) ^ (0xc0c0c0c0 & (ti << 6));
        }
        rkeys[rk_off..(rk_off + 8)].copy_from_slice(&tmp);
        rk_off += 8;

        sub_bytes(&mut tmp);
        sub_bytes_nots(&mut tmp);

        add_round_constant_bit(&mut tmp, rcon);
        rcon += 1;

        for i in 0..8 {
            let mut ti = (0x0f0f0f0f & (rkeys[(rk_off - 16) + i] >> 4))
                | (0xf0f0f0f0 & (rkeys[(rk_off - 8) + i] << 4));
            ti ^= 0x03030303 & ror(tmp[i], ror_distance(1, 3));
            rkeys[rk_off + i] =
                ti ^ (0xfcfcfcfc & (ti << 2)) ^ (0xf0f0f0f0 & (ti << 4)) ^ (0xc0c0c0c0 & (ti << 6));
        }
        rk_off += 8;

        if rcon >= 8 {
            break;
        }

        for i in 0..8 {
            let ui = rkeys[(rk_off - 8) + i];
            let mut ti = rkeys[(rk_off - 16) + i];
            ti ^= 0x30303030 & (ui >> 2);
            ti ^= 0xc0c0c0c0 & (ti << 2);
            tmp[i] = ti;
        }
    }

    // Adjust to match fixslicing format
    #[cfg(feature = "compact")]
    {
        for i in (8..104).step_by(16) {
            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
        }
    }
    #[cfg(not(feature = "compact"))]
    {
        for i in (0..96).step_by(32) {
            inv_shift_rows_1(&mut rkeys[(i + 8)..(i + 16)]);
            inv_shift_rows_2(&mut rkeys[(i + 16)..(i + 24)]);
            inv_shift_rows_3(&mut rkeys[(i + 24)..(i + 32)]);
        }
    }

    // Account for NOTs removed from sub_bytes
    for i in 1..13 {
        sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]);
    }

    rkeys
}

/// Fully bitsliced AES-256 key schedule to match the fully-fixsliced representation.
pub(crate) fn aes256_key_schedule(key: &GenericArray<u8, U32>) -> FixsliceKeys256 {
    let mut rkeys = [0u32; 120];

    bitslice(&mut rkeys[..8], &key[..16], &key[..16]);
    bitslice(&mut rkeys[8..16], &key[16..], &key[16..]);

    let mut rk_off = 8;

    let mut rcon = 0;
    loop {
        memshift32(&mut rkeys, rk_off);
        rk_off += 8;

        sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]);
        sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]);

        add_round_constant_bit(&mut rkeys[rk_off..(rk_off + 8)], rcon);
        xor_columns(&mut rkeys, rk_off, 16, ror_distance(1, 3));
        rcon += 1;

        if rcon == 7 {
            break;
        }

        memshift32(&mut rkeys, rk_off);
        rk_off += 8;

        sub_bytes(&mut rkeys[rk_off..(rk_off + 8)]);
        sub_bytes_nots(&mut rkeys[rk_off..(rk_off + 8)]);

        xor_columns(&mut rkeys, rk_off, 16, ror_distance(0, 3));
    }

    // Adjust to match fixslicing format
    #[cfg(feature = "compact")]
    {
        for i in (8..120).step_by(16) {
            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
        }
    }
    #[cfg(not(feature = "compact"))]
    {
        for i in (8..104).step_by(32) {
            inv_shift_rows_1(&mut rkeys[i..(i + 8)]);
            inv_shift_rows_2(&mut rkeys[(i + 8)..(i + 16)]);
            inv_shift_rows_3(&mut rkeys[(i + 16)..(i + 24)]);
        }
        inv_shift_rows_1(&mut rkeys[104..112]);
    }

    // Account for NOTs removed from sub_bytes
    for i in 1..15 {
        sub_bytes_nots(&mut rkeys[(i * 8)..(i * 8 + 8)]);
    }

    rkeys
}

/// Fully-fixsliced AES-128 decryption (InvShiftRows is completely omitted).
///
/// Decrypts two blocks in-place and in parallel.
pub(crate) fn aes128_decrypt(rkeys: &FixsliceKeys128, blocks: &mut [Block]) {
    debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1]);

    add_round_key(&mut state, &rkeys[80..]);
    inv_sub_bytes(&mut state);

    #[cfg(not(feature = "compact"))]
    {
        inv_shift_rows_2(&mut state);
    }

    let mut rk_off = 72;
    loop {
        #[cfg(feature = "compact")]
        {
            inv_shift_rows_2(&mut state);
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_1(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;

        if rk_off == 0 {
            break;
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_0(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;

        #[cfg(not(feature = "compact"))]
        {
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_3(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;

            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_2(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;
        }
    }

    add_round_key(&mut state, &rkeys[..8]);

    inv_bitslice(&state, blocks);
}

/// Fully-fixsliced AES-128 encryption (ShiftRows is completely omitted).
///
/// Encrypts two blocks in-place and in parallel.
pub(crate) fn aes128_encrypt(rkeys: &FixsliceKeys128, blocks: &mut [Block]) {
    debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1]);

    add_round_key(&mut state, &rkeys[..8]);

    let mut rk_off = 8;
    loop {
        sub_bytes(&mut state);
        mix_columns_1(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;

        #[cfg(feature = "compact")]
        {
            shift_rows_2(&mut state);
        }

        if rk_off == 80 {
            break;
        }

        #[cfg(not(feature = "compact"))]
        {
            sub_bytes(&mut state);
            mix_columns_2(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;

            sub_bytes(&mut state);
            mix_columns_3(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;
        }

        sub_bytes(&mut state);
        mix_columns_0(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;
    }

    #[cfg(not(feature = "compact"))]
    {
        shift_rows_2(&mut state);
    }

    sub_bytes(&mut state);
    add_round_key(&mut state, &rkeys[80..]);

    inv_bitslice(&state, blocks);
}

/// Fully-fixsliced AES-192 decryption (InvShiftRows is completely omitted).
///
/// Decrypts two blocks in-place and in parallel.
pub(crate) fn aes192_decrypt(rkeys: &FixsliceKeys192, blocks: &mut [Block]) {
    debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1]);

    add_round_key(&mut state, &rkeys[96..]);
    inv_sub_bytes(&mut state);

    let mut rk_off = 88;
    loop {
        #[cfg(feature = "compact")]
        {
            inv_shift_rows_2(&mut state);
        }
        #[cfg(not(feature = "compact"))]
        {
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_3(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;

            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_2(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_1(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;

        if rk_off == 0 {
            break;
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_0(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;
    }

    add_round_key(&mut state, &rkeys[..8]);

    inv_bitslice(&state, blocks);
}

/// Fully-fixsliced AES-192 encryption (ShiftRows is completely omitted).
///
/// Encrypts two blocks in-place and in parallel.
pub(crate) fn aes192_encrypt(rkeys: &FixsliceKeys192, blocks: &mut [Block]) {
    debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1]);

    add_round_key(&mut state, &rkeys[..8]);

    let mut rk_off = 8;
    loop {
        sub_bytes(&mut state);
        mix_columns_1(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;

        #[cfg(feature = "compact")]
        {
            shift_rows_2(&mut state);
        }
        #[cfg(not(feature = "compact"))]
        {
            sub_bytes(&mut state);
            mix_columns_2(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;

            sub_bytes(&mut state);
            mix_columns_3(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;
        }

        if rk_off == 96 {
            break;
        }

        sub_bytes(&mut state);
        mix_columns_0(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;
    }

    sub_bytes(&mut state);
    add_round_key(&mut state, &rkeys[96..]);

    inv_bitslice(&state, blocks);
}

/// Fully-fixsliced AES-256 decryption (InvShiftRows is completely omitted).
///
/// Decrypts two blocks in-place and in parallel.
pub(crate) fn aes256_decrypt(rkeys: &FixsliceKeys256, blocks: &mut [Block]) {
    debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1]);

    add_round_key(&mut state, &rkeys[112..]);
    inv_sub_bytes(&mut state);

    #[cfg(not(feature = "compact"))]
    {
        inv_shift_rows_2(&mut state);
    }

    let mut rk_off = 104;
    loop {
        #[cfg(feature = "compact")]
        {
            inv_shift_rows_2(&mut state);
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_1(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;

        if rk_off == 0 {
            break;
        }

        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        inv_mix_columns_0(&mut state);
        inv_sub_bytes(&mut state);
        rk_off -= 8;

        #[cfg(not(feature = "compact"))]
        {
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_3(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;

            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            inv_mix_columns_2(&mut state);
            inv_sub_bytes(&mut state);
            rk_off -= 8;
        }
    }

    add_round_key(&mut state, &rkeys[..8]);

    inv_bitslice(&state, blocks);
}

/// Fully-fixsliced AES-256 encryption (ShiftRows is completely omitted).
///
/// Encrypts two blocks in-place and in parallel.
pub(crate) fn aes256_encrypt(rkeys: &FixsliceKeys256, blocks: &mut [Block]) {
    debug_assert_eq!(blocks.len(), FIXSLICE_BLOCKS);
    let mut state = State::default();

    bitslice(&mut state, &blocks[0], &blocks[1]);

    add_round_key(&mut state, &rkeys[..8]);

    let mut rk_off = 8;
    loop {
        sub_bytes(&mut state);
        mix_columns_1(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;

        #[cfg(feature = "compact")]
        {
            shift_rows_2(&mut state);
        }

        if rk_off == 112 {
            break;
        }

        #[cfg(not(feature = "compact"))]
        {
            sub_bytes(&mut state);
            mix_columns_2(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;

            sub_bytes(&mut state);
            mix_columns_3(&mut state);
            add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
            rk_off += 8;
        }

        sub_bytes(&mut state);
        mix_columns_0(&mut state);
        add_round_key(&mut state, &rkeys[rk_off..(rk_off + 8)]);
        rk_off += 8;
    }

    #[cfg(not(feature = "compact"))]
    {
        shift_rows_2(&mut state);
    }

    sub_bytes(&mut state);
    add_round_key(&mut state, &rkeys[112..]);

    inv_bitslice(&state, blocks);
}

/// Note that the four bitwise NOTs (`^= 0xffffffff`) are accounted for here so
/// that it is a true inverse of `sub_bytes`.
fn inv_sub_bytes(state: &mut [u32]) {
    debug_assert_eq!(state.len(), 8);

    // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler
    // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4)

    let u7 = state[0];
    let u6 = state[1];
    let u5 = state[2];
    let u4 = state[3];
    let u3 = state[4];
    let u2 = state[5];
    let u1 = state[6];
    let u0 = state[7];

    let t23 = u0 ^ u3;
    let t8 = u1 ^ t23;
    let m2 = t23 & t8;
    let t4 = u4 ^ t8;
    let t22 = u1 ^ u3;
    let t2 = u0 ^ u1;
    let t1 = u3 ^ u4;
    // t23 -> stack
    let t9 = u7 ^ t1;
    // t8 -> stack
    let m7 = t22 & t9;
    // t9 -> stack
    let t24 = u4 ^ u7;
    // m7 -> stack
    let t10 = t2 ^ t24;
    // u4 -> stack
    let m14 = t2 & t10;
    let r5 = u6 ^ u7;
    // m2 -> stack
    let t3 = t1 ^ r5;
    // t2 -> stack
    let t13 = t2 ^ r5;
    let t19 = t22 ^ r5;
    // t3 -> stack
    let t17 = u2 ^ t19;
    // t4 -> stack
    let t25 = u2 ^ t1;
    let r13 = u1 ^ u6;
    // t25 -> stack
    let t20 = t24 ^ r13;
    // t17 -> stack
    let m9 = t20 & t17;
    // t20 -> stack
    let r17 = u2 ^ u5;
    // t22 -> stack
    let t6 = t22 ^ r17;
    // t13 -> stack
    let m1 = t13 & t6;
    let y5 = u0 ^ r17;
    let m4 = t19 & y5;
    let m5 = m4 ^ m1;
    let m17 = m5 ^ t24;
    let r18 = u5 ^ u6;
    let t27 = t1 ^ r18;
    let t15 = t10 ^ t27;
    // t6 -> stack
    let m11 = t1 & t15;
    let m15 = m14 ^ m11;
    let m21 = m17 ^ m15;
    // t1 -> stack
    // t4 <- stack
    let m12 = t4 & t27;
    let m13 = m12 ^ m11;
    let t14 = t10 ^ r18;
    let m3 = t14 ^ m1;
    // m2 <- stack
    let m16 = m3 ^ m2;
    let m20 = m16 ^ m13;
    // u4 <- stack
    let r19 = u2 ^ u4;
    let t16 = r13 ^ r19;
    // t3 <- stack
    let t26 = t3 ^ t16;
    let m6 = t3 & t16;
    let m8 = t26 ^ m6;
    // t10 -> stack
    // m7 <- stack
    let m18 = m8 ^ m7;
    let m22 = m18 ^ m13;
    let m25 = m22 & m20;
    let m26 = m21 ^ m25;
    let m10 = m9 ^ m6;
    let m19 = m10 ^ m15;
    // t25 <- stack
    let m23 = m19 ^ t25;
    let m28 = m23 ^ m25;
    let m24 = m22 ^ m23;
    let m30 = m26 & m24;
    let m39 = m23 ^ m30;
    let m48 = m39 & y5;
    let m57 = m39 & t19;
    // m48 -> stack
    let m36 = m24 ^ m25;
    let m31 = m20 & m23;
    let m27 = m20 ^ m21;
    let m32 = m27 & m31;
    let m29 = m28 & m27;
    let m37 = m21 ^ m29;
    // m39 -> stack
    let m42 = m37 ^ m39;
    let m52 = m42 & t15;
    // t27 -> stack
    // t1 <- stack
    let m61 = m42 & t1;
    let p0 = m52 ^ m61;
    let p16 = m57 ^ m61;
    // m57 -> stack
    // t20 <- stack
    let m60 = m37 & t20;
    // p16 -> stack
    // t17 <- stack
    let m51 = m37 & t17;
    let m33 = m27 ^ m25;
    let m38 = m32 ^ m33;
    let m43 = m37 ^ m38;
    let m49 = m43 & t16;
    let p6 = m49 ^ m60;
    let p13 = m49 ^ m51;
    let m58 = m43 & t3;
    // t9 <- stack
    let m50 = m38 & t9;
    // t22 <- stack
    let m59 = m38 & t22;
    // p6 -> stack
    let p1 = m58 ^ m59;
    let p7 = p0 ^ p1;
    let m34 = m21 & m22;
    let m35 = m24 & m34;
    let m40 = m35 ^ m36;
    let m41 = m38 ^ m40;
    let m45 = m42 ^ m41;
    // t27 <- stack
    let m53 = m45 & t27;
    let p8 = m50 ^ m53;
    let p23 = p7 ^ p8;
    // t4 <- stack
    let m62 = m45 & t4;
    let p14 = m49 ^ m62;
    let s6 = p14 ^ p23;
    // t10 <- stack
    let m54 = m41 & t10;
    let p2 = m54 ^ m62;
    let p22 = p2 ^ p7;
    let s0 = p13 ^ p22;
    let p17 = m58 ^ p2;
    let p15 = m54 ^ m59;
    // t2 <- stack
    let m63 = m41 & t2;
    // m39 <- stack
    let m44 = m39 ^ m40;
    // p17 -> stack
    // t6 <- stack
    let m46 = m44 & t6;
    let p5 = m46 ^ m51;
    // p23 -> stack
    let p18 = m63 ^ p5;
    let p24 = p5 ^ p7;
    // m48 <- stack
    let p12 = m46 ^ m48;
    let s3 = p12 ^ p22;
    // t13 <- stack
    let m55 = m44 & t13;
    let p9 = m55 ^ m63;
    // p16 <- stack
    let s7 = p9 ^ p16;
    // t8 <- stack
    let m47 = m40 & t8;
    let p3 = m47 ^ m50;
    let p19 = p2 ^ p3;
    let s5 = p19 ^ p24;
    let p11 = p0 ^ p3;
    let p26 = p9 ^ p11;
    // t23 <- stack
    let m56 = m40 & t23;
    let p4 = m48 ^ m56;
    // p6 <- stack
    let p20 = p4 ^ p6;
    let p29 = p15 ^ p20;
    let s1 = p26 ^ p29;
    // m57 <- stack
    let p10 = m57 ^ p4;
    let p27 = p10 ^ p18;
    // p23 <- stack
    let s4 = p23 ^ p27;
    let p25 = p6 ^ p10;
    let p28 = p11 ^ p25;
    // p17 <- stack
    let s2 = p17 ^ p28;

    state[0] = s7;
    state[1] = s6;
    state[2] = s5;
    state[3] = s4;
    state[4] = s3;
    state[5] = s2;
    state[6] = s1;
    state[7] = s0;
}

/// Bitsliced implementation of the AES S-box, based on Boyar, Peralta and Calik.
///
/// See: <http://www.cs.yale.edu/homes/peralta/CircuitStuff/SLP_AES_113.txt>
///
/// Note that the four bitwise NOTs (`^= 0xffffffff`) are moved to the key schedule.
fn sub_bytes(state: &mut [u32]) {
    debug_assert_eq!(state.len(), 8);

    // Scheduled using https://github.com/Ko-/aes-armcortexm/tree/public/scheduler
    // Inline "stack" comments reflect suggested stores and loads (ARM Cortex-M3 and M4)

    let u7 = state[0];
    let u6 = state[1];
    let u5 = state[2];
    let u4 = state[3];
    let u3 = state[4];
    let u2 = state[5];
    let u1 = state[6];
    let u0 = state[7];

    let y14 = u3 ^ u5;
    let y13 = u0 ^ u6;
    let y12 = y13 ^ y14;
    let t1 = u4 ^ y12;
    let y15 = t1 ^ u5;
    let t2 = y12 & y15;
    let y6 = y15 ^ u7;
    let y20 = t1 ^ u1;
    // y12 -> stack
    let y9 = u0 ^ u3;
    // y20 -> stack
    let y11 = y20 ^ y9;
    // y9 -> stack
    let t12 = y9 & y11;
    // y6 -> stack
    let y7 = u7 ^ y11;
    let y8 = u0 ^ u5;
    let t0 = u1 ^ u2;
    let y10 = y15 ^ t0;
    // y15 -> stack
    let y17 = y10 ^ y11;
    // y14 -> stack
    let t13 = y14 & y17;
    let t14 = t13 ^ t12;
    // y17 -> stack
    let y19 = y10 ^ y8;
    // y10 -> stack
    let t15 = y8 & y10;
    let t16 = t15 ^ t12;
    let y16 = t0 ^ y11;
    // y11 -> stack
    let y21 = y13 ^ y16;
    // y13 -> stack
    let t7 = y13 & y16;
    // y16 -> stack
    let y18 = u0 ^ y16;
    let y1 = t0 ^ u7;
    let y4 = y1 ^ u3;
    // u7 -> stack
    let t5 = y4 & u7;
    let t6 = t5 ^ t2;
    let t18 = t6 ^ t16;
    let t22 = t18 ^ y19;
    let y2 = y1 ^ u0;
    let t10 = y2 & y7;
    let t11 = t10 ^ t7;
    let t20 = t11 ^ t16;
    let t24 = t20 ^ y18;
    let y5 = y1 ^ u6;
    let t8 = y5 & y1;
    let t9 = t8 ^ t7;
    let t19 = t9 ^ t14;
    let t23 = t19 ^ y21;
    let y3 = y5 ^ y8;
    // y6 <- stack
    let t3 = y3 & y6;
    let t4 = t3 ^ t2;
    // y20 <- stack
    let t17 = t4 ^ y20;
    let t21 = t17 ^ t14;
    let t26 = t21 & t23;
    let t27 = t24 ^ t26;
    let t31 = t22 ^ t26;
    let t25 = t21 ^ t22;
    // y4 -> stack
    let t28 = t25 & t27;
    let t29 = t28 ^ t22;
    let z14 = t29 & y2;
    let z5 = t29 & y7;
    let t30 = t23 ^ t24;
    let t32 = t31 & t30;
    let t33 = t32 ^ t24;
    let t35 = t27 ^ t33;
    let t36 = t24 & t35;
    let t38 = t27 ^ t36;
    let t39 = t29 & t38;
    let t40 = t25 ^ t39;
    let t43 = t29 ^ t40;
    // y16 <- stack
    let z3 = t43 & y16;
    let tc12 = z3 ^ z5;
    // tc12 -> stack
    // y13 <- stack
    let z12 = t43 & y13;
    let z13 = t40 & y5;
    let z4 = t40 & y1;
    let tc6 = z3 ^ z4;
    let t34 = t23 ^ t33;
    let t37 = t36 ^ t34;
    let t41 = t40 ^ t37;
    // y10 <- stack
    let z8 = t41 & y10;
    let z17 = t41 & y8;
    let t44 = t33 ^ t37;
    // y15 <- stack
    let z0 = t44 & y15;
    // z17 -> stack
    // y12 <- stack
    let z9 = t44 & y12;
    let z10 = t37 & y3;
    let z1 = t37 & y6;
    let tc5 = z1 ^ z0;
    let tc11 = tc6 ^ tc5;
    // y4 <- stack
    let z11 = t33 & y4;
    let t42 = t29 ^ t33;
    let t45 = t42 ^ t41;
    // y17 <- stack
    let z7 = t45 & y17;
    let tc8 = z7 ^ tc6;
    // y14 <- stack
    let z16 = t45 & y14;
    // y11 <- stack
    let z6 = t42 & y11;
    let tc16 = z6 ^ tc8;
    // z14 -> stack
    // y9 <- stack
    let z15 = t42 & y9;
    let tc20 = z15 ^ tc16;
    let tc1 = z15 ^ z16;
    let tc2 = z10 ^ tc1;
    let tc21 = tc2 ^ z11;
    let tc3 = z9 ^ tc2;
    let s0 = tc3 ^ tc16;
    let s3 = tc3 ^ tc11;
    let s1 = s3 ^ tc16;
    let tc13 = z13 ^ tc1;
    // u7 <- stack
    let z2 = t33 & u7;
    let tc4 = z0 ^ z2;
    let tc7 = z12 ^ tc4;
    let tc9 = z8 ^ tc7;
    let tc10 = tc8 ^ tc9;
    // z14 <- stack
    let tc17 = z14 ^ tc10;
    let s5 = tc21 ^ tc17;
    let tc26 = tc17 ^ tc20;
    // z17 <- stack
    let s2 = tc26 ^ z17;
    // tc12 <- stack
    let tc14 = tc4 ^ tc12;
    let tc18 = tc13 ^ tc14;
    let s6 = tc10 ^ tc18;
    let s7 = z12 ^ tc18;
    let s4 = tc14 ^ s3;

    state[0] = s7;
    state[1] = s6;
    state[2] = s5;
    state[3] = s4;
    state[4] = s3;
    state[5] = s2;
    state[6] = s1;
    state[7] = s0;
}

/// NOT operations that are omitted in the S-box (see `sub_bytes`).
#[inline]
fn sub_bytes_nots(state: &mut [u32]) {
    debug_assert_eq!(state.len(), 8);
    state[0] ^= 0xffffffff;
    state[1] ^= 0xffffffff;
    state[5] ^= 0xffffffff;
    state[6] ^= 0xffffffff;
}

/// Computation of the MixColumns transformation in the fixsliced representation, with different
/// rotations used according to the round number mod 4.
///
/// Based on Käsper-Schwabe, similar to <https://github.com/Ko-/aes-armcortexm>.
macro_rules! define_mix_columns {
    (
        $name:ident,
        $name_inv:ident,
        $first_rotate:path,
        $second_rotate:path
    ) => {
        #[rustfmt::skip]
        fn $name(state: &mut State) {
            let (a0, a1, a2, a3, a4, a5, a6, a7) = (
                state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7]
            );
            let (b0, b1, b2, b3, b4, b5, b6, b7) = (
                $first_rotate(a0),
                $first_rotate(a1),
                $first_rotate(a2),
                $first_rotate(a3),
                $first_rotate(a4),
                $first_rotate(a5),
                $first_rotate(a6),
                $first_rotate(a7),
            );
            let (c0, c1, c2, c3, c4, c5, c6, c7) = (
                a0 ^ b0,
                a1 ^ b1,
                a2 ^ b2,
                a3 ^ b3,
                a4 ^ b4,
                a5 ^ b5,
                a6 ^ b6,
                a7 ^ b7,
            );
            state[0] = b0      ^ c7 ^ $second_rotate(c0);
            state[1] = b1 ^ c0 ^ c7 ^ $second_rotate(c1);
            state[2] = b2 ^ c1      ^ $second_rotate(c2);
            state[3] = b3 ^ c2 ^ c7 ^ $second_rotate(c3);
            state[4] = b4 ^ c3 ^ c7 ^ $second_rotate(c4);
            state[5] = b5 ^ c4      ^ $second_rotate(c5);
            state[6] = b6 ^ c5      ^ $second_rotate(c6);
            state[7] = b7 ^ c6      ^ $second_rotate(c7);
        }

        #[rustfmt::skip]
        fn $name_inv(state: &mut State) {
            let (a0, a1, a2, a3, a4, a5, a6, a7) = (
                state[0], state[1], state[2], state[3], state[4], state[5], state[6], state[7]
            );
            let (b0, b1, b2, b3, b4, b5, b6, b7) = (
                $first_rotate(a0),
                $first_rotate(a1),
                $first_rotate(a2),
                $first_rotate(a3),
                $first_rotate(a4),
                $first_rotate(a5),
                $first_rotate(a6),
                $first_rotate(a7),
            );
            let (c0, c1, c2, c3, c4, c5, c6, c7) = (
                a0 ^ b0,
                a1 ^ b1,
                a2 ^ b2,
                a3 ^ b3,
                a4 ^ b4,
                a5 ^ b5,
                a6 ^ b6,
                a7 ^ b7,
            );
            let (d0, d1, d2, d3, d4, d5, d6, d7) = (
                a0      ^ c7,
                a1 ^ c0 ^ c7,
                a2 ^ c1,
                a3 ^ c2 ^ c7,
                a4 ^ c3 ^ c7,
                a5 ^ c4,
                a6 ^ c5,
                a7 ^ c6,
            );
            let (e0, e1, e2, e3, e4, e5, e6, e7) = (
                c0      ^ d6,
                c1      ^ d6 ^ d7,
                c2 ^ d0      ^ d7,
                c3 ^ d1 ^ d6,
                c4 ^ d2 ^ d6 ^ d7,
                c5 ^ d3      ^ d7,
                c6 ^ d4,
                c7 ^ d5,
            );
            state[0] = d0 ^ e0 ^ $second_rotate(e0);
            state[1] = d1 ^ e1 ^ $second_rotate(e1);
            state[2] = d2 ^ e2 ^ $second_rotate(e2);
            state[3] = d3 ^ e3 ^ $second_rotate(e3);
            state[4] = d4 ^ e4 ^ $second_rotate(e4);
            state[5] = d5 ^ e5 ^ $second_rotate(e5);
            state[6] = d6 ^ e6 ^ $second_rotate(e6);
            state[7] = d7 ^ e7 ^ $second_rotate(e7);
        }
    }
}

define_mix_columns!(
    mix_columns_0,
    inv_mix_columns_0,
    rotate_rows_1,
    rotate_rows_2
);

define_mix_columns!(
    mix_columns_1,
    inv_mix_columns_1,
    rotate_rows_and_columns_1_1,
    rotate_rows_and_columns_2_2
);

#[cfg(not(feature = "compact"))]
define_mix_columns!(
    mix_columns_2,
    inv_mix_columns_2,
    rotate_rows_and_columns_1_2,
    rotate_rows_2
);

#[cfg(not(feature = "compact"))]
define_mix_columns!(
    mix_columns_3,
    inv_mix_columns_3,
    rotate_rows_and_columns_1_3,
    rotate_rows_and_columns_2_2
);
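
// Round `i` of the fully-fixsliced code uses `mix_columns_{i mod 4}`. With the
// `compact` feature only variants 0 and 1 are compiled; the explicit
// `shift_rows_2` calls in the encryption/decryption loops compensate for the
// rotations the missing variants would otherwise absorb.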

#[inline]
fn delta_swap_1(a: &mut u32, shift: u32, mask: u32) {
    let t = (*a ^ ((*a) >> shift)) & mask;
    *a ^= t ^ (t << shift);
}

#[inline]
fn delta_swap_2(a: &mut u32, b: &mut u32, shift: u32, mask: u32) {
    let t = (*a ^ ((*b) >> shift)) & mask;
    *a ^= t;
    *b ^= t << shift;
}
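
// A delta swap exchanges the bits selected by `mask` with the bits `shift`
// positions to their left. A worked example (illustrative, not part of the
// original code):
//
//     let mut x = 0x000000abu32;
//     delta_swap_1(&mut x, 4, 0x0000000f);
//     assert_eq!(x, 0x000000ba); // the two nibbles of the low byte swapped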

/// Applies ShiftRows once on an AES state (or key).
#[cfg(any(not(feature = "compact"), feature = "hazmat"))]
#[inline]
fn shift_rows_1(state: &mut [u32]) {
    debug_assert_eq!(state.len(), 8);
    for x in state.iter_mut() {
        delta_swap_1(x, 4, 0x0c0f0300);
        delta_swap_1(x, 2, 0x33003300);
    }
}

/// Applies ShiftRows twice on an AES state (or key).
#[inline]
fn shift_rows_2(state: &mut [u32]) {
    debug_assert_eq!(state.len(), 8);
    for x in state.iter_mut() {
        delta_swap_1(x, 4, 0x0f000f00);
    }
}

/// Applies ShiftRows three times on an AES state (or key).
#[inline]
fn shift_rows_3(state: &mut [u32]) {
    debug_assert_eq!(state.len(), 8);
    for x in state.iter_mut() {
        delta_swap_1(x, 4, 0x030f0c00);
        delta_swap_1(x, 2, 0x33003300);
    }
}

#[inline(always)]
fn inv_shift_rows_1(state: &mut [u32]) {
    shift_rows_3(state);
}

#[inline(always)]
fn inv_shift_rows_2(state: &mut [u32]) {
    shift_rows_2(state);
}

#[cfg(not(feature = "compact"))]
#[inline(always)]
fn inv_shift_rows_3(state: &mut [u32]) {
    shift_rows_1(state);
}

/// XOR the columns after the S-box during the key schedule round function.
///
/// The `idx_xor` parameter refers to the index of the previous round key that is
/// involved in the XOR computation (should be 8 and 16 for AES-128 and AES-256,
/// respectively).
///
/// The `idx_ror` parameter refers to the rotation value, which varies between the
/// different key schedules.
fn xor_columns(rkeys: &mut [u32], offset: usize, idx_xor: usize, idx_ror: u32) {
    for i in 0..8 {
        let off_i = offset + i;
        let rk = rkeys[off_i - idx_xor] ^ (0x03030303 & ror(rkeys[off_i], idx_ror));
        rkeys[off_i] =
            rk ^ (0xfcfcfcfc & (rk << 2)) ^ (0xf0f0f0f0 & (rk << 4)) ^ (0xc0c0c0c0 & (rk << 6));
    }
}
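
// In the bitsliced layout each byte of a word holds one row (four columns x
// two blocks, with the column index at stride 2), so `rk << 2` propagates each
// column into the next one. The masked-shift cascade above therefore computes
// a per-row prefix XOR: output column `j` is the XOR of input columns `0..=j`,
// which applies the key schedule's `w[i] = w[i-1] ^ w[i-Nk]` recursion to all
// four columns of a round key at once.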

/// Bitslice two 128-bit input blocks `input0`, `input1` into a 256-bit internal state.
fn bitslice(output: &mut [u32], input0: &[u8], input1: &[u8]) {
    debug_assert_eq!(output.len(), 8);
    debug_assert_eq!(input0.len(), 16);
    debug_assert_eq!(input1.len(), 16);

    // Bitslicing is a bit index manipulation. 256 bits of data means each bit is positioned at an
    // 8-bit index. AES data is 2 blocks, each one a 4x4 column-major matrix of bytes, so the
    // index is initially ([b]lock, [c]olumn, [r]ow, [p]osition):
    //     b0 c1 c0 r1 r0 p2 p1 p0
    //
    // The desired bitsliced data groups first by bit position, then row, column, block:
    //     p2 p1 p0 r1 r0 c1 c0 b0

    // Interleave the columns on input (note the order of input)
    //     b0 c1 c0 __ __ __ __ __ => c1 c0 b0 __ __ __ __ __
    let mut t0 = u32::from_le_bytes(input0[0x00..0x04].try_into().unwrap());
    let mut t2 = u32::from_le_bytes(input0[0x04..0x08].try_into().unwrap());
    let mut t4 = u32::from_le_bytes(input0[0x08..0x0c].try_into().unwrap());
    let mut t6 = u32::from_le_bytes(input0[0x0c..0x10].try_into().unwrap());
    let mut t1 = u32::from_le_bytes(input1[0x00..0x04].try_into().unwrap());
    let mut t3 = u32::from_le_bytes(input1[0x04..0x08].try_into().unwrap());
    let mut t5 = u32::from_le_bytes(input1[0x08..0x0c].try_into().unwrap());
    let mut t7 = u32::from_le_bytes(input1[0x0c..0x10].try_into().unwrap());

    // Bit Index Swap 5 <-> 0:
    //     __ __ b0 __ __ __ __ p0 => __ __ p0 __ __ __ __ b0
    let m0 = 0x55555555;
    delta_swap_2(&mut t1, &mut t0, 1, m0);
    delta_swap_2(&mut t3, &mut t2, 1, m0);
    delta_swap_2(&mut t5, &mut t4, 1, m0);
    delta_swap_2(&mut t7, &mut t6, 1, m0);

    // Bit Index Swap 6 <-> 1:
    //     __ c0 __ __ __ __ p1 __ => __ p1 __ __ __ __ c0 __
    let m1 = 0x33333333;
    delta_swap_2(&mut t2, &mut t0, 2, m1);
    delta_swap_2(&mut t3, &mut t1, 2, m1);
    delta_swap_2(&mut t6, &mut t4, 2, m1);
    delta_swap_2(&mut t7, &mut t5, 2, m1);

    // Bit Index Swap 7 <-> 2:
    //     c1 __ __ __ __ p2 __ __ => p2 __ __ __ __ c1 __ __
    let m2 = 0x0f0f0f0f;
    delta_swap_2(&mut t4, &mut t0, 4, m2);
    delta_swap_2(&mut t5, &mut t1, 4, m2);
    delta_swap_2(&mut t6, &mut t2, 4, m2);
    delta_swap_2(&mut t7, &mut t3, 4, m2);

    // Final bitsliced bit index, as desired:
    //     p2 p1 p0 r1 r0 c1 c0 b0
    output[0] = t0;
    output[1] = t1;
    output[2] = t2;
    output[3] = t3;
    output[4] = t4;
    output[5] = t5;
    output[6] = t6;
    output[7] = t7;
}

/// Un-bitslice a 256-bit internal state into two 128-bit blocks of output.
fn inv_bitslice(input: &[u32], output: &mut [Block]) {
    debug_assert_eq!(input.len(), 8);
    debug_assert_eq!(output.len(), 2);

    // Unbitslicing is a bit index manipulation. 256 bits of data means each bit is positioned at
    // an 8-bit index. AES data is 2 blocks, each one a 4x4 column-major matrix of bytes, so the
    // desired index for the output is ([b]lock, [c]olumn, [r]ow, [p]osition):
    //     b0 c1 c0 r1 r0 p2 p1 p0
    //
    // The initially bitsliced data groups first by bit position, then row, column, block:
    //     p2 p1 p0 r1 r0 c1 c0 b0

    let mut t0 = input[0];
    let mut t1 = input[1];
    let mut t2 = input[2];
    let mut t3 = input[3];
    let mut t4 = input[4];
    let mut t5 = input[5];
    let mut t6 = input[6];
    let mut t7 = input[7];

    // TODO: these bit index swaps are identical to those in `bitslice`

    // Bit Index Swap 5 <-> 0:
    //     __ __ p0 __ __ __ __ b0 => __ __ b0 __ __ __ __ p0
    let m0 = 0x55555555;
    delta_swap_2(&mut t1, &mut t0, 1, m0);
    delta_swap_2(&mut t3, &mut t2, 1, m0);
    delta_swap_2(&mut t5, &mut t4, 1, m0);
    delta_swap_2(&mut t7, &mut t6, 1, m0);

    // Bit Index Swap 6 <-> 1:
    //     __ p1 __ __ __ __ c0 __ => __ c0 __ __ __ __ p1 __
    let m1 = 0x33333333;
    delta_swap_2(&mut t2, &mut t0, 2, m1);
    delta_swap_2(&mut t3, &mut t1, 2, m1);
    delta_swap_2(&mut t6, &mut t4, 2, m1);
    delta_swap_2(&mut t7, &mut t5, 2, m1);

    // Bit Index Swap 7 <-> 2:
    //     p2 __ __ __ __ c1 __ __ => c1 __ __ __ __ p2 __ __
    let m2 = 0x0f0f0f0f;
    delta_swap_2(&mut t4, &mut t0, 4, m2);
    delta_swap_2(&mut t5, &mut t1, 4, m2);
    delta_swap_2(&mut t6, &mut t2, 4, m2);
    delta_swap_2(&mut t7, &mut t3, 4, m2);

    // De-interleave the columns on output (note the order of output)
    //     c1 c0 b0 __ __ __ __ __ => b0 c1 c0 __ __ __ __ __
    output[0][0x00..0x04].copy_from_slice(&t0.to_le_bytes());
    output[0][0x04..0x08].copy_from_slice(&t2.to_le_bytes());
    output[0][0x08..0x0c].copy_from_slice(&t4.to_le_bytes());
    output[0][0x0c..0x10].copy_from_slice(&t6.to_le_bytes());
    output[1][0x00..0x04].copy_from_slice(&t1.to_le_bytes());
    output[1][0x04..0x08].copy_from_slice(&t3.to_le_bytes());
    output[1][0x08..0x0c].copy_from_slice(&t5.to_le_bytes());
    output[1][0x0c..0x10].copy_from_slice(&t7.to_le_bytes());

    // Final AES bit index, as desired:
    //     b0 c1 c0 r1 r0 p2 p1 p0
}

/// Copy 32 bytes (eight `u32` words) within the provided slice to a position
/// eight words (32 bytes) later.
fn memshift32(buffer: &mut [u32], src_offset: usize) {
    debug_assert_eq!(src_offset % 8, 0);

    let dst_offset = src_offset + 8;
    debug_assert!(dst_offset + 8 <= buffer.len());

    for i in (0..8).rev() {
        buffer[dst_offset + i] = buffer[src_offset + i];
    }
}

/// XOR the round key into the internal state. The round keys are expected to be
/// pre-computed and to be packed in the fixsliced representation.
#[inline]
fn add_round_key(state: &mut State, rkey: &[u32]) {
    debug_assert_eq!(rkey.len(), 8);
    for (a, b) in state.iter_mut().zip(rkey) {
        *a ^= b;
    }
}

#[inline(always)]
fn add_round_constant_bit(state: &mut [u32], bit: usize) {
    state[bit] ^= 0x0000c000;
}
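
// `0x0000c000` sets bit positions 14 and 15 of the selected bit-plane. Under
// the bitsliced index `(row << 3) | (column << 1) | block` these decode to
// row 1, column 3 for both block slots, which is where the round constant
// lands in this representation.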

#[inline(always)]
fn ror(x: u32, y: u32) -> u32 {
    x.rotate_right(y)
}

#[inline(always)]
fn ror_distance(rows: u32, cols: u32) -> u32 {
    (rows << 3) + (cols << 1)
}
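
// Within a bitsliced word the bit index is `(row << 3) | (column << 1) | block`
// (see `bitslice`), so a rotation by `(rows << 3) + (cols << 1)` bits moves the
// state by `rows` rows and `cols` columns at once.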

#[inline(always)]
fn rotate_rows_1(x: u32) -> u32 {
    ror(x, ror_distance(1, 0))
}

#[inline(always)]
fn rotate_rows_2(x: u32) -> u32 {
    ror(x, ror_distance(2, 0))
}

#[inline(always)]
#[rustfmt::skip]
fn rotate_rows_and_columns_1_1(x: u32) -> u32 {
    (ror(x, ror_distance(1, 1)) & 0x3f3f3f3f) |
    (ror(x, ror_distance(0, 1)) & 0xc0c0c0c0)
}

#[cfg(not(feature = "compact"))]
#[inline(always)]
#[rustfmt::skip]
fn rotate_rows_and_columns_1_2(x: u32) -> u32 {
    (ror(x, ror_distance(1, 2)) & 0x0f0f0f0f) |
    (ror(x, ror_distance(0, 2)) & 0xf0f0f0f0)
}

#[cfg(not(feature = "compact"))]
#[inline(always)]
#[rustfmt::skip]
fn rotate_rows_and_columns_1_3(x: u32) -> u32 {
    (ror(x, ror_distance(1, 3)) & 0x03030303) |
    (ror(x, ror_distance(0, 3)) & 0xfcfcfcfc)
}

#[inline(always)]
#[rustfmt::skip]
fn rotate_rows_and_columns_2_2(x: u32) -> u32 {
    (ror(x, ror_distance(2, 2)) & 0x0f0f0f0f) |
    (ror(x, ror_distance(1, 2)) & 0xf0f0f0f0)
}

/// Low-level "hazmat" AES functions.
///
/// Note: this isn't actually used in the `Aes128`/`Aes192`/`Aes256`
/// implementations in this crate, but instead provides raw access to
/// the AES round function gated under the `hazmat` crate feature.
#[cfg(feature = "hazmat")]
pub(crate) mod hazmat {
    use super::{
        bitslice, inv_bitslice, inv_mix_columns_0, inv_shift_rows_1, inv_sub_bytes, mix_columns_0,
        shift_rows_1, sub_bytes, sub_bytes_nots, State,
    };
    use crate::{Block, ParBlocks};

    /// XOR the `src` block into the `dst` block in-place.
    fn xor_in_place(dst: &mut Block, src: &Block) {
        for (a, b) in dst.iter_mut().zip(src.as_slice()) {
            *a ^= *b;
        }
    }

    /// Perform a bitslice operation, loading a single block.
    fn bitslice_block(block: &Block) -> State {
        let mut state = State::default();
        bitslice(&mut state, block, block);
        state
    }

    /// Perform an inverse bitslice operation, extracting a single block.
    fn inv_bitslice_block(block: &mut Block, state: &State) {
        let mut out = [Block::default(); 2];
        inv_bitslice(state, &mut out);
        block.copy_from_slice(&out[0]);
    }

    /// AES cipher (encrypt) round function.
    #[inline]
    pub(crate) fn cipher_round(block: &mut Block, round_key: &Block) {
        let mut state = bitslice_block(block);
        sub_bytes(&mut state);
        sub_bytes_nots(&mut state);
        shift_rows_1(&mut state);
        mix_columns_0(&mut state);
        inv_bitslice_block(block, &state);
        xor_in_place(block, round_key);
    }

    /// AES cipher (encrypt) round function: parallel version.
    #[inline]
    pub(crate) fn cipher_round_par(blocks: &mut ParBlocks, round_keys: &ParBlocks) {
        for (chunk, keys) in blocks.chunks_exact_mut(2).zip(round_keys.chunks_exact(2)) {
            let mut state = State::default();
            bitslice(&mut state, &chunk[0], &chunk[1]);
            sub_bytes(&mut state);
            sub_bytes_nots(&mut state);
            shift_rows_1(&mut state);
            mix_columns_0(&mut state);
            inv_bitslice(&state, chunk);

            for i in 0..2 {
                xor_in_place(&mut chunk[i], &keys[i]);
            }
        }
    }

    /// AES equivalent inverse cipher (decrypt) round function.
    #[inline]
    pub(crate) fn equiv_inv_cipher_round(block: &mut Block, round_key: &Block) {
        let mut state = bitslice_block(block);
        sub_bytes_nots(&mut state);
        inv_sub_bytes(&mut state);
        inv_shift_rows_1(&mut state);
        inv_mix_columns_0(&mut state);
        inv_bitslice_block(block, &state);
        xor_in_place(block, round_key);
    }

    /// AES equivalent inverse cipher (decrypt) round function: parallel version.
    #[inline]
    pub(crate) fn equiv_inv_cipher_round_par(blocks: &mut ParBlocks, round_keys: &ParBlocks) {
        for (chunk, keys) in blocks.chunks_exact_mut(2).zip(round_keys.chunks_exact(2)) {
            let mut state = State::default();
            bitslice(&mut state, &chunk[0], &chunk[1]);
            sub_bytes_nots(&mut state);
            inv_sub_bytes(&mut state);
            inv_shift_rows_1(&mut state);
            inv_mix_columns_0(&mut state);
            inv_bitslice(&state, chunk);

            for i in 0..2 {
                xor_in_place(&mut chunk[i], &keys[i]);
            }
        }
    }

    /// AES mix columns function.
    #[inline]
    pub(crate) fn mix_columns(block: &mut Block) {
        let mut state = bitslice_block(block);
        mix_columns_0(&mut state);
        inv_bitslice_block(block, &state);
    }

    /// AES inverse mix columns function.
    #[inline]
    pub(crate) fn inv_mix_columns(block: &mut Block) {
        let mut state = bitslice_block(block);
        inv_mix_columns_0(&mut state);
        inv_bitslice_block(block, &state);
    }
}
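
// A minimal test sketch, added for illustration (not part of the original C
// code): it checks the FIPS-197 Appendix C.1 known-answer vector for AES-128,
// plus encrypt/decrypt and bitslicing round-trip properties for the other
// key sizes and helpers.
#[cfg(test)]
mod tests {
    use super::*;

    /// FIPS-197 Appendix C.1: AES-128 known-answer test, run over both block
    /// slots of the fixsliced batch, followed by a decryption round trip.
    #[test]
    fn aes128_fips197_kat() {
        let key = GenericArray::clone_from_slice(&[
            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
            0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
        ]);
        let plaintext = [
            0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77,
            0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff,
        ];
        let expected = [
            0x69, 0xc4, 0xe0, 0xd8, 0x6a, 0x7b, 0x04, 0x30,
            0xd8, 0xcd, 0xb7, 0x80, 0x70, 0xb4, 0xc5, 0x5a,
        ];

        let rkeys = aes128_key_schedule(&key);
        let mut blocks = [Block::clone_from_slice(&plaintext); FIXSLICE_BLOCKS];

        aes128_encrypt(&rkeys, &mut blocks);
        for block in &blocks {
            assert_eq!(block.as_slice(), &expected[..]);
        }

        aes128_decrypt(&rkeys, &mut blocks);
        for block in &blocks {
            assert_eq!(block.as_slice(), &plaintext[..]);
        }
    }

    /// Encrypt-then-decrypt round trips for the two larger key sizes.
    #[test]
    fn aes192_aes256_round_trip() {
        let pt = Block::clone_from_slice(&[0x42u8; 16]);

        let rk192 = aes192_key_schedule(GenericArray::from_slice(&[0x24u8; 24]));
        let mut blocks = [pt; FIXSLICE_BLOCKS];
        aes192_encrypt(&rk192, &mut blocks);
        aes192_decrypt(&rk192, &mut blocks);
        assert!(blocks.iter().all(|b| *b == pt));

        let rk256 = aes256_key_schedule(GenericArray::from_slice(&[0x32u8; 32]));
        let mut blocks = [pt; FIXSLICE_BLOCKS];
        aes256_encrypt(&rk256, &mut blocks);
        aes256_decrypt(&rk256, &mut blocks);
        assert!(blocks.iter().all(|b| *b == pt));
    }

    /// `bitslice`/`inv_bitslice` and `sub_bytes`/`inv_sub_bytes` are inverses.
    #[test]
    fn bitslice_and_sbox_round_trip() {
        let b0 = Block::clone_from_slice(&[0x01u8; 16]);
        let b1 = Block::clone_from_slice(&[0x80u8; 16]);

        let mut state = State::default();
        bitslice(&mut state, &b0, &b1);

        // `inv_sub_bytes` accounts for the NOTs omitted from `sub_bytes`,
        // so the composition is the identity.
        let mut sboxed = state;
        sub_bytes(&mut sboxed);
        inv_sub_bytes(&mut sboxed);
        assert_eq!(sboxed, state);

        let mut out = [Block::default(); FIXSLICE_BLOCKS];
        inv_bitslice(&state, &mut out);
        assert_eq!(out[0], b0);
        assert_eq!(out[1], b1);
    }
}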