// Amnesia Security AES Implementation
// Based on code (c) Jari Ruusu 2004 and (c) Dr. Brian Gladman 2001
// Modifications (c) Patrick Simmons 2010

//This is designed to work with loop-aes

// void aes_set_key(aes_context *cx, const unsigned char key[], const int key_len, const int f)
// void aes_encrypt(const aes_context *cx, const unsigned char in_blk[], unsigned char out_blk[])
// void aes_decrypt(const aes_context *cx, const unsigned char in_blk[], unsigned char out_blk[])

#if defined(USE_UNDERLINE)
# define aes_set_key _aes_set_key
# define aes_encrypt _aes_encrypt
# define aes_decrypt _aes_decrypt
#endif
#if !defined(ALIGN64BYTES)
# define ALIGN64BYTES 64
#endif

	.file	"aes-amnesia.S"
	.globl	aes_set_key
	.globl	aes_encrypt
	.globl	aes_decrypt

.section .rodata

copyright:
	.ascii "    \000"
	.ascii "Copyright (c) 2001, Dr Brian Gladman <brg@gladman.uk.net>, Worcester, UK.\000"
	.ascii "All rights reserved.\000"
	.ascii "    \000"
	.ascii "TERMS\000"
	.ascii "    \000"
	.ascii " Redistribution and use in source and binary forms, with or without\000"
	.ascii " modification, are permitted subject to the following conditions:\000"
	.ascii "    \000"
	.ascii " 1. Redistributions of source code must retain the above copyright\000"
	.ascii "    notice, this list of conditions and the following disclaimer.\000"
	.ascii "    \000"
	.ascii " 2. Redistributions in binary form must reproduce the above copyright\000"
	.ascii "    notice, this list of conditions and the following disclaimer in the\000"
	.ascii "    documentation and/or other materials provided with the distribution.\000"
	.ascii "    \000"
	.ascii " 3. The copyright holder's name must not be used to endorse or promote\000"
	.ascii "    any products derived from this software without his specific prior\000"
	.ascii "    written permission.\000"
	.ascii "    \000"
	.ascii " This software is provided 'as is' with no express or implied warranties\000"
	.ascii " of correctness or fitness for purpose.\000"
	.ascii "    \000"

#define tlen	1024	// length of each of 4 'xor' arrays (256 32-bit words)

// offsets in context structure

#define nkey	0	// key length, size 4
#define nrnd	4	// number of rounds, size 4
#define ekey	8	// encryption key schedule base address, size 256
#define dkey	264	// decryption key schedule base address, size 256

//#define DEBUG_HARNESS

#ifdef DEBUG_HARNESS
.extern dummy_wrmsr
.extern dummy_rdmsr

// harness_wrmsr: DEBUG_HARNESS replacement for the WRMSR instruction.
// In:  ECX = MSR index, EDX:EAX = value (the register contract of WRMSR).
// The code after the first RET would save every caller-saved integer
// register, pass (ECX, EDX, EAX) to dummy_wrmsr in EDI, ESI, EDX and restore
// the registers again.
// NOTE: the leading RET makes the routine return immediately, so everything
// after it is currently unreachable and dummy_wrmsr is never called.
// NOTE(review): that looks like a deliberate debugging stub - confirm before
// removing it.
//
// The routine is placed in .text explicitly: the section in effect at this
// point of the file is .rodata, which is not executable when the linker
// separates code from read-only data.
	.pushsection .text
harness_wrmsr:
        RET
	PUSHQ %RAX
	PUSHQ %RCX
	PUSHQ %RDX
	PUSHQ %RSI
	PUSHQ %RDI
	PUSHQ %R8
	PUSHQ %R9
	PUSHQ %R10
	PUSHQ %R11
	MOVL %ECX, %EDI		// arg 1 = MSR index
	MOVL %EDX, %ESI		// arg 2 = high half of the value
	MOVL %EAX, %EDX		// arg 3 = low half of the value
	CALL dummy_wrmsr
	POPQ %R11
	POPQ %R10
	POPQ %R9
	POPQ %R8
	POPQ %RDI
	POPQ %RSI
	POPQ %RDX
	POPQ %RCX
	POPQ %RAX
	RET
	.popsection

// harness_rdmsr: DEBUG_HARNESS replacement for the RDMSR instruction.
// In:  ECX = MSR index.
// Out: EDX:EAX = value (the register contract of RDMSR).
// The code after the first RET would pass ECX to dummy_rdmsr in EDI, split
// the 64-bit result in RAX into EDX (high half) and EAX (low half), and
// preserve RCX, RSI, RDI and R8-R11.
// NOTE: the leading RET makes the routine return immediately, so EAX/EDX are
// left untouched and dummy_rdmsr is never called.
// NOTE(review): that looks like a deliberate debugging stub - confirm before
// removing it.
//
// The routine is placed in .text explicitly: the section in effect at this
// point of the file is .rodata, which is not executable when the linker
// separates code from read-only data.
	.pushsection .text
harness_rdmsr:
        RET
	PUSHQ %RCX
	PUSHQ %RSI
	PUSHQ %RDI
	PUSHQ %R8
	PUSHQ %R9
	PUSHQ %R10
	PUSHQ %R11
	MOVL %ECX, %EDI		// arg 1 = MSR index
	CALL dummy_rdmsr
	MOVQ %RAX, %RDX
	SHRQ $32, %RDX		// EDX = high 32 bits of the result
	ADDL $0, %EAX		// 32-bit write clears the upper half of RAX
	POPQ %R11
	POPQ %R10
	POPQ %R9
	POPQ %R8
	POPQ %RDI
	POPQ %RSI
	POPQ %RCX
	RET
	.popsection
#endif

// fwd_rnd: one forward (encryption) round using four lookup tables of tlen
// bytes each, starting at p1.
//
// Inputs:  I1E..I4E hold the four 32-bit state columns. For columns 1 and 2
//          the low-byte (I1B/I2B) and high-byte (I1H/I2H) names are needed;
//          for columns 3 and 4 the low-byte name (I3B/I4B) and the full
//          64-bit register name (I3R/I4R) are needed.
// Outputs: OU1..OU4 must already contain the round key on entry; the table
//          values are XORed into them, so on exit they hold the new state.
//
// Byte k of each input column indexes table k; the destinations are rotated
// by one column per byte position (byte 0 of column n goes to output n,
// byte 1 of column n+1 goes to output n, and so on).
// The input registers are destroyed: columns 3 and 4 are shifted right by
// 24 bits in total, columns 1 and 2 by 16.
// Directly uses RDI, RSI, R8, and R13 as index temporaries.
// Only C-style comments may be placed inside the body because every line is
// joined by a backslash continuation.

#define fwd_rnd(p1,I1E,I1B,I1H,I2E,I2B,I2H,I3E,I3B,I3R,I4E,I4B,I4R,OU1,OU2,OU3,OU4) \
	movzbl	I1B,%edi		;\
	movzbl	I2B,%esi		;\
	movzbl	I3B,%r8d		;\
	movzbl	I4B,%r13d		;\
	shrl	$8,I3E			;\
	shrl	$8,I4E			;\
	xorl	p1(,%rdi,4),OU1		;\
	xorl	p1(,%rsi,4),OU2		;\
	xorl	p1(,%r8,4),OU3		;\
	xorl	p1(,%r13,4),OU4		;\
	movzbl	I2H,%esi		;\
	movzbl	I3B,%r8d		;\
	movzbl	I4B,%r13d		;\
	movzbl	I1H,%edi		;\
	shrl	$8,I3E			;\
	shrl	$8,I4E			;\
	xorl	p1+tlen(,%rsi,4),OU1	;\
	xorl	p1+tlen(,%r8,4),OU2	;\
	xorl	p1+tlen(,%r13,4),OU3	;\
	xorl	p1+tlen(,%rdi,4),OU4	;\
	shrl	$16,I1E			;\
	shrl	$16,I2E			;\
	movzbl	I3B,%r8d		;\
	movzbl	I4B,%r13d		;\
	movzbl	I1B,%edi		;\
	movzbl	I2B,%esi		;\
	xorl	p1+2*tlen(,%r8,4),OU1	;\
	xorl	p1+2*tlen(,%r13,4),OU2	;\
	xorl	p1+2*tlen(,%rdi,4),OU3	;\
	xorl	p1+2*tlen(,%rsi,4),OU4	;\
	shrl	$8,I4E			;\
	movzbl	I1H,%edi		;\
	movzbl	I2H,%esi		;\
	shrl	$8,I3E			;\
	xorl	p1+3*tlen(,I4R,4),OU1	;\
	xorl	p1+3*tlen(,%rdi,4),OU2	;\
	xorl	p1+3*tlen(,%rsi,4),OU3	;\
	xorl	p1+3*tlen(,I3R,4),OU4

// inv_rnd: one inverse (decryption) round using four lookup tables of tlen
// bytes each, starting at p1. It mirrors fwd_rnd with the column rotation
// running in the opposite direction.
//
// Inputs:  I1E..I4E hold the four 32-bit state columns. For columns 3 and 4
//          the low-byte (I3B/I4B) and high-byte (I3H/I4H) names are needed;
//          for columns 1 and 2 the low-byte name (I1B/I2B) and the full
//          64-bit register name (I1R/I2R) are needed.
// Outputs: OU1..OU4 must already contain the round key on entry; the table
//          values are XORed into them, so on exit they hold the new state.
//
// The input registers are destroyed: columns 1 and 2 are shifted right by
// 24 bits in total, columns 3 and 4 by 16.
// Directly uses RDI, RSI, R8, and R13 as index temporaries.
// Only C-style comments may be placed inside the body because every line is
// joined by a backslash continuation.
	
#define inv_rnd(p1,I1E,I1B,I1R,I2E,I2B,I2R,I3E,I3B,I3H,I4E,I4B,I4H,OU1,OU2,OU3,OU4) \
	movzbl	I4B,%edi		;\
	movzbl	I3B,%esi		;\
	movzbl	I2B,%r8d		;\
	movzbl	I1B,%r13d		;\
	shrl	$8,I2E			;\
	shrl	$8,I1E			;\
	xorl	p1(,%rdi,4),OU4		;\
	xorl	p1(,%rsi,4),OU3		;\
	xorl	p1(,%r8,4),OU2		;\
	xorl	p1(,%r13,4),OU1		;\
	movzbl	I3H,%esi		;\
	movzbl	I2B,%r8d		;\
	movzbl	I1B,%r13d		;\
	movzbl	I4H,%edi		;\
	shrl	$8,I2E			;\
	shrl	$8,I1E			;\
	xorl	p1+tlen(,%rsi,4),OU4	;\
	xorl	p1+tlen(,%r8,4),OU3	;\
	xorl	p1+tlen(,%r13,4),OU2	;\
	xorl	p1+tlen(,%rdi,4),OU1	;\
	shrl	$16,I4E			;\
	shrl	$16,I3E			;\
	movzbl	I2B,%r8d		;\
	movzbl	I1B,%r13d		;\
	movzbl	I4B,%edi		;\
	movzbl	I3B,%esi		;\
	xorl	p1+2*tlen(,%r8,4),OU4	;\
	xorl	p1+2*tlen(,%r13,4),OU3	;\
	xorl	p1+2*tlen(,%rdi,4),OU2	;\
	xorl	p1+2*tlen(,%rsi,4),OU1	;\
	shrl	$8,I1E			;\
	movzbl	I4H,%edi		;\
	movzbl	I3H,%esi		;\
	shrl	$8,I2E			;\
	xorl	p1+3*tlen(,I1R,4),OU4	;\
	xorl	p1+3*tlen(,%rdi,4),OU3	;\
	xorl	p1+3*tlen(,%rsi,4),OU2	;\
	xorl	p1+3*tlen(,I2R,4),OU1

//---------------

	.bss

	.lcomm intel_or_amd, 4

	.text
	.align	ALIGN64BYTES

// mix_col: four-table lookup on one 32-bit word.
//   t1d = p1[b0] ^ (p1+tlen)[b1] ^ (p1+2*tlen)[b2] ^ (p1+3*tlen)[b3]
// where b0..b3 are the bytes of r4d, least significant first.
//
// p1        base label of four consecutive tables of tlen bytes each
// r4d       input word; r4l/r4h are the low-byte and high-byte names of the
//           same register
// t1d       receives the result
// t2d/t2r   32-bit and 64-bit names of one index temporary
//
// Register names are passed WITHOUT the leading '%'.
// Side effect: r4d is left rotated by 16 bits; callers either rotate it
// back or overwrite it.
#define mix_col(p1, r4d, r4l, r4h, t1d, t2d, t2r)			 \
	movzbl	%r4l,%t2d		;\
	movl	p1(,%t2r,4),%t1d	;\
	movzbl	%r4h,%t2d		;\
	ror	$16,%r4d		;\
	xorl	p1+tlen(,%t2r,4),%t1d	;\
	movzbl	%r4l,%t2d		;\
	xorl	p1+2*tlen(,%t2r,4),%t1d	;\
	movzbl	%r4h,%t2d		;\
	xorl	p1+3*tlen(,%t2r,4),%t1d

// Key Schedule Macros

// ksc4: advance a 128-bit round key by one round, in place.
//
// p1             index of the round constant; entry p1 of aes_rcon_tab is
//                XORed in
// r1d..r4d       the four round-key words; updated in place
// r4l, r4h       low-byte and high-byte names of the r4d register
// t1d            temporary, receives the transformed last word
// t2d/t2r        32-bit and 64-bit names of the index temporary
//
// Register names are passed WITHOUT the leading '%'.
// r4d is rotated left by 24, run through mix_col with aes_fl_tab (which
// rotates it by a further 16) and rotated right by 8, so it is back in its
// original orientation before the final XOR chain
//   r1 ^= t1, r2 ^= r1, r3 ^= r2, r4 ^= r3.
#define ksc4(p1, r1d, r2d, r3d, r4d, r4l, r4h, t1d, t2d, t2r)			 \
	rol	$24,%r4d		;\
	mix_col(aes_fl_tab, r4d, r4l, r4h, t1d, t2d, t2r)		;\
	ror	$8,%r4d			;\
	xorl	4*p1+aes_rcon_tab,%t1d	;\
	xorl	%t1d,%r1d		;\
	xorl	%r1d,%r2d		;\
	xorl	%r2d,%r3d		;\
	xorl	%r3d,%r4d

// inv4: the inverse of ksc4. Given the current forward round key in
// R1D..R4D it produces the previous forward round key in the same registers.
// P1 is the round number of the PREVIOUS round (the index into aes_rcon_tab
// that ksc4 used to create the current key).
// To use the result for decryption, you still need to do the inverse mix
// column operation on it.
//
// Parameters (register names WITHOUT the leading '%'):
//   R1D, R1B            word 1 and its low-byte name
//   R2D, R2B            word 2 (R2B is accepted but not used by the body)
//   R3D, R3L, R3H       word 3 with low-byte and high-byte names
//   R4D, R4L, R4H       word 4 with low-byte and high-byte names
//   T1D, T1B            temporary that accumulates the new word 1
//   T2D, T2B, T2R       index temporary (32-bit, byte and 64-bit names)
//   T3D, T3B            second byte temporary
// T3D and T3B must be at least tier 2 registers.
// NOTE(review): "tier 2" presumably means a register usable in the same
// instruction as a high-byte register such as %BH - confirm.
//
// How it works: byte i of the previous word 4 equals R4 byte i ^ R3 byte i.
// Each of the four sections (B1, B4, B3, B2) forms one such byte, looks it
// up in aes_fl_tab, XORs it with the matching byte of R1 and merges it into
// T1D. The merge uses ADDL, which acts as an OR here because T1D was just
// rotated left by 8 and its low byte is zero. R1D, R3D and R4D are rotated
// while this happens and are back in their original orientation before the
// trailing XOR chain runs.
#define inv4(P1, R1D, R1B, R2D, R2B, R3D, R3L, R3H, R4D, R4L, R4H, T1D, T1B, T2D, T2B, T2R, T3D, T3B) \
	MOVZBL %R4H, %T2D /*B1*/; \
	MOVZBL %R3H, %T3D       ; \
	XORB %T3B, %T2B		; \
	MOVL aes_fl_tab(,%T2R,4), %T1D ; \
	XORL 4*P1+aes_rcon_tab, %T1D ; \
	XORB %R1B, %T1B		; \
	ROLL $8, %T1D		; \
	; \
	ROLL $8, %R1D	/*B4*/	; \
	MOVB %R4L, %T2B		; \
	XORB %R3L, %T2B		; \
	MOVL aes_fl_tab(,%T2R,4), %T2D	; \
	XORB %R1B, %T2B		; \
	ADDL %T2D, %T1D		; \
	ROLL $8, %T1D		; \
	; \
	ROLL $8, %R1D	/*B3*/	; \
	ROLL $16, %R3D	        ; \
	ROLL $16, %R4D		; \
	MOVZBL %R4H, %T2D 	; \
	MOVZBL %R3H, %T3D	; \
	XORB %T3B, %T2B		; \
	MOVL aes_fl_tab(,%T2R,4), %T2D	; \
	XORB %R1B, %T2B		; \
	ADDL %T2D, %T1D		; \
	ROLL $8, %T1D		; \
	; \
	ROLL $8, %R1D	/*B2*/	; \
	MOVB %R4L, %T2B		; \
	XORB %R3L, %T2B		; \
	MOVL aes_fl_tab(,%T2R,4), %T2D	; \
	XORB %R1B, %T2B		; \
	ADDL %T2D, %T1D		; \
	ROLL $8, %T1D		; \
	; \
	ROLL $8, %R1D	        ; \
	ROLL $16, %R3D	        ; \
	ROLL $16, %R4D		; \
	/*Take care of the trailing three bytes*/ \
	/*Must be in this order due to dependencies.*/ \
	XORL %R3D, %R4D		; \
	XORL %R2D, %R3D		; \
	XORL %R1D, %R2D		; \
	/*Overwrite high input word with high output word, and we're done.*/ \
	MOVL %T1D, %R1D

// backup_key: pack four 32-bit key words into two 64-bit registers.
//   pack_a = (hi_a << 32) + lo_a
//   pack_b = (hi_b << 32) + lo_b
// All six arguments are 64-bit register names WITHOUT the leading '%'.
// The upper 32 bits of lo_a and lo_b must be zero, otherwise the addition
// spills into the high half. pack_a and pack_b must be distinct from the
// four inputs. The inputs are left unchanged.
#define backup_key(hi_a, lo_a, hi_b, lo_b, pack_a, pack_b) \
	MOVQ %hi_a, %pack_a ;\
	SALQ $32, %pack_a ;\
	ADDQ %lo_a, %pack_a ;\
	MOVQ %hi_b, %pack_b ;\
	SALQ $32, %pack_b ;\
	ADDQ %lo_b, %pack_b

// restore_key: unpack two 64-bit registers written by backup_key into four
// 32-bit key words.
//   lo_a_d = low half of pack_a,   hi_a_d = high half of pack_a
//   lo_b_d = low half of pack_b,   hi_b_d = high half of pack_b
// pack_a_d/pack_a and pack_b_d/pack_b are the 32-bit and 64-bit names of the
// two packed registers; all names are given WITHOUT the leading '%'.
// The packed registers are consumed: each is shifted right by 32 and ends up
// holding only its former high half. The four outputs must be distinct from
// the packed registers.
#define restore_key(pack_a_d, pack_a, pack_b_d, pack_b, hi_a_d, lo_a_d, hi_b_d, lo_b_d) \
	MOVL %pack_a_d, %lo_a_d ;\
	SHRQ $32, %pack_a ;\
	MOVL %pack_a_d, %hi_a_d ;\
	MOVL %pack_b_d, %lo_b_d ;\
	SHRQ $32, %pack_b ;\
	MOVL %pack_b_d, %hi_b_d

#ifdef DEBUG_HARNESS
.globl test_ksc4
.globl test_inv4

// test_ksc4 (DEBUG_HARNESS only): run one ksc4 step with round-constant
// index 0.
// In:  EDI, ESI, EDX, ECX = the four round-key words
//      R8                 = address of a 16-byte output buffer
// Out: the four updated words are stored at (R8), 4(R8), 8(R8), 12(R8).
// Uses R9D and RAX as the ksc4 temporaries.

test_ksc4:
	ksc4(0,EDI,ESI,EDX,ECX,CL,CH,R9D,EAX,RAX)
	MOVL %EDI, (%R8)
	MOVL %ESI, 4(%R8)
	MOVL %EDX, 8(%R8)
	MOVL %ECX, 12(%R8)
	RET

// test_inv4 (DEBUG_HARNESS only): run one inv4 step with round-constant
// index 0.
// In:  EDI, ESI, EDX, ECX = the four round-key words
//      R8                 = address of a 16-byte output buffer
// Out: the four resulting words are stored at (R8), 4(R8), 8(R8), 12(R8).
// RBX is saved around the macro because EBX/BL serve as its T3 temporary;
// R9 and RAX are the other temporaries.
test_inv4:
	PUSHQ %RBX
	inv4(0,EDI,DIL,ESI,SIL,EDX,DL,DH,ECX,CL,CH,R9D,R9B,EAX,AL,RAX,EBX,BL)
	POPQ %RBX
	MOVL %EDI, (%R8)
	MOVL %ESI, 4(%R8)
	MOVL %EDX, 8(%R8)
	MOVL %ECX, 12(%R8)
	RET

#endif

// aes_encrypt_internal: AES-128 encryption of one block held entirely in
// registers; each round key is derived on the fly with ksc4 instead of being
// read from a stored key schedule.
//
//Called with key in EBX EDX R14D R15D
//           plaintext in EAX ECX R10D R11D
//Consumes plaintext and outputs ciphertext to EAX ECX R10D R11D
//Uses temporary registers RDI, RSI, R8, R13, R9, R12
//The key registers EBX EDX R14D R15D are overwritten as well.
//No memory is written; only the lookup tables are read.
aes_encrypt_internal:
	//Simultaneous key expansion and encryption
	//Input plaintext: EAX ECX R10D R11D
	//Input key: EBX EDX R14D R15D (input key is OU1 ... OU4)
	//Registers used by fwd_rnd: RDI, RSI, R8, and R13
	//Registers used by ksc4: (four registers of previous round key)
	//                                            (two temporary registers)
	//
	//The state and the round key swap register sets every round:
	//fwd_rnd XORs the table output into the registers that hold the
	//round key, so after each round the new state lives where the key
	//was. backup_key keeps a copy of the round key in R9/R12 so that
	//restore_key can unpack it into the (now dead) old state registers
	//for the next ksc4 step.

	//XOR in first round key
	XORL %EBX, %EAX
	XORL %EDX, %ECX
	XORL %R14D, %R10D
	XORL %R15D, %R11D

	//ksc4 needs the last key word in a register with low/high byte names,
	//so key words 2 and 4 trade places around the macro.
	XCHG %EDX, %R15D
	ksc4(0, EBX, R15D, R14D, EDX, DL, DH, EDI, ESI, RSI)
	XCHG %EDX, %R15D
	backup_key(RBX, RDX, R14, R15, R9, R12)
	fwd_rnd(aes_ft_tab,%eax,%al,%ah,%ecx,%cl,%ch,%r10d,%r10b,%r10,%r11d,%r11b,%r11,%ebx,%edx,%r14d,%r15d)

	//restore_key unpacks the key as EAX, R11D, R10D, ECX (words 1..4);
	//the XCHG after ksc4 puts it into EAX, ECX, R10D, R11D order.
	restore_key(R9D, R9, R12D, R12, EAX, R11D, R10D, ECX)
	ksc4(1, EAX, R11D, R10D, ECX, CL, CH, EDI, ESI, RSI)
	XCHG %ECX, %R11D
	backup_key(RAX, RCX, R10, R11, R9, R12)
	fwd_rnd(aes_ft_tab, %ebx,%bl,%bh,%edx,%dl,%dh,%r14d,%r14b,%r14,%r15d,%r15b,%r15,%eax,%ecx,%r10d,%r11d)

	restore_key(R9D, R9, R12D, R12, EBX, R15D, R14D, EDX)
	ksc4(2, EBX, R15D, R14D, EDX, DL, DH, EDI, ESI, RSI)
	XCHG %EDX, %R15D
	backup_key(RBX, RDX, R14, R15, R9, R12)
	fwd_rnd(aes_ft_tab,%eax,%al,%ah,%ecx,%cl,%ch,%r10d,%r10b,%r10,%r11d,%r11b,%r11,%ebx,%edx,%r14d,%r15d)

	restore_key(R9D, R9, R12D, R12, EAX, R11D, R10D, ECX)
	ksc4(3, EAX, R11D, R10D, ECX, CL, CH, EDI, ESI, RSI)
	XCHG %ECX, %R11D
	backup_key(RAX, RCX, R10, R11, R9, R12)
	fwd_rnd(aes_ft_tab, %ebx,%bl,%bh,%edx,%dl,%dh,%r14d,%r14b,%r14,%r15d,%r15b,%r15,%eax,%ecx,%r10d,%r11d)

	restore_key(R9D, R9, R12D, R12, EBX, R15D, R14D, EDX)
	ksc4(4, EBX, R15D, R14D, EDX, DL, DH, EDI, ESI, RSI)
	XCHG %EDX, %R15D
	backup_key(RBX, RDX, R14, R15, R9, R12)
	fwd_rnd(aes_ft_tab,%eax,%al,%ah,%ecx,%cl,%ch,%r10d,%r10b,%r10,%r11d,%r11b,%r11,%ebx,%edx,%r14d,%r15d)

	restore_key(R9D, R9, R12D, R12, EAX, R11D, R10D, ECX)
	ksc4(5, EAX, R11D, R10D, ECX, CL, CH, EDI, ESI, RSI)
	XCHG %ECX, %R11D
	backup_key(RAX, RCX, R10, R11, R9, R12)
	fwd_rnd(aes_ft_tab, %ebx,%bl,%bh,%edx,%dl,%dh,%r14d,%r14b,%r14,%r15d,%r15b,%r15,%eax,%ecx,%r10d,%r11d)

	restore_key(R9D, R9, R12D, R12, EBX, R15D, R14D, EDX)
	ksc4(6, EBX, R15D, R14D, EDX, DL, DH, EDI, ESI, RSI)
	XCHG %EDX, %R15D
	backup_key(RBX, RDX, R14, R15, R9, R12)
	fwd_rnd(aes_ft_tab,%eax,%al,%ah,%ecx,%cl,%ch,%r10d,%r10b,%r10,%r11d,%r11b,%r11,%ebx,%edx,%r14d,%r15d)

	restore_key(R9D, R9, R12D, R12, EAX, R11D, R10D, ECX)
	ksc4(7, EAX, R11D, R10D, ECX, CL, CH, EDI, ESI, RSI)
	XCHG %ECX, %R11D
	backup_key(RAX, RCX, R10, R11, R9, R12)
	fwd_rnd(aes_ft_tab, %ebx,%bl,%bh,%edx,%dl,%dh,%r14d,%r14b,%r14,%r15d,%r15b,%r15,%eax,%ecx,%r10d,%r11d)

	restore_key(R9D, R9, R12D, R12, EBX, R15D, R14D, EDX)
	ksc4(8, EBX, R15D, R14D, EDX, DL, DH, EDI, ESI, RSI)
	XCHG %EDX, %R15D
	backup_key(RBX, RDX, R14, R15, R9, R12)
	fwd_rnd(aes_ft_tab,%eax,%al,%ah,%ecx,%cl,%ch,%r10d,%r10b,%r10,%r11d,%r11b,%r11,%ebx,%edx,%r14d,%r15d)

	//Last round: no key backup is needed, and the round uses aes_fl_tab
	//instead of aes_ft_tab.
	restore_key(R9D, R9, R12D, R12, EAX, R11D, R10D, ECX)
	ksc4(9, EAX, R11D, R10D, ECX, CL, CH, EDI, ESI, RSI)
	XCHG %ECX, %R11D
	fwd_rnd(aes_fl_tab, %ebx,%bl,%bh,%edx,%dl,%dh,%r14d,%r14b,%r14,%r15d,%r15b,%r15,%eax,%ecx,%r10d,%r11d)

	RET

// aes_decrypt_internal: AES-128 decryption of one block held entirely in
// registers. The key passed in is the LAST round key of the forward
// schedule; inv4 walks the schedule backwards one round at a time.
//
//Called with key in EBX EDX R14D R15D
//           ciphertext in EAX ECX R10D R11D
//Consumes ciphertext and outputs plaintext to EAX ECX R10D R11D
//Uses temporary registers RDI, RSI, R8, R13, R9, R12
//The key registers EBX EDX R14D R15D are overwritten as well.
//No memory is written; only the lookup tables are read.
aes_decrypt_internal:
	//Simultaneous key expansion and decryption
	//Input ciphertext: EAX ECX R10D R11D
	//Input key: EBX EDX R14D R15D (input key is OU1 ... OU4)
	//Registers used by inv_rnd: RDI, RSI, R8, and R13
	//Registers used by inv4: (four registers of previous round key)
	//                                            (three temporary registers)
	//
	//Every round does the following:
	//  1. inv4 turns the current forward round key into the previous one.
	//  2. backup_key saves that forward key in R9/R12 for the next round.
	//  3. mix_col with aes_im_tab is applied to each key word. Two key
	//     words sit in registers without high-byte names, so they are
	//     temporarily exchanged with two state words around the block.
	//  4. inv_rnd XORs the table output into the key registers, which
	//     then hold the new state.
	//  5. restore_key unpacks the saved forward key into the old state
	//     registers.

	//Swap input registers
	XCHG %R11D, %EAX
	XCHG %R10D, %ECX
	XCHG %EBX, %R15D
	XCHG %EDX, %R14D

	//XOR in first round key
	XORL %R15D, %R11D
	XORL %R14D, %R10D
	XORL %EDX, %ECX
	XORL %EBX, %EAX

	inv4(9,R15D,R15B,R14D,R14B,EDX,DL,DH,EBX,BL,BH,R8D,R8B,ESI,SIL,RSI,EDI,DIL)
	backup_key(R15, R14, RDX, RBX, R9, R12)
	XCHG %R15D, %EAX
	XCHG %R14D, %ECX
	mix_col(aes_im_tab,EAX,AL,AH,EDI,ESI,RSI)
	MOVL %EDI, %EAX
	mix_col(aes_im_tab,EBX,BL,BH,EDI,ESI,RSI)
	MOVL %EDI, %EBX
	mix_col(aes_im_tab,ECX,CL,CH,EDI,ESI,RSI)
	MOVL %EDI, %ECX
	mix_col(aes_im_tab,EDX,DL,DH,EDI,ESI,RSI)
	MOVL %EDI, %EDX
	XCHG %R15D, %EAX
	XCHG %R14D, %ECX
	inv_rnd(aes_it_tab,%r11d,%r11b,%r11,%r10d,%r10b,%r10,%ecx,%cl,%ch,%eax,%al,%ah,%r15d,%r14d,%edx,%ebx)

	restore_key(R9D,R9,R12D,R12,R11D,R10D,ECX,EAX)
	inv4(8,R11D,R11B,R10D,R10B,ECX,CL,CH,EAX,AL,AH,R8D,R8B,ESI,SIL,RSI,EDI,DIL)
	backup_key(R11,R10,RCX,RAX,R9,R12)
	XCHG %R11D, %EBX
	XCHG %R10D, %EDX
	mix_col(aes_im_tab,EAX,AL,AH,EDI,ESI,RSI)
	MOVL %EDI, %EAX
	mix_col(aes_im_tab,EBX,BL,BH,EDI,ESI,RSI)
	MOVL %EDI, %EBX
	mix_col(aes_im_tab,ECX,CL,CH,EDI,ESI,RSI)
	MOVL %EDI, %ECX
	mix_col(aes_im_tab,EDX,DL,DH,EDI,ESI,RSI)
	MOVL %EDI, %EDX
	XCHG %R11D, %EBX
	XCHG %R10D, %EDX
	inv_rnd(aes_it_tab,%r15d,%r15b,%r15,%r14d,%r14b,%r14,%edx,%dl,%dh,%ebx,%bl,%bh,%r11d,%r10d,%ecx,%eax)

	restore_key(R9D,R9,R12D,R12,R15D,R14D,EDX,EBX)
	inv4(7,R15D,R15B,R14D,R14B,EDX,DL,DH,EBX,BL,BH,R8D,R8B,ESI,SIL,RSI,EDI,DIL)
	backup_key(R15, R14, RDX, RBX, R9, R12)
	XCHG %R15D, %EAX
	XCHG %R14D, %ECX
	mix_col(aes_im_tab,EAX,AL,AH,EDI,ESI,RSI)
	MOVL %EDI, %EAX
	mix_col(aes_im_tab,EBX,BL,BH,EDI,ESI,RSI)
	MOVL %EDI, %EBX
	mix_col(aes_im_tab,ECX,CL,CH,EDI,ESI,RSI)
	MOVL %EDI, %ECX
	mix_col(aes_im_tab,EDX,DL,DH,EDI,ESI,RSI)
	MOVL %EDI, %EDX
	XCHG %R15D, %EAX
	XCHG %R14D, %ECX
	inv_rnd(aes_it_tab,%r11d,%r11b,%r11,%r10d,%r10b,%r10,%ecx,%cl,%ch,%eax,%al,%ah,%r15d,%r14d,%edx,%ebx)

	restore_key(R9D,R9,R12D,R12,R11D,R10D,ECX,EAX)
	inv4(6,R11D,R11B,R10D,R10B,ECX,CL,CH,EAX,AL,AH,R8D,R8B,ESI,SIL,RSI,EDI,DIL)
	backup_key(R11,R10,RCX,RAX,R9,R12)
	XCHG %R11D, %EBX
	XCHG %R10D, %EDX
	mix_col(aes_im_tab,EAX,AL,AH,EDI,ESI,RSI)
	MOVL %EDI, %EAX
	mix_col(aes_im_tab,EBX,BL,BH,EDI,ESI,RSI)
	MOVL %EDI, %EBX
	mix_col(aes_im_tab,ECX,CL,CH,EDI,ESI,RSI)
	MOVL %EDI, %ECX
	mix_col(aes_im_tab,EDX,DL,DH,EDI,ESI,RSI)
	MOVL %EDI, %EDX
	XCHG %R11D, %EBX
	XCHG %R10D, %EDX
	inv_rnd(aes_it_tab,%r15d,%r15b,%r15,%r14d,%r14b,%r14,%edx,%dl,%dh,%ebx,%bl,%bh,%r11d,%r10d,%ecx,%eax)

	restore_key(R9D,R9,R12D,R12,R15D,R14D,EDX,EBX)
	inv4(5,R15D,R15B,R14D,R14B,EDX,DL,DH,EBX,BL,BH,R8D,R8B,ESI,SIL,RSI,EDI,DIL)
	backup_key(R15, R14, RDX, RBX, R9, R12)
	XCHG %R15D, %EAX
	XCHG %R14D, %ECX
	mix_col(aes_im_tab,EAX,AL,AH,EDI,ESI,RSI)
	MOVL %EDI, %EAX
	mix_col(aes_im_tab,EBX,BL,BH,EDI,ESI,RSI)
	MOVL %EDI, %EBX
	mix_col(aes_im_tab,ECX,CL,CH,EDI,ESI,RSI)
	MOVL %EDI, %ECX
	mix_col(aes_im_tab,EDX,DL,DH,EDI,ESI,RSI)
	MOVL %EDI, %EDX
	XCHG %R15D, %EAX
	XCHG %R14D, %ECX
	inv_rnd(aes_it_tab,%r11d,%r11b,%r11,%r10d,%r10b,%r10,%ecx,%cl,%ch,%eax,%al,%ah,%r15d,%r14d,%edx,%ebx)

	restore_key(R9D,R9,R12D,R12,R11D,R10D,ECX,EAX)
	inv4(4,R11D,R11B,R10D,R10B,ECX,CL,CH,EAX,AL,AH,R8D,R8B,ESI,SIL,RSI,EDI,DIL)
	backup_key(R11,R10,RCX,RAX,R9,R12)
	XCHG %R11D, %EBX
	XCHG %R10D, %EDX
	mix_col(aes_im_tab,EAX,AL,AH,EDI,ESI,RSI)
	MOVL %EDI, %EAX
	mix_col(aes_im_tab,EBX,BL,BH,EDI,ESI,RSI)
	MOVL %EDI, %EBX
	mix_col(aes_im_tab,ECX,CL,CH,EDI,ESI,RSI)
	MOVL %EDI, %ECX
	mix_col(aes_im_tab,EDX,DL,DH,EDI,ESI,RSI)
	MOVL %EDI, %EDX
	XCHG %R11D, %EBX
	XCHG %R10D, %EDX
	inv_rnd(aes_it_tab,%r15d,%r15b,%r15,%r14d,%r14b,%r14,%edx,%dl,%dh,%ebx,%bl,%bh,%r11d,%r10d,%ecx,%eax)

	restore_key(R9D,R9,R12D,R12,R15D,R14D,EDX,EBX)
	inv4(3,R15D,R15B,R14D,R14B,EDX,DL,DH,EBX,BL,BH,R8D,R8B,ESI,SIL,RSI,EDI,DIL)
	backup_key(R15, R14, RDX, RBX, R9, R12)
	XCHG %R15D, %EAX
	XCHG %R14D, %ECX
	mix_col(aes_im_tab,EAX,AL,AH,EDI,ESI,RSI)
	MOVL %EDI, %EAX
	mix_col(aes_im_tab,EBX,BL,BH,EDI,ESI,RSI)
	MOVL %EDI, %EBX
	mix_col(aes_im_tab,ECX,CL,CH,EDI,ESI,RSI)
	MOVL %EDI, %ECX
	mix_col(aes_im_tab,EDX,DL,DH,EDI,ESI,RSI)
	MOVL %EDI, %EDX
	XCHG %R15D, %EAX
	XCHG %R14D, %ECX
	inv_rnd(aes_it_tab,%r11d,%r11b,%r11,%r10d,%r10b,%r10,%ecx,%cl,%ch,%eax,%al,%ah,%r15d,%r14d,%edx,%ebx)

	restore_key(R9D,R9,R12D,R12,R11D,R10D,ECX,EAX)
	inv4(2,R11D,R11B,R10D,R10B,ECX,CL,CH,EAX,AL,AH,R8D,R8B,ESI,SIL,RSI,EDI,DIL)
	backup_key(R11,R10,RCX,RAX,R9,R12)
	XCHG %R11D, %EBX
	XCHG %R10D, %EDX
	mix_col(aes_im_tab,EAX,AL,AH,EDI,ESI,RSI)
	MOVL %EDI, %EAX
	mix_col(aes_im_tab,EBX,BL,BH,EDI,ESI,RSI)
	MOVL %EDI, %EBX
	mix_col(aes_im_tab,ECX,CL,CH,EDI,ESI,RSI)
	MOVL %EDI, %ECX
	mix_col(aes_im_tab,EDX,DL,DH,EDI,ESI,RSI)
	MOVL %EDI, %EDX
	XCHG %R11D, %EBX
	XCHG %R10D, %EDX
	inv_rnd(aes_it_tab,%r15d,%r15b,%r15,%r14d,%r14b,%r14,%edx,%dl,%dh,%ebx,%bl,%bh,%r11d,%r10d,%ecx,%eax)

	restore_key(R9D,R9,R12D,R12,R15D,R14D,EDX,EBX)
	inv4(1,R15D,R15B,R14D,R14B,EDX,DL,DH,EBX,BL,BH,R8D,R8B,ESI,SIL,RSI,EDI,DIL)
	backup_key(R15, R14, RDX, RBX, R9, R12)
	XCHG %R15D, %EAX
	XCHG %R14D, %ECX
	mix_col(aes_im_tab,EAX,AL,AH,EDI,ESI,RSI)
	MOVL %EDI, %EAX
	mix_col(aes_im_tab,EBX,BL,BH,EDI,ESI,RSI)
	MOVL %EDI, %EBX
	mix_col(aes_im_tab,ECX,CL,CH,EDI,ESI,RSI)
	MOVL %EDI, %ECX
	mix_col(aes_im_tab,EDX,DL,DH,EDI,ESI,RSI)
	MOVL %EDI, %EDX
	XCHG %R15D, %EAX
	XCHG %R14D, %ECX
	inv_rnd(aes_it_tab,%r11d,%r11b,%r11,%r10d,%r10b,%r10,%ecx,%cl,%ch,%eax,%al,%ah,%r15d,%r14d,%edx,%ebx)

	//Last round: the round-0 key is used as produced by inv4 (no mix_col
	//step), and the round uses aes_il_tab instead of aes_it_tab.
	restore_key(R9D,R9,R12D,R12,R11D,R10D,ECX,EAX)
	inv4(0,R11D,R11B,R10D,R10B,ECX,CL,CH,EAX,AL,AH,R8D,R8B,ESI,SIL,RSI,EDI,DIL)
	inv_rnd(aes_il_tab,%r15d,%r15b,%r15,%r14d,%r14b,%r14,%edx,%dl,%dh,%ebx,%bl,%bh,%r11d,%r10d,%ecx,%eax)

	//Undo the register swap made on entry so the plaintext words come
	//out in EAX ECX R10D R11D order
	XCHG %R11D, %EAX
	XCHG %R10D, %ECX

	RET

// void aes_encrypt(const aes_context *cx, const unsigned char in_blk[], unsigned char out_blk[])
// Thin wrapper: points RDI at the wrapped encryption key inside the context,
// selects aes_encrypt_internal as the block routine (passed in RBP) and lets
// aes_cipher do the work. RSI and RDX are handed through untouched.
aes_encrypt:
	pushq	%rbp				// callee-saved; reused to carry the routine address
	leaq	ekey(%rdi), %rdi		// RDI = &cx->ekey
	leaq	(aes_encrypt_internal), %rbp	// RBP = block routine for aes_cipher
	call	aes_cipher
	popq	%rbp
	ret

// void aes_decrypt(const aes_context *cx, const unsigned char in_blk[], unsigned char out_blk[])
// Thin wrapper: points RDI at the wrapped decryption key inside the context,
// selects aes_decrypt_internal as the block routine (passed in RBP) and lets
// aes_cipher do the work. RSI and RDX are handed through untouched.
aes_decrypt:
	pushq	%rbp				// callee-saved; reused to carry the routine address
	leaq	dkey(%rdi), %rdi		// RDI = &cx->dkey
	leaq	(aes_decrypt_internal), %rbp	// RBP = block routine for aes_cipher
	call	aes_cipher
	popq	%rbp
	ret

//void aes_encrypt/aes_decrypt(const aes_context *cx, const unsigned char in_blk[], unsigned char out_blk[])
//                  rdi                    rsi                           rdx
// in_blk and out_blk are both 16-byte buffers
//
// aes_cipher: shared body of aes_encrypt and aes_decrypt.
// In:  RDI = address of the 16-byte wrapped key inside the context
//            (cx+ekey or cx+dkey, already computed by the wrapper)
//      RSI = in_blk, RDX = out_blk
//      RBP = address of aes_encrypt_internal or aes_decrypt_internal
// Steps:
//   1. save RFLAGS and the callee-saved registers, disable interrupts
//   2. read the 128-bit master key from four MSRs (set chosen by
//      intel_or_amd: 1 selects the Intel list, anything else the AMD list)
//   3. unwrap the context key with aes_encrypt_internal under the master key
//   4. run the routine in RBP on the input block with the unwrapped key
//   5. store the result, restore registers, scrub every caller-saved
//      register and only then restore RFLAGS
// On return RAX, RCX, RDX, RSI, RDI and R8-R11 are all zero.
// Key material lives only in registers; the stack receives nothing but the
// saved registers and the two buffer pointers.
aes_cipher:
	//Push rFLAGS on stack
	PUSHFQ

	//Push all callee-saved registers except RBP (which was pushed by our caller and contains an argument we care about).  Optimize this later.
	PUSHQ %RBX
	PUSHQ %R12
	PUSHQ %R13
	PUSHQ %R14
	PUSHQ %R15

	//Also push RSI and RDX since they contain arguments we care about
	PUSHQ %RDX
	PUSHQ %RSI

	//Disable interrupts.
#ifndef DEBUG_HARNESS
	CLI
#endif

	//Copy the secret XOR quantity into a register.
	//We need to switch on whether we are an Intel or AMD processor.
	MOVL (intel_or_amd), %R9D
	SUBL $1, %R9D
	JE use_intel

	//We are AMD: C0010004 C0010005 C0010006 C0010007
	//RDMSR overwrites EDX, so the second key word is parked in R12D.
	MOVL $0xC0010004, %ECX
#ifdef DEBUG_HARNESS
	CALL harness_rdmsr
#else
	RDMSR
#endif
	MOVL %EAX, %EBX
	MOVL $0xC0010005, %ECX
#ifdef DEBUG_HARNESS
	CALL harness_rdmsr
#else
	RDMSR
#endif
	MOVL %EAX, %R12D
	MOVL $0xC0010006, %ECX
#ifdef DEBUG_HARNESS
	CALL harness_rdmsr
#else
	RDMSR
#endif
	MOVL %EAX, %R14D
	MOVL $0xC0010007, %ECX
#ifdef DEBUG_HARNESS
	CALL harness_rdmsr
#else
	RDMSR
#endif
	MOVL %EAX, %R15D

	JMP master_copied

use_intel:
	//We are Intel: C1 C2 309 30A
	MOVL $0xC1, %ECX
#ifdef DEBUG_HARNESS
	CALL harness_rdmsr
#else
	RDMSR
#endif
	MOVL %EAX, %EBX
	MOVL $0xC2, %ECX
#ifdef DEBUG_HARNESS
	CALL harness_rdmsr
#else
	RDMSR
#endif
	MOVL %EAX, %R12D
	MOVL $0x309, %ECX
#ifdef DEBUG_HARNESS
	CALL harness_rdmsr
#else
	RDMSR
#endif
	MOVL %EAX, %R14D
	MOVL $0x30A, %ECX
#ifdef DEBUG_HARNESS
	CALL harness_rdmsr
#else
	RDMSR
#endif
	MOVL %EAX, %R15D

master_copied:
	//Shuffle master key into proper registers (EBX EDX R14D R15D)
	MOVL %R12D, %EDX

	//Load encrypted key value
	MOVL (%RDI), %EAX
	MOVL 4(%RDI), %ECX
	MOVL 8(%RDI), %R10D
	MOVL 12(%RDI), %R11D

	//Decrypt key (use encryption function because it's faster)
	CALL aes_encrypt_internal

	//Get our input plaintext pointer back off the stack
	POPQ %RSI

	//Shuffle decrypted key into appropriate registers
	MOVL %EAX, %EBX
	MOVL %ECX, %EDX
	MOVL %R10D, %R14D
	MOVL %R11D, %R15D

	//Read in plaintext
	MOVL (%RSI), %EAX
	MOVL 4(%RSI), %ECX
	MOVL 8(%RSI), %R10D
	MOVL 12(%RSI), %R11D

	//Do actual encryption or decryption.
	//AT&T syntax requires '*' on an indirect call through a register;
	//without it the assembler only accepts the operand with a warning.
	CALL *%RBP

	//Get our output ciphertext pointer back off the stack
	POPQ %RDX

	//Output ciphertext
	MOVL %EAX, (%RDX)
	MOVL %ECX, 4(%RDX)
	MOVL %R10D, 8(%RDX)
	MOVL %R11D, 12(%RDX)

	//Pop callee-save registers
	POPQ %R15
	POPQ %R14
	POPQ %R13
	POPQ %R12
	POPQ %RBX

	//Zero the caller-saved registers!!!  They still hold key and state
	//fragments, and the callee-saved ones were just overwritten by the
	//pops above.  We MUST do this before enabling interrupts,
	//or we risk leaking information!
	//Optimize this later.
	XORQ %RAX, %RAX
	XORQ %RCX, %RCX
	XORQ %RDX, %RDX
	XORQ %R8, %R8
	XORQ %R9, %R9
	XORQ %R10, %R10
	XORQ %R11, %R11
	XORQ %RDI, %RDI
	XORQ %RSI, %RSI

	//Pop the flags, which will (probably) reenable interrupts
	POPFQ

	//Return to caller
	RET


.extern get_random_bytes

// void aes_set_key(aes_context *cx, const unsigned char key[], const int key_len, const int f)
// rdi = pointer to AES context
// rsi = pointer to key bytes
// rdx = key length, bytes or bits
// rcx = ed_flag, 1=encrypt only, 0=both encrypt and decrypt
//
// Only rdi and rsi are used: EDX is zeroed on entry and, like ECX, is later
// overwritten by CPUID/RDMSR without ever being read, so the key is always
// handled as 16 bytes.
//
// This entry point saves state, determines the CPU vendor on the first call
// (and has smp_master_key install a master key), loads the master key from
// the MSRs into EBX EDX R14D R15D and then jumps to
// set_key_vendor_independent, which pops everything pushed here and returns
// to the caller.
// Stack layout handed to set_key_vendor_independent (top first):
//   RSI (key pointer), RDI (context), R15, R14, R13, R12, RBX, RFLAGS.

	.align	ALIGN64BYTES
aes_set_key:

	//Push flags, disable interrupts
	PUSHFQ
#ifndef DEBUG_HARNESS
	CLI
#endif

	//Key-setting routine here
	//Return value of a function call stored in %RAX

	//Initial register pushes
	XORL %EDX, %EDX
	PUSHQ %RBX
	PUSHQ %R12
	PUSHQ %R13
	PUSHQ %R14
	PUSHQ %R15
	PUSHQ %RDI
	PUSHQ %RSI

	//See if we are the very first keyset method called
	//(intel_or_amd is 0 until the vendor has been recorded)
	MOVL (intel_or_amd), %EAX
	CMPL $0, %EAX
	JNE key_already_created

    //Handle intel_or_amd first; we need to get the CPU ID string
    //CPUID leaf 0 returns the first four vendor characters in EBX; the two
    //instructions below isolate the first one and compare it with 'G' (0x47).
    //Any vendor string that does not start with 'G' is treated as AMD.
	XORL %EAX, %EAX
	CPUID
	RORL $8, %EBX
	SHRL $24, %EBX
	CMPL $0x47, %EBX
    JE record_intel
    MOVL $2, intel_or_amd
    JMP smp_call

record_intel:
    MOVL $1, intel_or_amd

smp_call:
    //Need to call to C to handle thread crap
    //NOTE(review): interrupts are disabled at this point (except under
    //DEBUG_HARNESS) and eight quadwords have been pushed since entry -
    //confirm that smp_master_key tolerates both.
    CALL smp_master_key
	MOVL (intel_or_amd), %EAX

key_already_created:
	//Determine whether we are Intel or AMD
	//Key in registers: EBX EDX R14D R15D
	//RDMSR overwrites EDX, so the second key word is parked in R8D and
	//moved to EDX after the last read.
	SUBL $1, %EAX
	JE load_intel

	//We are AMD: C0010004 C0010005 C0010006 C0010007
	MOVL $0xC0010004, %ECX
#ifdef DEBUG_HARNESS
	CALL harness_rdmsr
#else
	RDMSR
#endif
	MOVL %EAX, %EBX

	MOVL $0xC0010005, %ECX
#ifdef DEBUG_HARNESS
	CALL harness_rdmsr
#else
	RDMSR
#endif
	MOVL %EAX, %R8D

	MOVL $0xC0010006, %ECX
#ifdef DEBUG_HARNESS
	CALL harness_rdmsr
#else
	RDMSR
#endif
	MOVL %EAX, %R14D

	MOVL $0xC0010007, %ECX
#ifdef DEBUG_HARNESS
	CALL harness_rdmsr
#else
	RDMSR
#endif
	MOVL %EAX, %R15D

	MOVL %R8D, %EDX
	JMP set_key_vendor_independent

load_intel:
	//We are Intel: C1 C2 309 30A
	MOVL $0xC1, %ECX
#ifdef DEBUG_HARNESS
	CALL harness_rdmsr
#else
	RDMSR
#endif
	MOVL %EAX, %EBX

	MOVL $0xC2, %ECX
#ifdef DEBUG_HARNESS
	CALL harness_rdmsr
#else
	RDMSR
#endif
	MOVL %EAX, %R8D

	MOVL $0x309, %ECX
#ifdef DEBUG_HARNESS
	CALL harness_rdmsr
#else
	RDMSR
#endif
	MOVL %EAX, %R14D

	MOVL $0x30A, %ECX
#ifdef DEBUG_HARNESS
	CALL harness_rdmsr
#else
	RDMSR
#endif
	MOVL %EAX, %R15D

	MOVL %R8D, %EDX
	JMP set_key_vendor_independent

.extern smp_master_key

// smp_asm_set_master_key: store a 128-bit master key in four MSRs.
//Parameter: pointer to master key in %RDI (four 32-bit words are read)
// Returns 0 in RAX.
// intel_or_amd selects the MSR list: 1 uses the Intel list, any other value
// the AMD list. Each word is written with EDX = 0, so only the low 32 bits
// of every MSR carry key material.
// Clobbers RAX, RCX, RDX and the flags.
// NOTE(review): WRMSR is a privileged instruction; presumably this is meant
// to be run on every CPU by smp_master_key - confirm against that function.
.globl smp_asm_set_master_key
smp_asm_set_master_key:
    MOVL (intel_or_amd), %EAX
    SUBL $1, %EAX
	JE set_intel

	//We are AMD: C0010004 C0010005 C0010006 C0010007
	XORL %EDX, %EDX
	MOVL (%RDI), %EAX
	MOVL $0xC0010004, %ECX
#ifdef DEBUG_HARNESS
	CALL harness_wrmsr
#else
	WRMSR
#endif
	XORL %EDX, %EDX
	MOVL 4(%RDI), %EAX
	MOVL $0xC0010005, %ECX
#ifdef DEBUG_HARNESS
	CALL harness_wrmsr
#else
	WRMSR
#endif
	XORL %EDX, %EDX
	MOVL 8(%RDI), %EAX
	MOVL $0xC0010006, %ECX
#ifdef DEBUG_HARNESS
	CALL harness_wrmsr
#else
	WRMSR
#endif
	XORL %EDX, %EDX
	MOVL 12(%RDI), %EAX
	MOVL $0xC0010007, %ECX
#ifdef DEBUG_HARNESS
	CALL harness_wrmsr
#else
	WRMSR
#endif

    //Do not leave the last key word behind in RAX
    XORQ %RAX, %RAX
    RET

set_intel:
	//We are Intel: C1 C2 309 30A
	XORL %EDX, %EDX
	MOVL (%RDI), %EAX
	MOVL $0xC1, %ECX
#ifdef DEBUG_HARNESS
	CALL harness_wrmsr
#else
	WRMSR
#endif
	XORL %EDX, %EDX
	MOVL 4(%RDI), %EAX
	MOVL $0xC2, %ECX
#ifdef DEBUG_HARNESS
	CALL harness_wrmsr
#else
	WRMSR
#endif
	XORL %EDX, %EDX
	MOVL 8(%RDI), %EAX
	MOVL $0x309, %ECX
#ifdef DEBUG_HARNESS
	CALL harness_wrmsr
#else
	WRMSR
#endif
	XORL %EDX, %EDX
	MOVL 12(%RDI), %EAX
	MOVL $0x30A, %ECX
#ifdef DEBUG_HARNESS
	CALL harness_wrmsr
#else
	WRMSR
#endif

    //Do not leave the last key word behind in RAX
    XORQ %RAX, %RAX
    RET


// set_key_vendor_independent: second half of aes_set_key (reached by JMP,
// never called directly).
// In:  EBX EDX R14D R15D = master key as read from the MSRs
//      stack (top first) = key pointer, context pointer, R15, R14, R13,
//                          R12, RBX, RFLAGS - all pushed by aes_set_key
// Effect:
//   cx->ekey[0..15] = inverse cipher, under the master key, of the caller's key
//   cx->dkey[0..15] = inverse cipher, under the master key, of the caller's
//                     10th-round key
//   aes_cipher later undoes this with aes_encrypt_internal.
// The 10th-round master key is parked in MM0/MM1 while R9/R12 are needed
// elsewhere; both MMX registers are cleared afterwards.
// NOTE(review): MM0/MM1 alias the x87 register stack and no EMMS is issued -
// confirm that the caller's FPU state may be disturbed here.
// On return RAX, RCX, RDX, RSI, RDI and R8-R11 are all zero.
set_key_vendor_independent:
	//We now need to "decrypt" the context keys.
	//Run the master key through ksc4
	//(ksc4 needs the last word in EDX for its byte names, hence the XCHG)
	XCHG %EDX, %R15D
	ksc4(0, EBX, R15D, R14D, EDX, DL, DH, EDI, ESI, RSI)
	ksc4(1, EBX, R15D, R14D, EDX, DL, DH, EDI, ESI, RSI)
	ksc4(2, EBX, R15D, R14D, EDX, DL, DH, EDI, ESI, RSI)
	ksc4(3, EBX, R15D, R14D, EDX, DL, DH, EDI, ESI, RSI)
	ksc4(4, EBX, R15D, R14D, EDX, DL, DH, EDI, ESI, RSI)
	ksc4(5, EBX, R15D, R14D, EDX, DL, DH, EDI, ESI, RSI)
	ksc4(6, EBX, R15D, R14D, EDX, DL, DH, EDI, ESI, RSI)
	ksc4(7, EBX, R15D, R14D, EDX, DL, DH, EDI, ESI, RSI)
	ksc4(8, EBX, R15D, R14D, EDX, DL, DH, EDI, ESI, RSI)
	ksc4(9, EBX, R15D, R14D, EDX, DL, DH, EDI, ESI, RSI)
	XCHG %EDX, %R15D

	//Back the 10th round master key up
	backup_key(RBX, RDX, R14, R15, R9, R12)
	MOVQ %R9, %MM0
	MOVQ %R12, %MM1

	//Read zero-round context key into appropriate registers
	POPQ %RSI
	MOVL (%RSI), %EAX
	MOVL 4(%RSI), %ECX
	MOVL 8(%RSI), %R10D
	MOVL 12(%RSI), %R11D

	//Call decryption routine (it uses RSI as a temporary, so keep the
	//key pointer on the stack across the call)
	PUSHQ %RSI
	CALL aes_decrypt_internal
	POPQ %RSI

	//Re-read zero-round context key into available registers
	//(word order EBX, R15D, R14D, EDX as ksc4 expects)
	MOVL (%RSI), %EBX
	MOVL 4(%RSI), %R15D
	MOVL 8(%RSI), %R14D
	MOVL 12(%RSI), %EDX

	//Write encrypted zero-round context key
	POPQ %RDI
	LEAQ ekey(%RDI), %RSI
	MOVL %EAX, (%RSI)
	MOVL %ECX, 4(%RSI)
	MOVL %R10D, 8(%RSI)
	MOVL %R11D, 12(%RSI)

	//Calculate pointer to 10th-round context key, push to stack
	LEAQ dkey(%RDI), %RSI
	PUSHQ %RSI

	//Calculate 10th-round context key
	ksc4(0,EBX,R15D,R14D,EDX,DL,DH,EAX,ECX,RCX)
	ksc4(1,EBX,R15D,R14D,EDX,DL,DH,EAX,ECX,RCX)
	ksc4(2,EBX,R15D,R14D,EDX,DL,DH,EAX,ECX,RCX)
	ksc4(3,EBX,R15D,R14D,EDX,DL,DH,EAX,ECX,RCX)
	ksc4(4,EBX,R15D,R14D,EDX,DL,DH,EAX,ECX,RCX)
	ksc4(5,EBX,R15D,R14D,EDX,DL,DH,EAX,ECX,RCX)
	ksc4(6,EBX,R15D,R14D,EDX,DL,DH,EAX,ECX,RCX)
	ksc4(7,EBX,R15D,R14D,EDX,DL,DH,EAX,ECX,RCX)
	ksc4(8,EBX,R15D,R14D,EDX,DL,DH,EAX,ECX,RCX)
	ksc4(9,EBX,R15D,R14D,EDX,DL,DH,EAX,ECX,RCX)

	//Move 10th-round context key into appropriate registers
	MOVL %EBX, %EAX
	MOVL %R15D, %ECX
	MOVL %R14D, %R10D
	MOVL %EDX, %R11D

	//Restore 10th round master key
	MOVQ %MM0, %R9
	MOVQ %MM1, %R12
	XORQ %RBX, %RBX
	MOVQ %RBX, %MM0 //kill the master key from MMX regs
	MOVQ %RBX, %MM1 //kill the master key from MMx regs
	restore_key(R9D,R9,R12D,R12,EBX,EDX,R14D,R15D)

	//Wrap the 10th-round context key with the inverse cipher, exactly
	//as was done for the zero-round key above
	CALL aes_decrypt_internal

	//Restore dkey pointer
	POPQ %RSI

	MOVL %EAX, (%RSI)
	MOVL %ECX, 4(%RSI)
	MOVL %R10D, 8(%RSI)
	MOVL %R11D, 12(%RSI)

	//Restore callee-save registers
	POPQ %R15
	POPQ %R14
	POPQ %R13
	POPQ %R12
	POPQ %RBX

	//Zero the caller-saved registers before interrupts can come back on:
	//R9 and the table-index temporaries still hold data derived from the
	//master key and from the caller's key.  Same register list as the
	//scrub at the end of aes_cipher.
	XORQ %RAX, %RAX
	XORQ %RCX, %RCX
	XORQ %RDX, %RDX
	XORQ %R8, %R8
	XORQ %R9, %R9
	XORQ %R10, %R10
	XORQ %R11, %R11
	XORQ %RDI, %RDI
	XORQ %RSI, %RSI

	//Pop the flags saved by aes_set_key (restores the interrupt flag)
    POPFQ
	RET

// finite field multiplies by {02}, {04} and {08}
// Each macro shifts x left and XORs in 0x11b (the AES reduction polynomial)
// once for every bit shifted past bit 7, so for an 8-bit x the result again
// fits in 8 bits. x is always a literal byte constant here; the expressions
// end up in .long directives and are evaluated by the assembler.

#define f2(x)	((x<<1)^(((x>>7)&1)*0x11b))
#define f4(x)	((x<<2)^(((x>>6)&1)*0x11b)^(((x>>6)&2)*0x11b))
#define f8(x)	((x<<3)^(((x>>5)&1)*0x11b)^(((x>>5)&2)*0x11b)^(((x>>5)&4)*0x11b))

// finite field multiplies required in table generation
// ({03}, {09}, {0b}, {0d} and {0e}, built by XORing the products above)

#define f3(x)	(f2(x) ^ x)
#define f9(x)	(f8(x) ^ x)
#define fb(x)	(f8(x) ^ f2(x) ^ x)
#define fd(x)	(f8(x) ^ f4(x) ^ x)
#define fe(x)	(f8(x) ^ f4(x) ^ f2(x))

// These defines generate the forward table entries
// (one 32-bit word per input byte; u1..u3 are u0 with its bytes rotated)

#define u0(x)	((f3(x) << 24) | (x << 16) | (x << 8) | f2(x))
#define u1(x)	((x << 24) | (x << 16) | (f2(x) << 8) | f3(x))
#define u2(x)	((x << 24) | (f2(x) << 16) | (f3(x) << 8) | x)
#define u3(x)	((f2(x) << 24) | (f3(x) << 16) | (x << 8) | x)

// These defines generate the inverse table entries
// (v1..v3 are v0 with its bytes rotated)

#define v0(x)	((fb(x) << 24) | (fd(x) << 16) | (f9(x) << 8) | fe(x))
#define v1(x)	((fd(x) << 24) | (f9(x) << 16) | (fe(x) << 8) | fb(x))
#define v2(x)	((f9(x) << 24) | (fe(x) << 16) | (fb(x) << 8) | fd(x))
#define v3(x)	((fe(x) << 24) | (fb(x) << 16) | (fd(x) << 8) | f9(x))

// These defines generate entries for the last round tables
// (the input byte placed in byte position 0, 1, 2 or 3 of the word)

#define w0(x)	(x)
#define w1(x)	(x <<  8)
#define w2(x)	(x << 16)
#define w3(x)	(x << 24)

// macro to generate inverse mix column tables (needed for the key schedule)
// im_dataN(p1) emits 32 .long entries p1(v) for v = 0x20*N .. 0x20*N+0x1f in
// ascending order; im_data0 .. im_data7 together cover every byte value
// 0x00 .. 0xff. p1 is the name of a one-argument entry macro.

#define im_data0(p1) \
	.long	p1(0x00),p1(0x01),p1(0x02),p1(0x03),p1(0x04),p1(0x05),p1(0x06),p1(0x07) ;\
	.long	p1(0x08),p1(0x09),p1(0x0a),p1(0x0b),p1(0x0c),p1(0x0d),p1(0x0e),p1(0x0f) ;\
	.long	p1(0x10),p1(0x11),p1(0x12),p1(0x13),p1(0x14),p1(0x15),p1(0x16),p1(0x17) ;\
	.long	p1(0x18),p1(0x19),p1(0x1a),p1(0x1b),p1(0x1c),p1(0x1d),p1(0x1e),p1(0x1f)
#define im_data1(p1) \
	.long	p1(0x20),p1(0x21),p1(0x22),p1(0x23),p1(0x24),p1(0x25),p1(0x26),p1(0x27) ;\
	.long	p1(0x28),p1(0x29),p1(0x2a),p1(0x2b),p1(0x2c),p1(0x2d),p1(0x2e),p1(0x2f) ;\
	.long	p1(0x30),p1(0x31),p1(0x32),p1(0x33),p1(0x34),p1(0x35),p1(0x36),p1(0x37) ;\
	.long	p1(0x38),p1(0x39),p1(0x3a),p1(0x3b),p1(0x3c),p1(0x3d),p1(0x3e),p1(0x3f)
#define im_data2(p1) \
	.long	p1(0x40),p1(0x41),p1(0x42),p1(0x43),p1(0x44),p1(0x45),p1(0x46),p1(0x47) ;\
	.long	p1(0x48),p1(0x49),p1(0x4a),p1(0x4b),p1(0x4c),p1(0x4d),p1(0x4e),p1(0x4f) ;\
	.long	p1(0x50),p1(0x51),p1(0x52),p1(0x53),p1(0x54),p1(0x55),p1(0x56),p1(0x57) ;\
	.long	p1(0x58),p1(0x59),p1(0x5a),p1(0x5b),p1(0x5c),p1(0x5d),p1(0x5e),p1(0x5f)
#define im_data3(p1) \
	.long	p1(0x60),p1(0x61),p1(0x62),p1(0x63),p1(0x64),p1(0x65),p1(0x66),p1(0x67) ;\
	.long	p1(0x68),p1(0x69),p1(0x6a),p1(0x6b),p1(0x6c),p1(0x6d),p1(0x6e),p1(0x6f) ;\
	.long	p1(0x70),p1(0x71),p1(0x72),p1(0x73),p1(0x74),p1(0x75),p1(0x76),p1(0x77) ;\
	.long	p1(0x78),p1(0x79),p1(0x7a),p1(0x7b),p1(0x7c),p1(0x7d),p1(0x7e),p1(0x7f)
#define im_data4(p1) \
	.long	p1(0x80),p1(0x81),p1(0x82),p1(0x83),p1(0x84),p1(0x85),p1(0x86),p1(0x87) ;\
	.long	p1(0x88),p1(0x89),p1(0x8a),p1(0x8b),p1(0x8c),p1(0x8d),p1(0x8e),p1(0x8f) ;\
	.long	p1(0x90),p1(0x91),p1(0x92),p1(0x93),p1(0x94),p1(0x95),p1(0x96),p1(0x97) ;\
	.long	p1(0x98),p1(0x99),p1(0x9a),p1(0x9b),p1(0x9c),p1(0x9d),p1(0x9e),p1(0x9f)
#define im_data5(p1) \
	.long	p1(0xa0),p1(0xa1),p1(0xa2),p1(0xa3),p1(0xa4),p1(0xa5),p1(0xa6),p1(0xa7) ;\
	.long	p1(0xa8),p1(0xa9),p1(0xaa),p1(0xab),p1(0xac),p1(0xad),p1(0xae),p1(0xaf) ;\
	.long	p1(0xb0),p1(0xb1),p1(0xb2),p1(0xb3),p1(0xb4),p1(0xb5),p1(0xb6),p1(0xb7) ;\
	.long	p1(0xb8),p1(0xb9),p1(0xba),p1(0xbb),p1(0xbc),p1(0xbd),p1(0xbe),p1(0xbf)
#define im_data6(p1) \
	.long	p1(0xc0),p1(0xc1),p1(0xc2),p1(0xc3),p1(0xc4),p1(0xc5),p1(0xc6),p1(0xc7) ;\
	.long	p1(0xc8),p1(0xc9),p1(0xca),p1(0xcb),p1(0xcc),p1(0xcd),p1(0xce),p1(0xcf) ;\
	.long	p1(0xd0),p1(0xd1),p1(0xd2),p1(0xd3),p1(0xd4),p1(0xd5),p1(0xd6),p1(0xd7) ;\
	.long	p1(0xd8),p1(0xd9),p1(0xda),p1(0xdb),p1(0xdc),p1(0xdd),p1(0xde),p1(0xdf)
#define im_data7(p1) \
	.long	p1(0xe0),p1(0xe1),p1(0xe2),p1(0xe3),p1(0xe4),p1(0xe5),p1(0xe6),p1(0xe7) ;\
	.long	p1(0xe8),p1(0xe9),p1(0xea),p1(0xeb),p1(0xec),p1(0xed),p1(0xee),p1(0xef) ;\
	.long	p1(0xf0),p1(0xf1),p1(0xf2),p1(0xf3),p1(0xf4),p1(0xf5),p1(0xf6),p1(0xf7) ;\
	.long	p1(0xf8),p1(0xf9),p1(0xfa),p1(0xfb),p1(0xfc),p1(0xfd),p1(0xfe),p1(0xff)

// S-box data - 256 entries

#define sb_data0(p1) \
	.long	p1(0x63),p1(0x7c),p1(0x77),p1(0x7b),p1(0xf2),p1(0x6b),p1(0x6f),p1(0xc5) ;\
	.long	p1(0x30),p1(0x01),p1(0x67),p1(0x2b),p1(0xfe),p1(0xd7),p1(0xab),p1(0x76) ;\
	.long	p1(0xca),p1(0x82),p1(0xc9),p1(0x7d),p1(0xfa),p1(0x59),p1(0x47),p1(0xf0) ;\
	.long	p1(0xad),p1(0xd4),p1(0xa2),p1(0xaf),p1(0x9c),p1(0xa4),p1(0x72),p1(0xc0)
#define sb_data1(p1) \
	.long	p1(0xb7),p1(0xfd),p1(0x93),p1(0x26),p1(0x36),p1(0x3f),p1(0xf7),p1(0xcc) ;\
	.long	p1(0x34),p1(0xa5),p1(0xe5),p1(0xf1),p1(0x71),p1(0xd8),p1(0x31),p1(0x15) ;\
	.long	p1(0x04),p1(0xc7),p1(0x23),p1(0xc3),p1(0x18),p1(0x96),p1(0x05),p1(0x9a) ;\
	.long	p1(0x07),p1(0x12),p1(0x80),p1(0xe2),p1(0xeb),p1(0x27),p1(0xb2),p1(0x75)
// S-box entries 0x40-0x5f (part 3 of 8). p1 is a one-argument macro applied
// to every S-box byte; each result becomes one .long operand.
#define sb_data2(p1) \
	.long	p1(0x09),p1(0x83),p1(0x2c),p1(0x1a) ;\
	.long	p1(0x1b),p1(0x6e),p1(0x5a),p1(0xa0) ;\
	.long	p1(0x52),p1(0x3b),p1(0xd6),p1(0xb3) ;\
	.long	p1(0x29),p1(0xe3),p1(0x2f),p1(0x84) ;\
	.long	p1(0x53),p1(0xd1),p1(0x00),p1(0xed) ;\
	.long	p1(0x20),p1(0xfc),p1(0xb1),p1(0x5b) ;\
	.long	p1(0x6a),p1(0xcb),p1(0xbe),p1(0x39) ;\
	.long	p1(0x4a),p1(0x4c),p1(0x58),p1(0xcf)
// S-box entries 0x60-0x7f (part 4 of 8). p1 is a one-argument macro applied
// to every S-box byte; each result becomes one .long operand.
#define sb_data3(p1) \
	.long	p1(0xd0),p1(0xef),p1(0xaa),p1(0xfb) ;\
	.long	p1(0x43),p1(0x4d),p1(0x33),p1(0x85) ;\
	.long	p1(0x45),p1(0xf9),p1(0x02),p1(0x7f) ;\
	.long	p1(0x50),p1(0x3c),p1(0x9f),p1(0xa8) ;\
	.long	p1(0x51),p1(0xa3),p1(0x40),p1(0x8f) ;\
	.long	p1(0x92),p1(0x9d),p1(0x38),p1(0xf5) ;\
	.long	p1(0xbc),p1(0xb6),p1(0xda),p1(0x21) ;\
	.long	p1(0x10),p1(0xff),p1(0xf3),p1(0xd2)
// S-box entries 0x80-0x9f (part 5 of 8). p1 is a one-argument macro applied
// to every S-box byte; each result becomes one .long operand.
#define sb_data4(p1) \
	.long	p1(0xcd),p1(0x0c),p1(0x13),p1(0xec) ;\
	.long	p1(0x5f),p1(0x97),p1(0x44),p1(0x17) ;\
	.long	p1(0xc4),p1(0xa7),p1(0x7e),p1(0x3d) ;\
	.long	p1(0x64),p1(0x5d),p1(0x19),p1(0x73) ;\
	.long	p1(0x60),p1(0x81),p1(0x4f),p1(0xdc) ;\
	.long	p1(0x22),p1(0x2a),p1(0x90),p1(0x88) ;\
	.long	p1(0x46),p1(0xee),p1(0xb8),p1(0x14) ;\
	.long	p1(0xde),p1(0x5e),p1(0x0b),p1(0xdb)
// S-box entries 0xa0-0xbf (part 6 of 8). p1 is a one-argument macro applied
// to every S-box byte; each result becomes one .long operand.
#define sb_data5(p1) \
	.long	p1(0xe0),p1(0x32),p1(0x3a),p1(0x0a) ;\
	.long	p1(0x49),p1(0x06),p1(0x24),p1(0x5c) ;\
	.long	p1(0xc2),p1(0xd3),p1(0xac),p1(0x62) ;\
	.long	p1(0x91),p1(0x95),p1(0xe4),p1(0x79) ;\
	.long	p1(0xe7),p1(0xc8),p1(0x37),p1(0x6d) ;\
	.long	p1(0x8d),p1(0xd5),p1(0x4e),p1(0xa9) ;\
	.long	p1(0x6c),p1(0x56),p1(0xf4),p1(0xea) ;\
	.long	p1(0x65),p1(0x7a),p1(0xae),p1(0x08)
// S-box entries 0xc0-0xdf (part 7 of 8). p1 is a one-argument macro applied
// to every S-box byte; each result becomes one .long operand.
#define sb_data6(p1) \
	.long	p1(0xba),p1(0x78),p1(0x25),p1(0x2e) ;\
	.long	p1(0x1c),p1(0xa6),p1(0xb4),p1(0xc6) ;\
	.long	p1(0xe8),p1(0xdd),p1(0x74),p1(0x1f) ;\
	.long	p1(0x4b),p1(0xbd),p1(0x8b),p1(0x8a) ;\
	.long	p1(0x70),p1(0x3e),p1(0xb5),p1(0x66) ;\
	.long	p1(0x48),p1(0x03),p1(0xf6),p1(0x0e) ;\
	.long	p1(0x61),p1(0x35),p1(0x57),p1(0xb9) ;\
	.long	p1(0x86),p1(0xc1),p1(0x1d),p1(0x9e)
// S-box entries 0xe0-0xff (part 8 of 8). p1 is a one-argument macro applied
// to every S-box byte; each result becomes one .long operand.
#define sb_data7(p1) \
	.long	p1(0xe1),p1(0xf8),p1(0x98),p1(0x11) ;\
	.long	p1(0x69),p1(0xd9),p1(0x8e),p1(0x94) ;\
	.long	p1(0x9b),p1(0x1e),p1(0x87),p1(0xe9) ;\
	.long	p1(0xce),p1(0x55),p1(0x28),p1(0xdf) ;\
	.long	p1(0x8c),p1(0xa1),p1(0x89),p1(0x0d) ;\
	.long	p1(0xbf),p1(0xe6),p1(0x42),p1(0x68) ;\
	.long	p1(0x41),p1(0x99),p1(0x2d),p1(0x0f) ;\
	.long	p1(0xb0),p1(0x54),p1(0xbb),p1(0x16)

// Inverse S-box data - 256 entries

// Inverse S-box entries 0x00-0x1f (part 1 of 8). p1 is a one-argument macro
// applied to every byte; each result becomes one .long operand.
#define ib_data0(p1) \
	.long	p1(0x52),p1(0x09),p1(0x6a),p1(0xd5) ;\
	.long	p1(0x30),p1(0x36),p1(0xa5),p1(0x38) ;\
	.long	p1(0xbf),p1(0x40),p1(0xa3),p1(0x9e) ;\
	.long	p1(0x81),p1(0xf3),p1(0xd7),p1(0xfb) ;\
	.long	p1(0x7c),p1(0xe3),p1(0x39),p1(0x82) ;\
	.long	p1(0x9b),p1(0x2f),p1(0xff),p1(0x87) ;\
	.long	p1(0x34),p1(0x8e),p1(0x43),p1(0x44) ;\
	.long	p1(0xc4),p1(0xde),p1(0xe9),p1(0xcb)
// Inverse S-box entries 0x20-0x3f (part 2 of 8). p1 is a one-argument macro
// applied to every byte; each result becomes one .long operand.
#define ib_data1(p1) \
	.long	p1(0x54),p1(0x7b),p1(0x94),p1(0x32) ;\
	.long	p1(0xa6),p1(0xc2),p1(0x23),p1(0x3d) ;\
	.long	p1(0xee),p1(0x4c),p1(0x95),p1(0x0b) ;\
	.long	p1(0x42),p1(0xfa),p1(0xc3),p1(0x4e) ;\
	.long	p1(0x08),p1(0x2e),p1(0xa1),p1(0x66) ;\
	.long	p1(0x28),p1(0xd9),p1(0x24),p1(0xb2) ;\
	.long	p1(0x76),p1(0x5b),p1(0xa2),p1(0x49) ;\
	.long	p1(0x6d),p1(0x8b),p1(0xd1),p1(0x25)
// Inverse S-box entries 0x40-0x5f (part 3 of 8). p1 is a one-argument macro
// applied to every byte; each result becomes one .long operand.
#define ib_data2(p1) \
	.long	p1(0x72),p1(0xf8),p1(0xf6),p1(0x64) ;\
	.long	p1(0x86),p1(0x68),p1(0x98),p1(0x16) ;\
	.long	p1(0xd4),p1(0xa4),p1(0x5c),p1(0xcc) ;\
	.long	p1(0x5d),p1(0x65),p1(0xb6),p1(0x92) ;\
	.long	p1(0x6c),p1(0x70),p1(0x48),p1(0x50) ;\
	.long	p1(0xfd),p1(0xed),p1(0xb9),p1(0xda) ;\
	.long	p1(0x5e),p1(0x15),p1(0x46),p1(0x57) ;\
	.long	p1(0xa7),p1(0x8d),p1(0x9d),p1(0x84)
// Inverse S-box entries 0x60-0x7f (part 4 of 8). p1 is a one-argument macro
// applied to every byte; each result becomes one .long operand.
#define ib_data3(p1) \
	.long	p1(0x90),p1(0xd8),p1(0xab),p1(0x00) ;\
	.long	p1(0x8c),p1(0xbc),p1(0xd3),p1(0x0a) ;\
	.long	p1(0xf7),p1(0xe4),p1(0x58),p1(0x05) ;\
	.long	p1(0xb8),p1(0xb3),p1(0x45),p1(0x06) ;\
	.long	p1(0xd0),p1(0x2c),p1(0x1e),p1(0x8f) ;\
	.long	p1(0xca),p1(0x3f),p1(0x0f),p1(0x02) ;\
	.long	p1(0xc1),p1(0xaf),p1(0xbd),p1(0x03) ;\
	.long	p1(0x01),p1(0x13),p1(0x8a),p1(0x6b)
// Inverse S-box entries 0x80-0x9f (part 5 of 8). p1 is a one-argument macro
// applied to every byte; each result becomes one .long operand.
#define ib_data4(p1) \
	.long	p1(0x3a),p1(0x91),p1(0x11),p1(0x41) ;\
	.long	p1(0x4f),p1(0x67),p1(0xdc),p1(0xea) ;\
	.long	p1(0x97),p1(0xf2),p1(0xcf),p1(0xce) ;\
	.long	p1(0xf0),p1(0xb4),p1(0xe6),p1(0x73) ;\
	.long	p1(0x96),p1(0xac),p1(0x74),p1(0x22) ;\
	.long	p1(0xe7),p1(0xad),p1(0x35),p1(0x85) ;\
	.long	p1(0xe2),p1(0xf9),p1(0x37),p1(0xe8) ;\
	.long	p1(0x1c),p1(0x75),p1(0xdf),p1(0x6e)
// Inverse S-box entries 0xa0-0xbf (part 6 of 8). p1 is a one-argument macro
// applied to every byte; each result becomes one .long operand.
#define ib_data5(p1) \
	.long	p1(0x47),p1(0xf1),p1(0x1a),p1(0x71) ;\
	.long	p1(0x1d),p1(0x29),p1(0xc5),p1(0x89) ;\
	.long	p1(0x6f),p1(0xb7),p1(0x62),p1(0x0e) ;\
	.long	p1(0xaa),p1(0x18),p1(0xbe),p1(0x1b) ;\
	.long	p1(0xfc),p1(0x56),p1(0x3e),p1(0x4b) ;\
	.long	p1(0xc6),p1(0xd2),p1(0x79),p1(0x20) ;\
	.long	p1(0x9a),p1(0xdb),p1(0xc0),p1(0xfe) ;\
	.long	p1(0x78),p1(0xcd),p1(0x5a),p1(0xf4)
// Inverse S-box entries 0xc0-0xdf (part 7 of 8). p1 is a one-argument macro
// applied to every byte; each result becomes one .long operand.
#define ib_data6(p1) \
	.long	p1(0x1f),p1(0xdd),p1(0xa8),p1(0x33) ;\
	.long	p1(0x88),p1(0x07),p1(0xc7),p1(0x31) ;\
	.long	p1(0xb1),p1(0x12),p1(0x10),p1(0x59) ;\
	.long	p1(0x27),p1(0x80),p1(0xec),p1(0x5f) ;\
	.long	p1(0x60),p1(0x51),p1(0x7f),p1(0xa9) ;\
	.long	p1(0x19),p1(0xb5),p1(0x4a),p1(0x0d) ;\
	.long	p1(0x2d),p1(0xe5),p1(0x7a),p1(0x9f) ;\
	.long	p1(0x93),p1(0xc9),p1(0x9c),p1(0xef)
// Inverse S-box entries 0xe0-0xff (part 8 of 8). p1 is a one-argument macro
// applied to every byte; each result becomes one .long operand.
#define ib_data7(p1) \
	.long	p1(0xa0),p1(0xe0),p1(0x3b),p1(0x4d) ;\
	.long	p1(0xae),p1(0x2a),p1(0xf5),p1(0xb0) ;\
	.long	p1(0xc8),p1(0xeb),p1(0xbb),p1(0x3c) ;\
	.long	p1(0x83),p1(0x53),p1(0x99),p1(0x61) ;\
	.long	p1(0x17),p1(0x2b),p1(0x04),p1(0x7e) ;\
	.long	p1(0xba),p1(0x77),p1(0xd6),p1(0x26) ;\
	.long	p1(0xe1),p1(0x69),p1(0x14),p1(0x63) ;\
	.long	p1(0x55),p1(0x21),p1(0x0c),p1(0x7d)

// The rcon table, aes_rcon_tab (needed for the key schedule)
//
// Here is Dr Brian Gladman's original source code:
//	_rcon_tab:
//	%assign x   1
//	%rep 29
//	    dd  x
//	%assign x f2(x)
//	%endrep
//
// Here is the precomputed output (it is more portable this way):

	// Key schedule round constants: 29 words starting at 0x01, each one the
	// GF(2^8) double of its predecessor (0x80 doubles to 0x1b). Placed in
	// .rodata and aligned to ALIGN64BYTES.
	.section .rodata
	.align	ALIGN64BYTES
aes_rcon_tab:
	.long	0x01,0x02,0x04,0x08
	.long	0x10,0x20,0x40,0x80
	.long	0x1b,0x36,0x6c,0xd8
	.long	0xab,0x4d,0x9a,0x2f
	.long	0x5e,0xbc,0x63,0xc6
	.long	0x97,0x35,0x6a,0xd4
	.long	0xb3,0x7d,0xfa,0xef
	.long	0xc5

// The forward xor tables

	// aes_ft_tab: forward table built from the S-box data. It consists of
	// four consecutive sub-tables of 256 .long entries each (1024 entries in
	// total); sub-table k passes every S-box byte through wrapper macro uk.
	// The wrappers u0..u3 are defined elsewhere in this file.
	// NOTE(review): presumably the table used by the normal encryption
	// rounds - confirm against the code that indexes it.
	.align	ALIGN64BYTES
aes_ft_tab:
	// sub-table 0: entries 0-255, wrapper u0
	sb_data0(u0)
	sb_data1(u0)
	sb_data2(u0)
	sb_data3(u0)
	sb_data4(u0)
	sb_data5(u0)
	sb_data6(u0)
	sb_data7(u0)

	// sub-table 1: entries 256-511, wrapper u1
	sb_data0(u1)
	sb_data1(u1)
	sb_data2(u1)
	sb_data3(u1)
	sb_data4(u1)
	sb_data5(u1)
	sb_data6(u1)
	sb_data7(u1)

	// sub-table 2: entries 512-767, wrapper u2
	sb_data0(u2)
	sb_data1(u2)
	sb_data2(u2)
	sb_data3(u2)
	sb_data4(u2)
	sb_data5(u2)
	sb_data6(u2)
	sb_data7(u2)

	// sub-table 3: entries 768-1023, wrapper u3
	sb_data0(u3)
	sb_data1(u3)
	sb_data2(u3)
	sb_data3(u3)
	sb_data4(u3)
	sb_data5(u3)
	sb_data6(u3)
	sb_data7(u3)

	// aes_fl_tab: second forward table built from the S-box data. Same
	// layout as a 4 x 256 array of .long entries, but sub-table k passes
	// every S-box byte through wrapper macro wk instead of uk. The wrappers
	// w0..w3 are defined elsewhere in this file.
	// NOTE(review): presumably the table used by the final encryption round
	// - confirm against the code that indexes it.
	.align	ALIGN64BYTES
aes_fl_tab:
	// sub-table 0: entries 0-255, wrapper w0
	sb_data0(w0)
	sb_data1(w0)
	sb_data2(w0)
	sb_data3(w0)
	sb_data4(w0)
	sb_data5(w0)
	sb_data6(w0)
	sb_data7(w0)

	// sub-table 1: entries 256-511, wrapper w1
	sb_data0(w1)
	sb_data1(w1)
	sb_data2(w1)
	sb_data3(w1)
	sb_data4(w1)
	sb_data5(w1)
	sb_data6(w1)
	sb_data7(w1)

	// sub-table 2: entries 512-767, wrapper w2
	sb_data0(w2)
	sb_data1(w2)
	sb_data2(w2)
	sb_data3(w2)
	sb_data4(w2)
	sb_data5(w2)
	sb_data6(w2)
	sb_data7(w2)

	// sub-table 3: entries 768-1023, wrapper w3
	sb_data0(w3)
	sb_data1(w3)
	sb_data2(w3)
	sb_data3(w3)
	sb_data4(w3)
	sb_data5(w3)
	sb_data6(w3)
	sb_data7(w3)

// The inverse xor tables

	// aes_it_tab: inverse table built from the inverse S-box data. It
	// consists of four consecutive sub-tables of 256 .long entries each
	// (1024 entries in total); sub-table k passes every inverse S-box byte
	// through wrapper macro vk. The wrappers v0..v3 are defined elsewhere in
	// this file.
	// NOTE(review): presumably the table used by the normal decryption
	// rounds - confirm against the code that indexes it.
	.align	ALIGN64BYTES
aes_it_tab:
	// sub-table 0: entries 0-255, wrapper v0
	ib_data0(v0)
	ib_data1(v0)
	ib_data2(v0)
	ib_data3(v0)
	ib_data4(v0)
	ib_data5(v0)
	ib_data6(v0)
	ib_data7(v0)

	// sub-table 1: entries 256-511, wrapper v1
	ib_data0(v1)
	ib_data1(v1)
	ib_data2(v1)
	ib_data3(v1)
	ib_data4(v1)
	ib_data5(v1)
	ib_data6(v1)
	ib_data7(v1)

	// sub-table 2: entries 512-767, wrapper v2
	ib_data0(v2)
	ib_data1(v2)
	ib_data2(v2)
	ib_data3(v2)
	ib_data4(v2)
	ib_data5(v2)
	ib_data6(v2)
	ib_data7(v2)

	// sub-table 3: entries 768-1023, wrapper v3
	ib_data0(v3)
	ib_data1(v3)
	ib_data2(v3)
	ib_data3(v3)
	ib_data4(v3)
	ib_data5(v3)
	ib_data6(v3)
	ib_data7(v3)

	// aes_il_tab: second inverse table built from the inverse S-box data.
	// Same layout as a 4 x 256 array of .long entries, but sub-table k
	// passes every inverse S-box byte through wrapper macro wk instead of
	// vk. The wrappers w0..w3 are defined elsewhere in this file.
	// NOTE(review): presumably the table used by the final decryption round
	// - confirm against the code that indexes it.
	.align	ALIGN64BYTES
aes_il_tab:
	// sub-table 0: entries 0-255, wrapper w0
	ib_data0(w0)
	ib_data1(w0)
	ib_data2(w0)
	ib_data3(w0)
	ib_data4(w0)
	ib_data5(w0)
	ib_data6(w0)
	ib_data7(w0)

	// sub-table 1: entries 256-511, wrapper w1
	ib_data0(w1)
	ib_data1(w1)
	ib_data2(w1)
	ib_data3(w1)
	ib_data4(w1)
	ib_data5(w1)
	ib_data6(w1)
	ib_data7(w1)

	// sub-table 2: entries 512-767, wrapper w2
	ib_data0(w2)
	ib_data1(w2)
	ib_data2(w2)
	ib_data3(w2)
	ib_data4(w2)
	ib_data5(w2)
	ib_data6(w2)
	ib_data7(w2)

	// sub-table 3: entries 768-1023, wrapper w3
	ib_data0(w3)
	ib_data1(w3)
	ib_data2(w3)
	ib_data3(w3)
	ib_data4(w3)
	ib_data5(w3)
	ib_data6(w3)
	ib_data7(w3)

// The inverse mix column tables

	// aes_im_tab: inverse mix column table. It consists of four consecutive
	// sub-tables of 256 .long entries each (1024 entries in total). The
	// im_dataN macros supply the plain byte values in ascending order, so
	// entry i of sub-table k is wrapper vk applied to the byte value i. The
	// wrappers v0..v3 are defined elsewhere in this file.
	// NOTE(review): presumably consumed by the key setup when preparing a
	// decryption key schedule - confirm against aes_set_key.
	.align	ALIGN64BYTES
aes_im_tab:
	// sub-table 0: entries 0-255, wrapper v0
	im_data0(v0)
	im_data1(v0)
	im_data2(v0)
	im_data3(v0)
	im_data4(v0)
	im_data5(v0)
	im_data6(v0)
	im_data7(v0)

	// sub-table 1: entries 256-511, wrapper v1
	im_data0(v1)
	im_data1(v1)
	im_data2(v1)
	im_data3(v1)
	im_data4(v1)
	im_data5(v1)
	im_data6(v1)
	im_data7(v1)

	// sub-table 2: entries 512-767, wrapper v2
	im_data0(v2)
	im_data1(v2)
	im_data2(v2)
	im_data3(v2)
	im_data4(v2)
	im_data5(v2)
	im_data6(v2)
	im_data7(v2)

	// sub-table 3: entries 768-1023, wrapper v3
	im_data0(v3)
	im_data1(v3)
	im_data2(v3)
	im_data3(v3)
	im_data4(v3)
	im_data5(v3)
	im_data6(v3)
	im_data7(v3)

// On ELF builds that also define SECTION_NOTE_GNU_STACK, emit an empty
// .note.GNU-stack section. Its flags string is empty, so the section is
// not marked executable; by GNU toolchain convention this tells the linker
// that this object does not need an executable stack.
#ifdef __ELF__
# ifdef SECTION_NOTE_GNU_STACK
.section .note.GNU-stack,"",@progbits
# endif
#endif
