summaryrefslogtreecommitdiffstats
path: root/cryptopp562/x64masm.asm
diff options
context:
space:
mode:
Diffstat (limited to 'cryptopp562/x64masm.asm')
-rw-r--r--cryptopp562/x64masm.asm1565
1 files changed, 1565 insertions, 0 deletions
diff --git a/cryptopp562/x64masm.asm b/cryptopp562/x64masm.asm
new file mode 100644
index 0000000..c7ff836
--- /dev/null
+++ b/cryptopp562/x64masm.asm
@@ -0,0 +1,1565 @@
+include ksamd64.inc
+EXTERNDEF s_sosemanukMulTables:FAR
+.CODE
+
+ALIGN 8
+Salsa20_OperateKeystream PROC FRAME ; Produces 64-byte keystream blocks with SSE2: four blocks per pass while r8 >= 4 (label1), then one block at a time (label5). Win64 ABI.
+mov r10, [rsp + 5*8] ; r10 = fifth argument (stack slot above the return address and the 32-byte shadow space); used below as pointer to the 16-dword state
+alloc_stack(10*16 + 32*16 + 8) ; 512 bytes of scratch at rsp+0 plus 160 bytes for xmm6-xmm15; NOTE(review): the +8 presumably restores 16-byte rsp alignment for the movdqa accesses - confirm
+save_xmm128 xmm6, 0200h ; xmm6-xmm15 are saved at rsp+200h..29Fh and reloaded at label4
+save_xmm128 xmm7, 0210h
+save_xmm128 xmm8, 0220h
+save_xmm128 xmm9, 0230h
+save_xmm128 xmm10, 0240h
+save_xmm128 xmm11, 0250h
+save_xmm128 xmm12, 0260h
+save_xmm128 xmm13, 0270h
+save_xmm128 xmm14, 0280h
+save_xmm128 xmm15, 0290h
+.endprolog
+cmp r8, 4 ; r8 = number of 64-byte blocks still to produce
+jl label5 ; fewer than four: go to the single-block path
+movdqa xmm0, [r10 + 0*16] ; load the 16 state dwords as four vectors (aligned loads: r10 must be 16-byte aligned)
+movdqa xmm1, [r10 + 1*16]
+movdqa xmm2, [r10 + 2*16]
+movdqa xmm3, [r10 + 3*16]
+pshufd xmm4, xmm0, 0*64+0*16+0*4+0 ; broadcast one state dword to all four lanes ...
+movdqa [rsp + (0*4+0)*16 + 256], xmm4 ; ... and keep it at [rsp + 256 + 16*index]; indices 5 and 8 are skipped here and filled per pass at label1
+pshufd xmm4, xmm0, 1*64+1*16+1*4+1
+movdqa [rsp + (0*4+1)*16 + 256], xmm4
+pshufd xmm4, xmm0, 2*64+2*16+2*4+2
+movdqa [rsp + (0*4+2)*16 + 256], xmm4
+pshufd xmm4, xmm0, 3*64+3*16+3*4+3
+movdqa [rsp + (0*4+3)*16 + 256], xmm4
+pshufd xmm4, xmm1, 0*64+0*16+0*4+0
+movdqa [rsp + (1*4+0)*16 + 256], xmm4
+pshufd xmm4, xmm1, 2*64+2*16+2*4+2
+movdqa [rsp + (1*4+2)*16 + 256], xmm4
+pshufd xmm4, xmm1, 3*64+3*16+3*4+3
+movdqa [rsp + (1*4+3)*16 + 256], xmm4
+pshufd xmm4, xmm2, 1*64+1*16+1*4+1
+movdqa [rsp + (2*4+1)*16 + 256], xmm4
+pshufd xmm4, xmm2, 2*64+2*16+2*4+2
+movdqa [rsp + (2*4+2)*16 + 256], xmm4
+pshufd xmm4, xmm2, 3*64+3*16+3*4+3
+movdqa [rsp + (2*4+3)*16 + 256], xmm4
+pshufd xmm4, xmm3, 0*64+0*16+0*4+0
+movdqa [rsp + (3*4+0)*16 + 256], xmm4
+pshufd xmm4, xmm3, 1*64+1*16+1*4+1
+movdqa [rsp + (3*4+1)*16 + 256], xmm4
+pshufd xmm4, xmm3, 2*64+2*16+2*4+2
+movdqa [rsp + (3*4+2)*16 + 256], xmm4
+pshufd xmm4, xmm3, 3*64+3*16+3*4+3
+movdqa [rsp + (3*4+3)*16 + 256], xmm4
+label1: ; start of one four-block pass
+mov eax, dword ptr [r10 + 8*4] ; eax = state dword 8, r11d = state dword 5; the add/adc pairs below treat them as the low/high halves of a 64-bit counter
+mov r11d, dword ptr [r10 + 5*4]
+mov dword ptr [rsp + 8*16 + 0*4 + 256], eax ; lane 0 gets the current counter value
+mov dword ptr [rsp + 5*16 + 0*4 + 256], r11d
+add eax, 1 ; carry from dword 8 propagates into dword 5
+adc r11d, 0
+mov dword ptr [rsp + 8*16 + 1*4 + 256], eax ; lane 1 = counter + 1
+mov dword ptr [rsp + 5*16 + 1*4 + 256], r11d
+add eax, 1
+adc r11d, 0
+mov dword ptr [rsp + 8*16 + 2*4 + 256], eax ; lane 2 = counter + 2
+mov dword ptr [rsp + 5*16 + 2*4 + 256], r11d
+add eax, 1
+adc r11d, 0
+mov dword ptr [rsp + 8*16 + 3*4 + 256], eax ; lane 3 = counter + 3
+mov dword ptr [rsp + 5*16 + 3*4 + 256], r11d
+add eax, 1
+adc r11d, 0
+mov dword ptr [r10 + 8*4], eax ; store counter + 4 back into the state
+mov dword ptr [r10 + 5*4], r11d
+movdqa xmm0, [rsp + 12*16 + 1*256] ; first half of the first double round: inputs come from the unmodified copy at rsp+256
+movdqa xmm4, [rsp + 13*16 + 1*256]
+movdqa xmm8, [rsp + 14*16 + 1*256]
+movdqa xmm12, [rsp + 15*16 + 1*256]
+movdqa xmm2, [rsp + 0*16 + 1*256]
+movdqa xmm6, [rsp + 1*16 + 1*256]
+movdqa xmm10, [rsp + 2*16 + 1*256]
+movdqa xmm14, [rsp + 3*16 + 1*256]
+paddd xmm0, xmm2
+paddd xmm4, xmm6
+paddd xmm8, xmm10
+paddd xmm12, xmm14
+movdqa xmm1, xmm0
+movdqa xmm5, xmm4
+movdqa xmm9, xmm8
+movdqa xmm13, xmm12
+pslld xmm0, 7 ; pslld 7 + psrld 25, both XORed into the target = XOR with the sum rotated left by 7
+pslld xmm4, 7
+pslld xmm8, 7
+pslld xmm12, 7
+psrld xmm1, 32-7
+psrld xmm5, 32-7
+psrld xmm9, 32-7
+psrld xmm13, 32-7
+pxor xmm0, [rsp + 4*16 + 1*256]
+pxor xmm4, [rsp + 5*16 + 1*256]
+pxor xmm8, [rsp + 6*16 + 1*256]
+pxor xmm12, [rsp + 7*16 + 1*256]
+pxor xmm0, xmm1
+pxor xmm4, xmm5
+pxor xmm8, xmm9
+pxor xmm12, xmm13
+movdqa [rsp + 4*16], xmm0 ; results go to the working copy at rsp+0
+movdqa [rsp + 5*16], xmm4
+movdqa [rsp + 6*16], xmm8
+movdqa [rsp + 7*16], xmm12
+movdqa xmm1, xmm0
+movdqa xmm5, xmm4
+movdqa xmm9, xmm8
+movdqa xmm13, xmm12
+paddd xmm0, xmm2
+paddd xmm4, xmm6
+paddd xmm8, xmm10
+paddd xmm12, xmm14
+movdqa xmm3, xmm0
+movdqa xmm7, xmm4
+movdqa xmm11, xmm8
+movdqa xmm15, xmm12
+pslld xmm0, 9 ; rotate left by 9
+pslld xmm4, 9
+pslld xmm8, 9
+pslld xmm12, 9
+psrld xmm3, 32-9
+psrld xmm7, 32-9
+psrld xmm11, 32-9
+psrld xmm15, 32-9
+pxor xmm0, [rsp + 8*16 + 1*256]
+pxor xmm4, [rsp + 9*16 + 1*256]
+pxor xmm8, [rsp + 10*16 + 1*256]
+pxor xmm12, [rsp + 11*16 + 1*256]
+pxor xmm0, xmm3
+pxor xmm4, xmm7
+pxor xmm8, xmm11
+pxor xmm12, xmm15
+movdqa [rsp + 8*16], xmm0
+movdqa [rsp + 9*16], xmm4
+movdqa [rsp + 10*16], xmm8
+movdqa [rsp + 11*16], xmm12
+movdqa xmm3, xmm0
+movdqa xmm7, xmm4
+movdqa xmm11, xmm8
+movdqa xmm15, xmm12
+paddd xmm0, xmm1
+paddd xmm4, xmm5
+paddd xmm8, xmm9
+paddd xmm12, xmm13
+movdqa xmm1, xmm0
+movdqa xmm5, xmm4
+movdqa xmm9, xmm8
+movdqa xmm13, xmm12
+pslld xmm0, 13 ; rotate left by 13
+pslld xmm4, 13
+pslld xmm8, 13
+pslld xmm12, 13
+psrld xmm1, 32-13
+psrld xmm5, 32-13
+psrld xmm9, 32-13
+psrld xmm13, 32-13
+pxor xmm0, [rsp + 12*16 + 1*256]
+pxor xmm4, [rsp + 13*16 + 1*256]
+pxor xmm8, [rsp + 14*16 + 1*256]
+pxor xmm12, [rsp + 15*16 + 1*256]
+pxor xmm0, xmm1
+pxor xmm4, xmm5
+pxor xmm8, xmm9
+pxor xmm12, xmm13
+movdqa [rsp + 12*16], xmm0
+movdqa [rsp + 13*16], xmm4
+movdqa [rsp + 14*16], xmm8
+movdqa [rsp + 15*16], xmm12
+paddd xmm0, xmm3
+paddd xmm4, xmm7
+paddd xmm8, xmm11
+paddd xmm12, xmm15
+movdqa xmm3, xmm0
+movdqa xmm7, xmm4
+movdqa xmm11, xmm8
+movdqa xmm15, xmm12
+pslld xmm0, 18 ; rotate left by 18
+pslld xmm4, 18
+pslld xmm8, 18
+pslld xmm12, 18
+psrld xmm3, 32-18
+psrld xmm7, 32-18
+psrld xmm11, 32-18
+psrld xmm15, 32-18
+pxor xmm0, xmm2 ; slots 0-3 are still held in xmm2/xmm6/xmm10/xmm14 from the loads above
+pxor xmm4, xmm6
+pxor xmm8, xmm10
+pxor xmm12, xmm14
+pxor xmm0, xmm3
+pxor xmm4, xmm7
+pxor xmm8, xmm11
+pxor xmm12, xmm15
+movdqa [rsp + 0*16], xmm0
+movdqa [rsp + 1*16], xmm4
+movdqa [rsp + 2*16], xmm8
+movdqa [rsp + 3*16], xmm12
+mov rax, r9 ; eax = round counter taken from r9; label2 subtracts 2 per double round
+jmp label2 ; continue with the second half of the double round
+labelSSE2_Salsa_Output: ; local subroutine (entered via call): transposes xmm4-xmm7 as a 4x4 dword matrix, XORs with [rdx] when rdx != 0, stores to [rcx]
+movdqa xmm0, xmm4
+punpckldq xmm4, xmm5
+movdqa xmm1, xmm6
+punpckldq xmm6, xmm7
+movdqa xmm2, xmm4
+punpcklqdq xmm4, xmm6 ; xmm4 = lane 0 of the four inputs
+punpckhqdq xmm2, xmm6 ; xmm2 = lane 1
+punpckhdq xmm0, xmm5
+punpckhdq xmm1, xmm7
+movdqa xmm6, xmm0
+punpcklqdq xmm0, xmm1 ; xmm0 = lane 2
+punpckhqdq xmm6, xmm1 ; xmm6 = lane 3
+test rdx, rdx ; rdx == 0: no XOR source, store the vectors directly
+jz labelSSE2_Salsa_Output_A3
+test rdx, 15 ; a 16-byte aligned rdx may be used as a direct pxor memory operand
+jnz labelSSE2_Salsa_Output_A7
+pxor xmm4, [rdx+0*16] ; the four lanes are 64 bytes apart, i.e. one 16-byte piece per block
+pxor xmm2, [rdx+4*16]
+pxor xmm0, [rdx+8*16]
+pxor xmm6, [rdx+12*16]
+add rdx, 1*16 ; advance to the next 16-byte piece within each block
+jmp labelSSE2_Salsa_Output_A3
+labelSSE2_Salsa_Output_A7: ; unaligned XOR source: load with movdqu first
+movdqu xmm1, [rdx+0*16]
+pxor xmm4, xmm1
+movdqu xmm1, [rdx+4*16]
+pxor xmm2, xmm1
+movdqu xmm1, [rdx+8*16]
+pxor xmm0, xmm1
+movdqu xmm1, [rdx+12*16]
+pxor xmm6, xmm1
+add rdx, 1*16
+labelSSE2_Salsa_Output_A3:
+test rcx, 15 ; choose aligned or unaligned stores based on rcx
+jnz labelSSE2_Salsa_Output_A8
+movdqa [rcx+0*16], xmm4
+movdqa [rcx+4*16], xmm2
+movdqa [rcx+8*16], xmm0
+movdqa [rcx+12*16], xmm6
+jmp labelSSE2_Salsa_Output_A9
+labelSSE2_Salsa_Output_A8:
+movdqu [rcx+0*16], xmm4
+movdqu [rcx+4*16], xmm2
+movdqu [rcx+8*16], xmm0
+movdqu [rcx+12*16], xmm6
+labelSSE2_Salsa_Output_A9:
+add rcx, 1*16
+ret ; returns to the instruction after the call in the four-block path
+label6: ; loop top: first half of a double round, now reading the working copy at rsp+0
+movdqa xmm0, [rsp + 12*16 + 0*256]
+movdqa xmm4, [rsp + 13*16 + 0*256]
+movdqa xmm8, [rsp + 14*16 + 0*256]
+movdqa xmm12, [rsp + 15*16 + 0*256]
+movdqa xmm2, [rsp + 0*16 + 0*256]
+movdqa xmm6, [rsp + 1*16 + 0*256]
+movdqa xmm10, [rsp + 2*16 + 0*256]
+movdqa xmm14, [rsp + 3*16 + 0*256]
+paddd xmm0, xmm2
+paddd xmm4, xmm6
+paddd xmm8, xmm10
+paddd xmm12, xmm14
+movdqa xmm1, xmm0
+movdqa xmm5, xmm4
+movdqa xmm9, xmm8
+movdqa xmm13, xmm12
+pslld xmm0, 7
+pslld xmm4, 7
+pslld xmm8, 7
+pslld xmm12, 7
+psrld xmm1, 32-7
+psrld xmm5, 32-7
+psrld xmm9, 32-7
+psrld xmm13, 32-7
+pxor xmm0, [rsp + 4*16 + 0*256]
+pxor xmm4, [rsp + 5*16 + 0*256]
+pxor xmm8, [rsp + 6*16 + 0*256]
+pxor xmm12, [rsp + 7*16 + 0*256]
+pxor xmm0, xmm1
+pxor xmm4, xmm5
+pxor xmm8, xmm9
+pxor xmm12, xmm13
+movdqa [rsp + 4*16], xmm0
+movdqa [rsp + 5*16], xmm4
+movdqa [rsp + 6*16], xmm8
+movdqa [rsp + 7*16], xmm12
+movdqa xmm1, xmm0
+movdqa xmm5, xmm4
+movdqa xmm9, xmm8
+movdqa xmm13, xmm12
+paddd xmm0, xmm2
+paddd xmm4, xmm6
+paddd xmm8, xmm10
+paddd xmm12, xmm14
+movdqa xmm3, xmm0
+movdqa xmm7, xmm4
+movdqa xmm11, xmm8
+movdqa xmm15, xmm12
+pslld xmm0, 9
+pslld xmm4, 9
+pslld xmm8, 9
+pslld xmm12, 9
+psrld xmm3, 32-9
+psrld xmm7, 32-9
+psrld xmm11, 32-9
+psrld xmm15, 32-9
+pxor xmm0, [rsp + 8*16 + 0*256]
+pxor xmm4, [rsp + 9*16 + 0*256]
+pxor xmm8, [rsp + 10*16 + 0*256]
+pxor xmm12, [rsp + 11*16 + 0*256]
+pxor xmm0, xmm3
+pxor xmm4, xmm7
+pxor xmm8, xmm11
+pxor xmm12, xmm15
+movdqa [rsp + 8*16], xmm0
+movdqa [rsp + 9*16], xmm4
+movdqa [rsp + 10*16], xmm8
+movdqa [rsp + 11*16], xmm12
+movdqa xmm3, xmm0
+movdqa xmm7, xmm4
+movdqa xmm11, xmm8
+movdqa xmm15, xmm12
+paddd xmm0, xmm1
+paddd xmm4, xmm5
+paddd xmm8, xmm9
+paddd xmm12, xmm13
+movdqa xmm1, xmm0
+movdqa xmm5, xmm4
+movdqa xmm9, xmm8
+movdqa xmm13, xmm12
+pslld xmm0, 13
+pslld xmm4, 13
+pslld xmm8, 13
+pslld xmm12, 13
+psrld xmm1, 32-13
+psrld xmm5, 32-13
+psrld xmm9, 32-13
+psrld xmm13, 32-13
+pxor xmm0, [rsp + 12*16 + 0*256]
+pxor xmm4, [rsp + 13*16 + 0*256]
+pxor xmm8, [rsp + 14*16 + 0*256]
+pxor xmm12, [rsp + 15*16 + 0*256]
+pxor xmm0, xmm1
+pxor xmm4, xmm5
+pxor xmm8, xmm9
+pxor xmm12, xmm13
+movdqa [rsp + 12*16], xmm0
+movdqa [rsp + 13*16], xmm4
+movdqa [rsp + 14*16], xmm8
+movdqa [rsp + 15*16], xmm12
+paddd xmm0, xmm3
+paddd xmm4, xmm7
+paddd xmm8, xmm11
+paddd xmm12, xmm15
+movdqa xmm3, xmm0
+movdqa xmm7, xmm4
+movdqa xmm11, xmm8
+movdqa xmm15, xmm12
+pslld xmm0, 18
+pslld xmm4, 18
+pslld xmm8, 18
+pslld xmm12, 18
+psrld xmm3, 32-18
+psrld xmm7, 32-18
+psrld xmm11, 32-18
+psrld xmm15, 32-18
+pxor xmm0, xmm2
+pxor xmm4, xmm6
+pxor xmm8, xmm10
+pxor xmm12, xmm14
+pxor xmm0, xmm3
+pxor xmm4, xmm7
+pxor xmm8, xmm11
+pxor xmm12, xmm15
+movdqa [rsp + 0*16], xmm0
+movdqa [rsp + 1*16], xmm4
+movdqa [rsp + 2*16], xmm8
+movdqa [rsp + 3*16], xmm12
+label2: ; second half of a double round: same add/rotate/xor pattern with a different pairing of the 16 slots
+movdqa xmm0, [rsp + 7*16 + 0*256]
+movdqa xmm4, [rsp + 4*16 + 0*256]
+movdqa xmm8, [rsp + 5*16 + 0*256]
+movdqa xmm12, [rsp + 6*16 + 0*256]
+movdqa xmm2, [rsp + 0*16 + 0*256]
+movdqa xmm6, [rsp + 1*16 + 0*256]
+movdqa xmm10, [rsp + 2*16 + 0*256]
+movdqa xmm14, [rsp + 3*16 + 0*256]
+paddd xmm0, xmm2
+paddd xmm4, xmm6
+paddd xmm8, xmm10
+paddd xmm12, xmm14
+movdqa xmm1, xmm0
+movdqa xmm5, xmm4
+movdqa xmm9, xmm8
+movdqa xmm13, xmm12
+pslld xmm0, 7
+pslld xmm4, 7
+pslld xmm8, 7
+pslld xmm12, 7
+psrld xmm1, 32-7
+psrld xmm5, 32-7
+psrld xmm9, 32-7
+psrld xmm13, 32-7
+pxor xmm0, [rsp + 13*16 + 0*256]
+pxor xmm4, [rsp + 14*16 + 0*256]
+pxor xmm8, [rsp + 15*16 + 0*256]
+pxor xmm12, [rsp + 12*16 + 0*256]
+pxor xmm0, xmm1
+pxor xmm4, xmm5
+pxor xmm8, xmm9
+pxor xmm12, xmm13
+movdqa [rsp + 13*16], xmm0
+movdqa [rsp + 14*16], xmm4
+movdqa [rsp + 15*16], xmm8
+movdqa [rsp + 12*16], xmm12
+movdqa xmm1, xmm0
+movdqa xmm5, xmm4
+movdqa xmm9, xmm8
+movdqa xmm13, xmm12
+paddd xmm0, xmm2
+paddd xmm4, xmm6
+paddd xmm8, xmm10
+paddd xmm12, xmm14
+movdqa xmm3, xmm0
+movdqa xmm7, xmm4
+movdqa xmm11, xmm8
+movdqa xmm15, xmm12
+pslld xmm0, 9
+pslld xmm4, 9
+pslld xmm8, 9
+pslld xmm12, 9
+psrld xmm3, 32-9
+psrld xmm7, 32-9
+psrld xmm11, 32-9
+psrld xmm15, 32-9
+pxor xmm0, [rsp + 10*16 + 0*256]
+pxor xmm4, [rsp + 11*16 + 0*256]
+pxor xmm8, [rsp + 8*16 + 0*256]
+pxor xmm12, [rsp + 9*16 + 0*256]
+pxor xmm0, xmm3
+pxor xmm4, xmm7
+pxor xmm8, xmm11
+pxor xmm12, xmm15
+movdqa [rsp + 10*16], xmm0
+movdqa [rsp + 11*16], xmm4
+movdqa [rsp + 8*16], xmm8
+movdqa [rsp + 9*16], xmm12
+movdqa xmm3, xmm0
+movdqa xmm7, xmm4
+movdqa xmm11, xmm8
+movdqa xmm15, xmm12
+paddd xmm0, xmm1
+paddd xmm4, xmm5
+paddd xmm8, xmm9
+paddd xmm12, xmm13
+movdqa xmm1, xmm0
+movdqa xmm5, xmm4
+movdqa xmm9, xmm8
+movdqa xmm13, xmm12
+pslld xmm0, 13
+pslld xmm4, 13
+pslld xmm8, 13
+pslld xmm12, 13
+psrld xmm1, 32-13
+psrld xmm5, 32-13
+psrld xmm9, 32-13
+psrld xmm13, 32-13
+pxor xmm0, [rsp + 7*16 + 0*256]
+pxor xmm4, [rsp + 4*16 + 0*256]
+pxor xmm8, [rsp + 5*16 + 0*256]
+pxor xmm12, [rsp + 6*16 + 0*256]
+pxor xmm0, xmm1
+pxor xmm4, xmm5
+pxor xmm8, xmm9
+pxor xmm12, xmm13
+movdqa [rsp + 7*16], xmm0
+movdqa [rsp + 4*16], xmm4
+movdqa [rsp + 5*16], xmm8
+movdqa [rsp + 6*16], xmm12
+paddd xmm0, xmm3
+paddd xmm4, xmm7
+paddd xmm8, xmm11
+paddd xmm12, xmm15
+movdqa xmm3, xmm0
+movdqa xmm7, xmm4
+movdqa xmm11, xmm8
+movdqa xmm15, xmm12
+pslld xmm0, 18
+pslld xmm4, 18
+pslld xmm8, 18
+pslld xmm12, 18
+psrld xmm3, 32-18
+psrld xmm7, 32-18
+psrld xmm11, 32-18
+psrld xmm15, 32-18
+pxor xmm0, xmm2
+pxor xmm4, xmm6
+pxor xmm8, xmm10
+pxor xmm12, xmm14
+pxor xmm0, xmm3
+pxor xmm4, xmm7
+pxor xmm8, xmm11
+pxor xmm12, xmm15
+movdqa [rsp + 0*16], xmm0
+movdqa [rsp + 1*16], xmm4
+movdqa [rsp + 2*16], xmm8
+movdqa [rsp + 3*16], xmm12
+sub eax, 2 ; one pass through label6+label2 counts as two rounds
+jnz label6 ; NOTE(review): the loop ends only when the counter reaches exactly zero, so r9 is assumed to be even and nonzero
+movdqa xmm4, [rsp + 0*16 + 256] ; feed-forward: saved input (rsp+256) + working state (rsp+0), four slots per subroutine call
+paddd xmm4, [rsp + 0*16]
+movdqa xmm5, [rsp + 13*16 + 256]
+paddd xmm5, [rsp + 13*16]
+movdqa xmm6, [rsp + 10*16 + 256]
+paddd xmm6, [rsp + 10*16]
+movdqa xmm7, [rsp + 7*16 + 256]
+paddd xmm7, [rsp + 7*16]
+call labelSSE2_Salsa_Output ; rsp is 8 lower inside the subroutine; it makes no rsp-relative memory access
+movdqa xmm4, [rsp + 4*16 + 256]
+paddd xmm4, [rsp + 4*16]
+movdqa xmm5, [rsp + 1*16 + 256]
+paddd xmm5, [rsp + 1*16]
+movdqa xmm6, [rsp + 14*16 + 256]
+paddd xmm6, [rsp + 14*16]
+movdqa xmm7, [rsp + 11*16 + 256]
+paddd xmm7, [rsp + 11*16]
+call labelSSE2_Salsa_Output
+movdqa xmm4, [rsp + 8*16 + 256]
+paddd xmm4, [rsp + 8*16]
+movdqa xmm5, [rsp + 5*16 + 256]
+paddd xmm5, [rsp + 5*16]
+movdqa xmm6, [rsp + 2*16 + 256]
+paddd xmm6, [rsp + 2*16]
+movdqa xmm7, [rsp + 15*16 + 256]
+paddd xmm7, [rsp + 15*16]
+call labelSSE2_Salsa_Output
+movdqa xmm4, [rsp + 12*16 + 256]
+paddd xmm4, [rsp + 12*16]
+movdqa xmm5, [rsp + 9*16 + 256]
+paddd xmm5, [rsp + 9*16]
+movdqa xmm6, [rsp + 6*16 + 256]
+paddd xmm6, [rsp + 6*16]
+movdqa xmm7, [rsp + 3*16 + 256]
+paddd xmm7, [rsp + 3*16]
+call labelSSE2_Salsa_Output
+test rdx, rdx ; only advance the XOR source when one was supplied
+jz label9
+add rdx, 12*16 ; the four calls advanced 64 bytes; add 192 more to step over the whole 256-byte group
+label9:
+add rcx, 12*16 ; same adjustment for the destination pointer
+sub r8, 4 ; four blocks done
+cmp r8, 4
+jge label1 ; another full group of four remains
+label5: ; single-block path
+sub r8, 1
+jl label4 ; no blocks left: restore registers and return
+movdqa xmm0, [r10 + 0*16] ; xmm0-xmm3 = the 16 state dwords
+movdqa xmm1, [r10 + 1*16]
+movdqa xmm2, [r10 + 2*16]
+movdqa xmm3, [r10 + 3*16]
+mov rax, r9 ; eax = round counter, as in the four-block path
+label0: ; one double round per pass, entirely in xmm0-xmm5
+movdqa xmm4, xmm3
+paddd xmm4, xmm0
+movdqa xmm5, xmm4
+pslld xmm4, 7 ; xmm1 ^= rotl(xmm3 + xmm0, 7)
+psrld xmm5, 32-7
+pxor xmm1, xmm4
+pxor xmm1, xmm5
+movdqa xmm4, xmm0
+paddd xmm4, xmm1
+movdqa xmm5, xmm4
+pslld xmm4, 9 ; xmm2 ^= rotl(xmm0 + xmm1, 9)
+psrld xmm5, 32-9
+pxor xmm2, xmm4
+pxor xmm2, xmm5
+movdqa xmm4, xmm1
+paddd xmm4, xmm2
+movdqa xmm5, xmm4
+pslld xmm4, 13 ; xmm3 ^= rotl(xmm1 + xmm2, 13)
+psrld xmm5, 32-13
+pxor xmm3, xmm4
+pxor xmm3, xmm5
+movdqa xmm4, xmm2
+paddd xmm4, xmm3
+movdqa xmm5, xmm4
+pslld xmm4, 18 ; xmm0 ^= rotl(xmm2 + xmm3, 18)
+psrld xmm5, 32-18
+pxor xmm0, xmm4
+pxor xmm0, xmm5
+pshufd xmm1, xmm1, 2*64+1*16+0*4+3 ; rotate the lanes of xmm1-xmm3 before the second half
+pshufd xmm2, xmm2, 1*64+0*16+3*4+2
+pshufd xmm3, xmm3, 0*64+3*16+2*4+1
+movdqa xmm4, xmm1
+paddd xmm4, xmm0
+movdqa xmm5, xmm4
+pslld xmm4, 7
+psrld xmm5, 32-7
+pxor xmm3, xmm4
+pxor xmm3, xmm5
+movdqa xmm4, xmm0
+paddd xmm4, xmm3
+movdqa xmm5, xmm4
+pslld xmm4, 9
+psrld xmm5, 32-9
+pxor xmm2, xmm4
+pxor xmm2, xmm5
+movdqa xmm4, xmm3
+paddd xmm4, xmm2
+movdqa xmm5, xmm4
+pslld xmm4, 13
+psrld xmm5, 32-13
+pxor xmm1, xmm4
+pxor xmm1, xmm5
+movdqa xmm4, xmm2
+paddd xmm4, xmm1
+movdqa xmm5, xmm4
+pslld xmm4, 18
+psrld xmm5, 32-18
+pxor xmm0, xmm4
+pxor xmm0, xmm5
+pshufd xmm1, xmm1, 0*64+3*16+2*4+1 ; undo the lane rotation applied above
+pshufd xmm2, xmm2, 1*64+0*16+3*4+2
+pshufd xmm3, xmm3, 2*64+1*16+0*4+3
+sub eax, 2
+jnz label0 ; NOTE(review): as above, r9 is assumed to be even and nonzero
+paddd xmm0, [r10 + 0*16] ; feed-forward: add the input state
+paddd xmm1, [r10 + 1*16]
+paddd xmm2, [r10 + 2*16]
+paddd xmm3, [r10 + 3*16]
+add dword ptr [r10 + 8*4], 1 ; advance the counter in state dwords 8 (low) and 5 (high) by one block
+adc dword ptr [r10 + 5*4], 0
+pcmpeqb xmm6, xmm6 ; xmm6 = all ones
+psrlq xmm6, 32 ; xmm6 = mask selecting the low dword of each qword
+pshufd xmm7, xmm6, 0*64+1*16+2*4+3 ; xmm7 = complementary mask (high dword of each qword)
+movdqa xmm4, xmm0 ; the pand/por/shufpd sequence below regroups dwords among the four vectors before they are stored
+movdqa xmm5, xmm3
+pand xmm0, xmm7
+pand xmm4, xmm6
+pand xmm3, xmm6
+pand xmm5, xmm7
+por xmm4, xmm5
+movdqa xmm5, xmm1
+pand xmm1, xmm7
+pand xmm5, xmm6
+por xmm0, xmm5
+pand xmm6, xmm2
+pand xmm2, xmm7
+por xmm1, xmm6
+por xmm2, xmm3
+movdqa xmm5, xmm4
+movdqa xmm6, xmm0
+shufpd xmm4, xmm1, 2
+shufpd xmm0, xmm2, 2
+shufpd xmm1, xmm5, 2
+shufpd xmm2, xmm6, 2
+test rdx, rdx ; rdx == 0: no XOR source
+jz labelSSE2_Salsa_Output_B3
+test rdx, 15
+jnz labelSSE2_Salsa_Output_B7
+pxor xmm4, [rdx+0*16] ; aligned XOR source: 64 contiguous bytes
+pxor xmm0, [rdx+1*16]
+pxor xmm1, [rdx+2*16]
+pxor xmm2, [rdx+3*16]
+add rdx, 4*16
+jmp labelSSE2_Salsa_Output_B3
+labelSSE2_Salsa_Output_B7: ; unaligned XOR source
+movdqu xmm3, [rdx+0*16]
+pxor xmm4, xmm3
+movdqu xmm3, [rdx+1*16]
+pxor xmm0, xmm3
+movdqu xmm3, [rdx+2*16]
+pxor xmm1, xmm3
+movdqu xmm3, [rdx+3*16]
+pxor xmm2, xmm3
+add rdx, 4*16
+labelSSE2_Salsa_Output_B3:
+test rcx, 15 ; choose aligned or unaligned stores based on rcx
+jnz labelSSE2_Salsa_Output_B8
+movdqa [rcx+0*16], xmm4
+movdqa [rcx+1*16], xmm0
+movdqa [rcx+2*16], xmm1
+movdqa [rcx+3*16], xmm2
+jmp labelSSE2_Salsa_Output_B9
+labelSSE2_Salsa_Output_B8:
+movdqu [rcx+0*16], xmm4
+movdqu [rcx+1*16], xmm0
+movdqu [rcx+2*16], xmm1
+movdqu [rcx+3*16], xmm2
+labelSSE2_Salsa_Output_B9:
+add rcx, 4*16 ; one 64-byte block written
+jmp label5 ; next block, if any
+label4: ; epilogue: reload xmm6-xmm15 from the prologue save slots
+movdqa xmm6, [rsp + 0200h]
+movdqa xmm7, [rsp + 0210h]
+movdqa xmm8, [rsp + 0220h]
+movdqa xmm9, [rsp + 0230h]
+movdqa xmm10, [rsp + 0240h]
+movdqa xmm11, [rsp + 0250h]
+movdqa xmm12, [rsp + 0260h]
+movdqa xmm13, [rsp + 0270h]
+movdqa xmm14, [rsp + 0280h]
+movdqa xmm15, [rsp + 0290h]
+add rsp, 10*16 + 32*16 + 8 ; must match the alloc_stack amount in the prologue
+ret
+Salsa20_OperateKeystream ENDP
+
+ALIGN 8
+Sosemanuk_OperateKeystream PROC FRAME ; Produces rcx*20 dwords in chunks of up to 80: a scalar state-stepping loop (label0) fills a stack buffer, then an SSE2 pass (label1) mixes it and writes the result. Win64 ABI.
+rex_push_reg rsi ; rsi and rdi are pushed here and popped before ret
+push_reg rdi
+alloc_stack(80*4*2+12*4+8*8 + 2*16+8) ; layout from rsp: 64 bytes of qword locals, 48 bytes for the state copy, 640-byte buffer, 32 bytes for xmm6/xmm7, 8 spare
+save_xmm128 xmm6, 02f0h
+save_xmm128 xmm7, 0300h
+.endprolog
+mov rdi, r8 ; r8 = destination pointer
+mov rax, r9 ; r9 = pointer to the state (10 dwords followed by two more at offsets 10*4 and 11*4)
+mov QWORD PTR [rsp+1*8], rdi ; local 1 = destination pointer
+mov QWORD PTR [rsp+2*8], rdx ; local 2 = XOR source pointer (rdx; tested for zero before use)
+mov QWORD PTR [rsp+6*8], rax ; local 6 = state pointer, needed for the write-back at label6
+lea rcx, [4*rcx+rcx] ; rcx * 5
+lea rsi, [4*rcx] ; rsi = 20 * original rcx = total number of dwords to produce
+mov QWORD PTR [rsp+3*8], rsi ; local 3 = dwords still to produce
+movdqa xmm0, [rax+0*16] ; copy state dwords 0-9 to [rsp+64] (aligned loads: the state must be 16-byte aligned)
+movdqa [rsp + 8*8+0*16], xmm0
+movdqa xmm0, [rax+1*16]
+movdqa [rsp + 8*8+1*16], xmm0
+movq xmm0, QWORD PTR [rax+2*16]
+movq QWORD PTR [rsp + 8*8+2*16], xmm0
+psrlq xmm0, 32
+movd r10d, xmm0 ; r10d = state dword 9
+mov ecx, [rax+10*4] ; ecx and edx hold the two extra state dwords; they swap roles on every step below
+mov edx, [rax+11*4]
+pcmpeqb xmm7, xmm7 ; xmm7 = all ones, used at label1 to complement a vector with pxor
+label2: ; start of one chunk
+lea rdi, [rsp + 8*8 + 12*4] ; rdi = start of the 640-byte buffer
+mov rax, 80
+cmp rsi, 80
+cmovg rsi, rax ; rsi = min(remaining dwords, 80)
+mov QWORD PTR [rsp+7*8], rsi ; local 7 = dwords in this chunk
+lea rsi, [rdi+rsi] ; rdi advances 20 bytes per 20 steps, so rdi + chunk size marks the end of the stepping loop
+mov QWORD PTR [rsp+4*8], rsi ; local 4 = that end value
+lea rsi, s_sosemanukMulTables ; rsi = base of the external lookup tables (indexed at +0 and +1024 below)
+label0: ; 20 unrolled steps per pass; step k uses state slots k, k+2, k+3 (mod 10) and buffer slot ((k mod 4)*20 + k/4)
+mov eax, [rsp + 8*8 + ((0+0)-((0+0)/(10))*(10))*4] ; step 0: eax = state[k mod 10]
+mov [rdi + (((0)-((0)/(4))*(4))*20 + (0/4)) * 4 + 80*4], eax ; save it in the second half of the buffer (offset +320)
+rol eax, 8
+lea r11d, [r10d + edx] ; r11d = (r10d + edx) ^ ecx ...
+xor r11d, ecx
+mov [rdi + (((0)-((0)/(4))*(4))*20 + (0/4)) * 4], r11d ; ... stored in the first half of the buffer
+mov r11d, 1 ; next four instructions: r11d = (edx & 1) ? r10d : 0
+and r11d, edx
+neg r11d
+and r11d, r10d
+xor r10d, eax ; r10d ^= rol(state[k], 8) ...
+movzx eax, al
+xor r10d, [rsi+rax*4] ; ... ^ first table entry selected by the low byte of the rotated value
+mov eax, [rsp + 8*8 + ((0+3)-((0+3)/(10))*(10))*4] ; eax = state[(k+3) mod 10]
+xor r11d, [rsp + 8*8 + ((0+2)-((0+2)/(10))*(10))*4] ; r11d ^= state[(k+2) mod 10]
+add ecx, r11d
+movzx r11d, al
+shr eax, 8
+xor r10d, [rsi+1024+r11*4] ; second table (at +1024) indexed by the low byte of state[(k+3) mod 10]
+xor r10d, eax ; r10d ^= state[(k+3) mod 10] >> 8
+imul edx, 54655307h ; edx = rol(edx * 54655307h, 7)
+rol edx, 7
+mov [rsp + 8*8 + ((0+0)-((0+0)/(10))*(10))*4], r10d ; state[k mod 10] = new value; r10d carries over to the next step
+mov eax, [rsp + 8*8 + ((1+0)-((1+0)/(10))*(10))*4] ; step 1: same pattern with ecx and edx exchanged
+mov [rdi + (((1)-((1)/(4))*(4))*20 + (1/4)) * 4 + 80*4], eax
+rol eax, 8
+lea r11d, [r10d + ecx]
+xor r11d, edx
+mov [rdi + (((1)-((1)/(4))*(4))*20 + (1/4)) * 4], r11d
+mov r11d, 1
+and r11d, ecx
+neg r11d
+and r11d, r10d
+xor r10d, eax
+movzx eax, al
+xor r10d, [rsi+rax*4]
+mov eax, [rsp + 8*8 + ((1+3)-((1+3)/(10))*(10))*4]
+xor r11d, [rsp + 8*8 + ((1+2)-((1+2)/(10))*(10))*4]
+add edx, r11d
+movzx r11d, al
+shr eax, 8
+xor r10d, [rsi+1024+r11*4]
+xor r10d, eax
+imul ecx, 54655307h
+rol ecx, 7
+mov [rsp + 8*8 + ((1+0)-((1+0)/(10))*(10))*4], r10d
+mov eax, [rsp + 8*8 + ((2+0)-((2+0)/(10))*(10))*4] ; step 2
+mov [rdi + (((2)-((2)/(4))*(4))*20 + (2/4)) * 4 + 80*4], eax
+rol eax, 8
+lea r11d, [r10d + edx]
+xor r11d, ecx
+mov [rdi + (((2)-((2)/(4))*(4))*20 + (2/4)) * 4], r11d
+mov r11d, 1
+and r11d, edx
+neg r11d
+and r11d, r10d
+xor r10d, eax
+movzx eax, al
+xor r10d, [rsi+rax*4]
+mov eax, [rsp + 8*8 + ((2+3)-((2+3)/(10))*(10))*4]
+xor r11d, [rsp + 8*8 + ((2+2)-((2+2)/(10))*(10))*4]
+add ecx, r11d
+movzx r11d, al
+shr eax, 8
+xor r10d, [rsi+1024+r11*4]
+xor r10d, eax
+imul edx, 54655307h
+rol edx, 7
+mov [rsp + 8*8 + ((2+0)-((2+0)/(10))*(10))*4], r10d
+mov eax, [rsp + 8*8 + ((3+0)-((3+0)/(10))*(10))*4] ; step 3
+mov [rdi + (((3)-((3)/(4))*(4))*20 + (3/4)) * 4 + 80*4], eax
+rol eax, 8
+lea r11d, [r10d + ecx]
+xor r11d, edx
+mov [rdi + (((3)-((3)/(4))*(4))*20 + (3/4)) * 4], r11d
+mov r11d, 1
+and r11d, ecx
+neg r11d
+and r11d, r10d
+xor r10d, eax
+movzx eax, al
+xor r10d, [rsi+rax*4]
+mov eax, [rsp + 8*8 + ((3+3)-((3+3)/(10))*(10))*4]
+xor r11d, [rsp + 8*8 + ((3+2)-((3+2)/(10))*(10))*4]
+add edx, r11d
+movzx r11d, al
+shr eax, 8
+xor r10d, [rsi+1024+r11*4]
+xor r10d, eax
+imul ecx, 54655307h
+rol ecx, 7
+mov [rsp + 8*8 + ((3+0)-((3+0)/(10))*(10))*4], r10d
+mov eax, [rsp + 8*8 + ((4+0)-((4+0)/(10))*(10))*4] ; step 4
+mov [rdi + (((4)-((4)/(4))*(4))*20 + (4/4)) * 4 + 80*4], eax
+rol eax, 8
+lea r11d, [r10d + edx]
+xor r11d, ecx
+mov [rdi + (((4)-((4)/(4))*(4))*20 + (4/4)) * 4], r11d
+mov r11d, 1
+and r11d, edx
+neg r11d
+and r11d, r10d
+xor r10d, eax
+movzx eax, al
+xor r10d, [rsi+rax*4]
+mov eax, [rsp + 8*8 + ((4+3)-((4+3)/(10))*(10))*4]
+xor r11d, [rsp + 8*8 + ((4+2)-((4+2)/(10))*(10))*4]
+add ecx, r11d
+movzx r11d, al
+shr eax, 8
+xor r10d, [rsi+1024+r11*4]
+xor r10d, eax
+imul edx, 54655307h
+rol edx, 7
+mov [rsp + 8*8 + ((4+0)-((4+0)/(10))*(10))*4], r10d
+mov eax, [rsp + 8*8 + ((5+0)-((5+0)/(10))*(10))*4] ; step 5
+mov [rdi + (((5)-((5)/(4))*(4))*20 + (5/4)) * 4 + 80*4], eax
+rol eax, 8
+lea r11d, [r10d + ecx]
+xor r11d, edx
+mov [rdi + (((5)-((5)/(4))*(4))*20 + (5/4)) * 4], r11d
+mov r11d, 1
+and r11d, ecx
+neg r11d
+and r11d, r10d
+xor r10d, eax
+movzx eax, al
+xor r10d, [rsi+rax*4]
+mov eax, [rsp + 8*8 + ((5+3)-((5+3)/(10))*(10))*4]
+xor r11d, [rsp + 8*8 + ((5+2)-((5+2)/(10))*(10))*4]
+add edx, r11d
+movzx r11d, al
+shr eax, 8
+xor r10d, [rsi+1024+r11*4]
+xor r10d, eax
+imul ecx, 54655307h
+rol ecx, 7
+mov [rsp + 8*8 + ((5+0)-((5+0)/(10))*(10))*4], r10d
+mov eax, [rsp + 8*8 + ((6+0)-((6+0)/(10))*(10))*4] ; step 6
+mov [rdi + (((6)-((6)/(4))*(4))*20 + (6/4)) * 4 + 80*4], eax
+rol eax, 8
+lea r11d, [r10d + edx]
+xor r11d, ecx
+mov [rdi + (((6)-((6)/(4))*(4))*20 + (6/4)) * 4], r11d
+mov r11d, 1
+and r11d, edx
+neg r11d
+and r11d, r10d
+xor r10d, eax
+movzx eax, al
+xor r10d, [rsi+rax*4]
+mov eax, [rsp + 8*8 + ((6+3)-((6+3)/(10))*(10))*4]
+xor r11d, [rsp + 8*8 + ((6+2)-((6+2)/(10))*(10))*4]
+add ecx, r11d
+movzx r11d, al
+shr eax, 8
+xor r10d, [rsi+1024+r11*4]
+xor r10d, eax
+imul edx, 54655307h
+rol edx, 7
+mov [rsp + 8*8 + ((6+0)-((6+0)/(10))*(10))*4], r10d
+mov eax, [rsp + 8*8 + ((7+0)-((7+0)/(10))*(10))*4] ; step 7
+mov [rdi + (((7)-((7)/(4))*(4))*20 + (7/4)) * 4 + 80*4], eax
+rol eax, 8
+lea r11d, [r10d + ecx]
+xor r11d, edx
+mov [rdi + (((7)-((7)/(4))*(4))*20 + (7/4)) * 4], r11d
+mov r11d, 1
+and r11d, ecx
+neg r11d
+and r11d, r10d
+xor r10d, eax
+movzx eax, al
+xor r10d, [rsi+rax*4]
+mov eax, [rsp + 8*8 + ((7+3)-((7+3)/(10))*(10))*4]
+xor r11d, [rsp + 8*8 + ((7+2)-((7+2)/(10))*(10))*4]
+add edx, r11d
+movzx r11d, al
+shr eax, 8
+xor r10d, [rsi+1024+r11*4]
+xor r10d, eax
+imul ecx, 54655307h
+rol ecx, 7
+mov [rsp + 8*8 + ((7+0)-((7+0)/(10))*(10))*4], r10d
+mov eax, [rsp + 8*8 + ((8+0)-((8+0)/(10))*(10))*4] ; step 8
+mov [rdi + (((8)-((8)/(4))*(4))*20 + (8/4)) * 4 + 80*4], eax
+rol eax, 8
+lea r11d, [r10d + edx]
+xor r11d, ecx
+mov [rdi + (((8)-((8)/(4))*(4))*20 + (8/4)) * 4], r11d
+mov r11d, 1
+and r11d, edx
+neg r11d
+and r11d, r10d
+xor r10d, eax
+movzx eax, al
+xor r10d, [rsi+rax*4]
+mov eax, [rsp + 8*8 + ((8+3)-((8+3)/(10))*(10))*4]
+xor r11d, [rsp + 8*8 + ((8+2)-((8+2)/(10))*(10))*4]
+add ecx, r11d
+movzx r11d, al
+shr eax, 8
+xor r10d, [rsi+1024+r11*4]
+xor r10d, eax
+imul edx, 54655307h
+rol edx, 7
+mov [rsp + 8*8 + ((8+0)-((8+0)/(10))*(10))*4], r10d
+mov eax, [rsp + 8*8 + ((9+0)-((9+0)/(10))*(10))*4] ; step 9
+mov [rdi + (((9)-((9)/(4))*(4))*20 + (9/4)) * 4 + 80*4], eax
+rol eax, 8
+lea r11d, [r10d + ecx]
+xor r11d, edx
+mov [rdi + (((9)-((9)/(4))*(4))*20 + (9/4)) * 4], r11d
+mov r11d, 1
+and r11d, ecx
+neg r11d
+and r11d, r10d
+xor r10d, eax
+movzx eax, al
+xor r10d, [rsi+rax*4]
+mov eax, [rsp + 8*8 + ((9+3)-((9+3)/(10))*(10))*4]
+xor r11d, [rsp + 8*8 + ((9+2)-((9+2)/(10))*(10))*4]
+add edx, r11d
+movzx r11d, al
+shr eax, 8
+xor r10d, [rsi+1024+r11*4]
+xor r10d, eax
+imul ecx, 54655307h
+rol ecx, 7
+mov [rsp + 8*8 + ((9+0)-((9+0)/(10))*(10))*4], r10d
+mov eax, [rsp + 8*8 + ((10+0)-((10+0)/(10))*(10))*4] ; step 10: state slot indices wrap around modulo 10 from here on
+mov [rdi + (((10)-((10)/(4))*(4))*20 + (10/4)) * 4 + 80*4], eax
+rol eax, 8
+lea r11d, [r10d + edx]
+xor r11d, ecx
+mov [rdi + (((10)-((10)/(4))*(4))*20 + (10/4)) * 4], r11d
+mov r11d, 1
+and r11d, edx
+neg r11d
+and r11d, r10d
+xor r10d, eax
+movzx eax, al
+xor r10d, [rsi+rax*4]
+mov eax, [rsp + 8*8 + ((10+3)-((10+3)/(10))*(10))*4]
+xor r11d, [rsp + 8*8 + ((10+2)-((10+2)/(10))*(10))*4]
+add ecx, r11d
+movzx r11d, al
+shr eax, 8
+xor r10d, [rsi+1024+r11*4]
+xor r10d, eax
+imul edx, 54655307h
+rol edx, 7
+mov [rsp + 8*8 + ((10+0)-((10+0)/(10))*(10))*4], r10d
+mov eax, [rsp + 8*8 + ((11+0)-((11+0)/(10))*(10))*4] ; step 11
+mov [rdi + (((11)-((11)/(4))*(4))*20 + (11/4)) * 4 + 80*4], eax
+rol eax, 8
+lea r11d, [r10d + ecx]
+xor r11d, edx
+mov [rdi + (((11)-((11)/(4))*(4))*20 + (11/4)) * 4], r11d
+mov r11d, 1
+and r11d, ecx
+neg r11d
+and r11d, r10d
+xor r10d, eax
+movzx eax, al
+xor r10d, [rsi+rax*4]
+mov eax, [rsp + 8*8 + ((11+3)-((11+3)/(10))*(10))*4]
+xor r11d, [rsp + 8*8 + ((11+2)-((11+2)/(10))*(10))*4]
+add edx, r11d
+movzx r11d, al
+shr eax, 8
+xor r10d, [rsi+1024+r11*4]
+xor r10d, eax
+imul ecx, 54655307h
+rol ecx, 7
+mov [rsp + 8*8 + ((11+0)-((11+0)/(10))*(10))*4], r10d
+mov eax, [rsp + 8*8 + ((12+0)-((12+0)/(10))*(10))*4] ; step 12
+mov [rdi + (((12)-((12)/(4))*(4))*20 + (12/4)) * 4 + 80*4], eax
+rol eax, 8
+lea r11d, [r10d + edx]
+xor r11d, ecx
+mov [rdi + (((12)-((12)/(4))*(4))*20 + (12/4)) * 4], r11d
+mov r11d, 1
+and r11d, edx
+neg r11d
+and r11d, r10d
+xor r10d, eax
+movzx eax, al
+xor r10d, [rsi+rax*4]
+mov eax, [rsp + 8*8 + ((12+3)-((12+3)/(10))*(10))*4]
+xor r11d, [rsp + 8*8 + ((12+2)-((12+2)/(10))*(10))*4]
+add ecx, r11d
+movzx r11d, al
+shr eax, 8
+xor r10d, [rsi+1024+r11*4]
+xor r10d, eax
+imul edx, 54655307h
+rol edx, 7
+mov [rsp + 8*8 + ((12+0)-((12+0)/(10))*(10))*4], r10d
+mov eax, [rsp + 8*8 + ((13+0)-((13+0)/(10))*(10))*4] ; step 13
+mov [rdi + (((13)-((13)/(4))*(4))*20 + (13/4)) * 4 + 80*4], eax
+rol eax, 8
+lea r11d, [r10d + ecx]
+xor r11d, edx
+mov [rdi + (((13)-((13)/(4))*(4))*20 + (13/4)) * 4], r11d
+mov r11d, 1
+and r11d, ecx
+neg r11d
+and r11d, r10d
+xor r10d, eax
+movzx eax, al
+xor r10d, [rsi+rax*4]
+mov eax, [rsp + 8*8 + ((13+3)-((13+3)/(10))*(10))*4]
+xor r11d, [rsp + 8*8 + ((13+2)-((13+2)/(10))*(10))*4]
+add edx, r11d
+movzx r11d, al
+shr eax, 8
+xor r10d, [rsi+1024+r11*4]
+xor r10d, eax
+imul ecx, 54655307h
+rol ecx, 7
+mov [rsp + 8*8 + ((13+0)-((13+0)/(10))*(10))*4], r10d
+mov eax, [rsp + 8*8 + ((14+0)-((14+0)/(10))*(10))*4] ; step 14
+mov [rdi + (((14)-((14)/(4))*(4))*20 + (14/4)) * 4 + 80*4], eax
+rol eax, 8
+lea r11d, [r10d + edx]
+xor r11d, ecx
+mov [rdi + (((14)-((14)/(4))*(4))*20 + (14/4)) * 4], r11d
+mov r11d, 1
+and r11d, edx
+neg r11d
+and r11d, r10d
+xor r10d, eax
+movzx eax, al
+xor r10d, [rsi+rax*4]
+mov eax, [rsp + 8*8 + ((14+3)-((14+3)/(10))*(10))*4]
+xor r11d, [rsp + 8*8 + ((14+2)-((14+2)/(10))*(10))*4]
+add ecx, r11d
+movzx r11d, al
+shr eax, 8
+xor r10d, [rsi+1024+r11*4]
+xor r10d, eax
+imul edx, 54655307h
+rol edx, 7
+mov [rsp + 8*8 + ((14+0)-((14+0)/(10))*(10))*4], r10d
+mov eax, [rsp + 8*8 + ((15+0)-((15+0)/(10))*(10))*4] ; step 15
+mov [rdi + (((15)-((15)/(4))*(4))*20 + (15/4)) * 4 + 80*4], eax
+rol eax, 8
+lea r11d, [r10d + ecx]
+xor r11d, edx
+mov [rdi + (((15)-((15)/(4))*(4))*20 + (15/4)) * 4], r11d
+mov r11d, 1
+and r11d, ecx
+neg r11d
+and r11d, r10d
+xor r10d, eax
+movzx eax, al
+xor r10d, [rsi+rax*4]
+mov eax, [rsp + 8*8 + ((15+3)-((15+3)/(10))*(10))*4]
+xor r11d, [rsp + 8*8 + ((15+2)-((15+2)/(10))*(10))*4]
+add edx, r11d
+movzx r11d, al
+shr eax, 8
+xor r10d, [rsi+1024+r11*4]
+xor r10d, eax
+imul ecx, 54655307h
+rol ecx, 7
+mov [rsp + 8*8 + ((15+0)-((15+0)/(10))*(10))*4], r10d
+mov eax, [rsp + 8*8 + ((16+0)-((16+0)/(10))*(10))*4] ; step 16
+mov [rdi + (((16)-((16)/(4))*(4))*20 + (16/4)) * 4 + 80*4], eax
+rol eax, 8
+lea r11d, [r10d + edx]
+xor r11d, ecx
+mov [rdi + (((16)-((16)/(4))*(4))*20 + (16/4)) * 4], r11d
+mov r11d, 1
+and r11d, edx
+neg r11d
+and r11d, r10d
+xor r10d, eax
+movzx eax, al
+xor r10d, [rsi+rax*4]
+mov eax, [rsp + 8*8 + ((16+3)-((16+3)/(10))*(10))*4]
+xor r11d, [rsp + 8*8 + ((16+2)-((16+2)/(10))*(10))*4]
+add ecx, r11d
+movzx r11d, al
+shr eax, 8
+xor r10d, [rsi+1024+r11*4]
+xor r10d, eax
+imul edx, 54655307h
+rol edx, 7
+mov [rsp + 8*8 + ((16+0)-((16+0)/(10))*(10))*4], r10d
+mov eax, [rsp + 8*8 + ((17+0)-((17+0)/(10))*(10))*4] ; step 17
+mov [rdi + (((17)-((17)/(4))*(4))*20 + (17/4)) * 4 + 80*4], eax
+rol eax, 8
+lea r11d, [r10d + ecx]
+xor r11d, edx
+mov [rdi + (((17)-((17)/(4))*(4))*20 + (17/4)) * 4], r11d
+mov r11d, 1
+and r11d, ecx
+neg r11d
+and r11d, r10d
+xor r10d, eax
+movzx eax, al
+xor r10d, [rsi+rax*4]
+mov eax, [rsp + 8*8 + ((17+3)-((17+3)/(10))*(10))*4]
+xor r11d, [rsp + 8*8 + ((17+2)-((17+2)/(10))*(10))*4]
+add edx, r11d
+movzx r11d, al
+shr eax, 8
+xor r10d, [rsi+1024+r11*4]
+xor r10d, eax
+imul ecx, 54655307h
+rol ecx, 7
+mov [rsp + 8*8 + ((17+0)-((17+0)/(10))*(10))*4], r10d
+mov eax, [rsp + 8*8 + ((18+0)-((18+0)/(10))*(10))*4] ; step 18
+mov [rdi + (((18)-((18)/(4))*(4))*20 + (18/4)) * 4 + 80*4], eax
+rol eax, 8
+lea r11d, [r10d + edx]
+xor r11d, ecx
+mov [rdi + (((18)-((18)/(4))*(4))*20 + (18/4)) * 4], r11d
+mov r11d, 1
+and r11d, edx
+neg r11d
+and r11d, r10d
+xor r10d, eax
+movzx eax, al
+xor r10d, [rsi+rax*4]
+mov eax, [rsp + 8*8 + ((18+3)-((18+3)/(10))*(10))*4]
+xor r11d, [rsp + 8*8 + ((18+2)-((18+2)/(10))*(10))*4]
+add ecx, r11d
+movzx r11d, al
+shr eax, 8
+xor r10d, [rsi+1024+r11*4]
+xor r10d, eax
+imul edx, 54655307h
+rol edx, 7
+mov [rsp + 8*8 + ((18+0)-((18+0)/(10))*(10))*4], r10d
+mov eax, [rsp + 8*8 + ((19+0)-((19+0)/(10))*(10))*4] ; step 19
+mov [rdi + (((19)-((19)/(4))*(4))*20 + (19/4)) * 4 + 80*4], eax
+rol eax, 8
+lea r11d, [r10d + ecx]
+xor r11d, edx
+mov [rdi + (((19)-((19)/(4))*(4))*20 + (19/4)) * 4], r11d
+mov r11d, 1
+and r11d, ecx
+neg r11d
+and r11d, r10d
+xor r10d, eax
+movzx eax, al
+xor r10d, [rsi+rax*4]
+mov eax, [rsp + 8*8 + ((19+3)-((19+3)/(10))*(10))*4]
+xor r11d, [rsp + 8*8 + ((19+2)-((19+2)/(10))*(10))*4]
+add edx, r11d
+movzx r11d, al
+shr eax, 8
+xor r10d, [rsi+1024+r11*4]
+xor r10d, eax
+imul ecx, 54655307h
+rol ecx, 7
+mov [rsp + 8*8 + ((19+0)-((19+0)/(10))*(10))*4], r10d
+add rdi, 5*4 ; 20 steps done: move 20 bytes forward in the buffer
+cmp rdi, QWORD PTR [rsp+4*8]
+jne label0 ; exact-equality exit; the chunk size is always a multiple of 20 because rsi started as 20*rcx and shrinks by 80 per chunk
+mov rax, QWORD PTR [rsp+2*8] ; rax = XOR source pointer (may be 0)
+mov r11, QWORD PTR [rsp+1*8] ; r11 = destination pointer
+lea rdi, [rsp + 8*8 + 12*4] ; rdi = start of the buffer again
+mov rsi, QWORD PTR [rsp+7*8] ; rsi = dwords in this chunk
+label1: ; one pass handles four columns of the buffer: 16 dwords in, 64 bytes out
+movdqa xmm0, [rdi+0*20*4] ; four rows of the first buffer half, 80 bytes apart
+movdqa xmm2, [rdi+2*20*4]
+movdqa xmm3, [rdi+3*20*4]
+movdqa xmm1, [rdi+1*20*4]
+movdqa xmm4, xmm0 ; pand/por/pxor network over the four rows; results end up in xmm2, xmm3, xmm1, xmm4
+pand xmm0, xmm2
+pxor xmm0, xmm3
+pxor xmm2, xmm1
+pxor xmm2, xmm0
+por xmm3, xmm4
+pxor xmm3, xmm1
+pxor xmm4, xmm2
+movdqa xmm1, xmm3
+por xmm3, xmm4
+pxor xmm3, xmm0
+pand xmm0, xmm1
+pxor xmm4, xmm0
+pxor xmm1, xmm3
+pxor xmm1, xmm4
+pxor xmm4, xmm7 ; complement xmm4 (xmm7 is all ones)
+pxor xmm2, [rdi+80*4] ; XOR with the matching rows of the second buffer half (offsets 320, 400, 480, 560)
+pxor xmm3, [rdi+80*5]
+pxor xmm1, [rdi+80*6]
+pxor xmm4, [rdi+80*7]
+cmp rsi, 16
+jl label4 ; fewer than 16 dwords left in this chunk: use the 4-dword-at-a-time path
+movdqa xmm6, xmm2 ; 4x4 dword transpose of xmm2, xmm3, xmm1, xmm4 into xmm2, xmm0, xmm6, xmm3
+punpckldq xmm2, xmm3
+movdqa xmm5, xmm1
+punpckldq xmm1, xmm4
+movdqa xmm0, xmm2
+punpcklqdq xmm2, xmm1
+punpckhqdq xmm0, xmm1
+punpckhdq xmm6, xmm3
+punpckhdq xmm5, xmm4
+movdqa xmm3, xmm6
+punpcklqdq xmm6, xmm5
+punpckhqdq xmm3, xmm5
+test rax, rax ; rax == 0: no XOR source, store the vectors directly
+jz labelSSE2_Sosemanuk_Output3
+test rax, 15 ; a 16-byte aligned source may be used as a direct pxor memory operand
+jnz labelSSE2_Sosemanuk_Output7
+pxor xmm2, [rax+0*16]
+pxor xmm0, [rax+1*16]
+pxor xmm6, [rax+2*16]
+pxor xmm3, [rax+3*16]
+add rax, 4*16
+jmp labelSSE2_Sosemanuk_Output3
+labelSSE2_Sosemanuk_Output7: ; unaligned XOR source: load with movdqu first
+movdqu xmm1, [rax+0*16]
+pxor xmm2, xmm1
+movdqu xmm1, [rax+1*16]
+pxor xmm0, xmm1
+movdqu xmm1, [rax+2*16]
+pxor xmm6, xmm1
+movdqu xmm1, [rax+3*16]
+pxor xmm3, xmm1
+add rax, 4*16
+labelSSE2_Sosemanuk_Output3:
+test r11, 15 ; choose aligned or unaligned stores based on the destination
+jnz labelSSE2_Sosemanuk_Output8
+movdqa [r11+0*16], xmm2
+movdqa [r11+1*16], xmm0
+movdqa [r11+2*16], xmm6
+movdqa [r11+3*16], xmm3
+jmp labelSSE2_Sosemanuk_Output9
+labelSSE2_Sosemanuk_Output8:
+movdqu [r11+0*16], xmm2
+movdqu [r11+1*16], xmm0
+movdqu [r11+2*16], xmm6
+movdqu [r11+3*16], xmm3
+labelSSE2_Sosemanuk_Output9:
+add r11, 4*16 ; 64 bytes written
+add rdi, 4*4 ; next four buffer columns
+sub rsi, 16
+jnz label1
+mov rsi, QWORD PTR [rsp+3*8] ; reached only after a full 80-dword chunk (smaller chunks leave 4, 8 or 12 dwords and exit through label4)
+sub rsi, 80
+jz label6 ; nothing left: write the state back and return
+mov QWORD PTR [rsp+3*8], rsi ; save remaining count and the advanced pointers for the next chunk
+mov QWORD PTR [rsp+2*8], rax
+mov QWORD PTR [rsp+1*8], r11
+jmp label2
+label4: ; partial path: emit dword 0 of xmm2, xmm3, xmm1, xmm4 (16 bytes) per pass
+test rax, rax
+jz label5 ; no XOR source
+movd xmm0, dword ptr [rax+0*4]
+pxor xmm2, xmm0
+movd xmm0, dword ptr [rax+1*4]
+pxor xmm3, xmm0
+movd xmm0, dword ptr [rax+2*4]
+pxor xmm1, xmm0
+movd xmm0, dword ptr [rax+3*4]
+pxor xmm4, xmm0
+add rax, 16
+label5:
+movd dword ptr [r11+0*4], xmm2
+movd dword ptr [r11+1*4], xmm3
+movd dword ptr [r11+2*4], xmm1
+movd dword ptr [r11+3*4], xmm4
+sub rsi, 4 ; four dwords written
+jz label6
+add r11, 16
+psrldq xmm2, 4 ; shift the next dword of each vector down into position 0
+psrldq xmm3, 4
+psrldq xmm1, 4
+psrldq xmm4, 4
+jmp label4
+label6: ; write the updated state back to the caller's buffer
+mov r10, QWORD PTR [rsp+6*8] ; r10 = state pointer saved in the prologue
+movdqa xmm0, [rsp + 8*8+0*16] ; state dwords 0-9 from the stack copy
+movdqa [r10+0*16], xmm0
+movdqa xmm0, [rsp + 8*8+1*16]
+movdqa [r10+1*16], xmm0
+movq xmm0, QWORD PTR [rsp + 8*8+2*16]
+movq QWORD PTR [r10+2*16], xmm0
+mov [r10+10*4], ecx ; the two extra state dwords kept in ecx/edx
+mov [r10+11*4], edx
+movdqa xmm6, [rsp + 02f0h] ; reload the saved xmm6/xmm7
+movdqa xmm7, [rsp + 0300h]
+add rsp, 80*4*2+12*4+8*8 + 2*16+8 ; must match the alloc_stack amount in the prologue
+pop rdi
+pop rsi
+ret
+Sosemanuk_OperateKeystream ENDP
+
+Panama_SSE2_Pull PROC FRAME ; Iterates the state block at rdx rcx times; each iteration can emit 32 bytes to r8, XORed with 32 bytes read from r9 when r9 is non-zero
+rex_push_reg rdi ; rdi is overwritten below with the loop end value, so it is pushed here and popped before ret
+alloc_stack(2*16) ; two 16-byte stack slots for the xmm6/xmm7 saves that follow
+save_xmm128 xmm6, 0h ; xmm6 and xmm7 are overwritten in the loop and reloaded from these slots at label5
+save_xmm128 xmm7, 10h
+.endprolog
+shl rcx, 5 ; rcx = iteration count * 32, because r10 advances by 32 per iteration
+jz label5 ; rcx*32 is zero: skip everything, the state is left untouched
+mov r10d, [rdx+4*17] ; r10 = state word 17, the running byte offset used to address the area at rdx+20*4. NOTE(review): r10 is never written back to word 17 in this procedure - confirm that is intended
+add rcx, r10
+mov rdi, rcx ; rdi = value of r10 at which the loop stops (start offset + count*32)
+movdqa xmm0, xmmword ptr [rdx+0*16] ; load state words 0..15 into xmm0..xmm3; movdqa faults on unaligned addresses, so rdx must be 16-byte aligned
+movdqa xmm1, xmmword ptr [rdx+1*16]
+movdqa xmm2, xmmword ptr [rdx+2*16]
+movdqa xmm3, xmmword ptr [rdx+3*16]
+mov eax, dword ptr [rdx+4*16] ; state word 16 is kept in eax for the whole loop
+label4: ; top of the per-iteration loop
+movdqa xmm6, xmm2
+movss xmm6, xmm3 ; xmm6 = xmm2 with its low dword replaced by the low dword of xmm3
+pshufd xmm5, xmm6, 0*64+3*16+2*4+1 ; rotate dwords down by one: xmm5 = { xmm2[1], xmm2[2], xmm2[3], xmm3[0] }
+movd xmm6, eax
+movdqa xmm7, xmm3
+movss xmm7, xmm6
+pshufd xmm6, xmm7, 0*64+3*16+2*4+1 ; same construction: xmm6 = { xmm3[1], xmm3[2], xmm3[3], eax }
+movd ecx, xmm2
+not ecx
+movd r11d, xmm3
+or ecx, r11d
+xor eax, ecx ; eax ^= (NOT xmm2[0]) OR xmm3[0]
+pcmpeqb xmm7, xmm7 ; xmm7 = all ones, so the following pxor yields a bitwise NOT
+pxor xmm7, xmm1
+por xmm7, xmm2
+pxor xmm7, xmm3 ; xmm7 = ((NOT xmm1) OR xmm2) XOR xmm3
+movd ecx, xmm7 ; dword 0 of the result
+rol ecx, (((((5*1) MOD (17))*(((5*1) MOD (17))+1)/2)) MOD (32)) ; rotate count is an assemble-time constant derived from the word number (1 here)
+mov [rdx+((((((5*(1)) MOD (17)))*13+16)) MOD (17))*4], ecx ; destination state word index is likewise computed from the word number at assemble time
+pshuflw xmm7, xmm7, 1*64+0*16+3*4+2 ; swap the two low dwords so dword 1 can be read with movd
+movd ecx, xmm7
+rol ecx, (((((5*5) MOD (17))*(((5*5) MOD (17))+1)/2)) MOD (32))
+mov [rdx+((((((5*(5)) MOD (17)))*13+16)) MOD (17))*4], ecx
+punpckhqdq xmm7, xmm7 ; copy the high qword down to reach dword 2
+movd ecx, xmm7
+rol ecx, (((((5*9) MOD (17))*(((5*9) MOD (17))+1)/2)) MOD (32))
+mov [rdx+((((((5*(9)) MOD (17)))*13+16)) MOD (17))*4], ecx
+pshuflw xmm7, xmm7, 1*64+0*16+3*4+2 ; swap again to reach dword 3
+movd ecx, xmm7
+rol ecx, (((((5*13) MOD (17))*(((5*13) MOD (17))+1)/2)) MOD (32))
+mov [rdx+((((((5*(13)) MOD (17)))*13+16)) MOD (17))*4], ecx
+pcmpeqb xmm7, xmm7
+pxor xmm7, xmm0
+por xmm7, xmm1
+pxor xmm7, xmm2 ; xmm7 = ((NOT xmm0) OR xmm1) XOR xmm2; its four dwords are extracted as above (word numbers 2, 6, 10, 14)
+movd ecx, xmm7
+rol ecx, (((((5*2) MOD (17))*(((5*2) MOD (17))+1)/2)) MOD (32))
+mov [rdx+((((((5*(2)) MOD (17)))*13+16)) MOD (17))*4], ecx
+pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
+movd ecx, xmm7
+rol ecx, (((((5*6) MOD (17))*(((5*6) MOD (17))+1)/2)) MOD (32))
+mov [rdx+((((((5*(6)) MOD (17)))*13+16)) MOD (17))*4], ecx
+punpckhqdq xmm7, xmm7
+movd ecx, xmm7
+rol ecx, (((((5*10) MOD (17))*(((5*10) MOD (17))+1)/2)) MOD (32))
+mov [rdx+((((((5*(10)) MOD (17)))*13+16)) MOD (17))*4], ecx
+pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
+movd ecx, xmm7
+rol ecx, (((((5*14) MOD (17))*(((5*14) MOD (17))+1)/2)) MOD (32))
+mov [rdx+((((((5*(14)) MOD (17)))*13+16)) MOD (17))*4], ecx
+pcmpeqb xmm7, xmm7
+pxor xmm7, xmm6
+por xmm7, xmm0
+pxor xmm7, xmm1 ; xmm7 = ((NOT xmm6) OR xmm0) XOR xmm1 (word numbers 3, 7, 11, 15)
+movd ecx, xmm7
+rol ecx, (((((5*3) MOD (17))*(((5*3) MOD (17))+1)/2)) MOD (32))
+mov [rdx+((((((5*(3)) MOD (17)))*13+16)) MOD (17))*4], ecx
+pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
+movd ecx, xmm7
+rol ecx, (((((5*7) MOD (17))*(((5*7) MOD (17))+1)/2)) MOD (32))
+mov [rdx+((((((5*(7)) MOD (17)))*13+16)) MOD (17))*4], ecx
+punpckhqdq xmm7, xmm7
+movd ecx, xmm7
+rol ecx, (((((5*11) MOD (17))*(((5*11) MOD (17))+1)/2)) MOD (32))
+mov [rdx+((((((5*(11)) MOD (17)))*13+16)) MOD (17))*4], ecx
+pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
+movd ecx, xmm7
+rol ecx, (((((5*15) MOD (17))*(((5*15) MOD (17))+1)/2)) MOD (32))
+mov [rdx+((((((5*(15)) MOD (17)))*13+16)) MOD (17))*4], ecx
+pcmpeqb xmm7, xmm7
+pxor xmm7, xmm5
+por xmm7, xmm6
+pxor xmm7, xmm0 ; xmm7 = ((NOT xmm5) OR xmm6) XOR xmm0 (word numbers 4, 8, 12, 16)
+movd ecx, xmm7
+rol ecx, (((((5*4) MOD (17))*(((5*4) MOD (17))+1)/2)) MOD (32))
+mov [rdx+((((((5*(4)) MOD (17)))*13+16)) MOD (17))*4], ecx
+pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
+movd ecx, xmm7
+rol ecx, (((((5*8) MOD (17))*(((5*8) MOD (17))+1)/2)) MOD (32))
+mov [rdx+((((((5*(8)) MOD (17)))*13+16)) MOD (17))*4], ecx
+punpckhqdq xmm7, xmm7
+movd ecx, xmm7
+rol ecx, (((((5*12) MOD (17))*(((5*12) MOD (17))+1)/2)) MOD (32))
+mov [rdx+((((((5*(12)) MOD (17)))*13+16)) MOD (17))*4], ecx
+pshuflw xmm7, xmm7, 1*64+0*16+3*4+2
+movd ecx, xmm7
+rol ecx, (((((5*16) MOD (17))*(((5*16) MOD (17))+1)/2)) MOD (32))
+mov [rdx+((((((5*(16)) MOD (17)))*13+16)) MOD (17))*4], ecx ; last of sixteen scalar stores; together they fill state words 0..15 in memory
+movdqa xmm4, xmm3 ; regroup the pre-update xmm0..xmm3 (still in registers) for the output and buffer steps below
+punpcklqdq xmm3, xmm2
+punpckhdq xmm4, xmm2
+movdqa xmm2, xmm1
+punpcklqdq xmm1, xmm0
+punpckhdq xmm2, xmm0
+test r8, r8
+jz label0 ; r8 == 0: no output requested, go straight to label0
+movdqa xmm6, xmm4
+punpcklqdq xmm4, xmm2
+punpckhqdq xmm6, xmm2 ; xmm4 and xmm6 now hold the 32 bytes to emit this iteration
+test r9, 15
+jnz label2 ; r9 not 16-byte aligned: use unaligned loads
+test r9, r9
+jz label1 ; r9 == 0: nothing to XOR in, write the 32 bytes as they are
+pxor xmm4, [r9] ; aligned input: XOR directly from memory
+pxor xmm6, [r9+16]
+add r9, 32
+jmp label1
+label2:
+movdqu xmm0, [r9] ; xmm0 and xmm2 serve as temporaries here; both are reloaded from memory after label0 before being read again
+movdqu xmm2, [r9+16]
+pxor xmm4, xmm0
+pxor xmm6, xmm2
+add r9, 32
+label1:
+test r8, 15
+jnz label3 ; r8 not 16-byte aligned: use unaligned stores
+movdqa xmmword ptr [r8], xmm4
+movdqa xmmword ptr [r8+16], xmm6
+add r8, 32
+jmp label0
+label3:
+movdqu xmmword ptr [r8], xmm4
+movdqu xmmword ptr [r8+16], xmm6
+add r8, 32
+label0: ; update of the 32-slot area that starts at rdx+20*4
+lea rcx, [r10 + 32]
+and rcx, 31*32 ; rcx = (r10 + 32) wrapped to one of 32 slots of 32 bytes each
+lea r11, [r10 + (32-24)*32]
+and r11, 31*32 ; r11 = (r10 + 8*32), wrapped the same way
+movdqa xmm0, xmmword ptr [rdx+20*4+rcx+0*8] ; read-modify-write of both halves of slot rcx and slot r11
+pxor xmm3, xmm0
+pshufd xmm0, xmm0, 2*64+3*16+0*4+1 ; swap the two dwords inside each qword of the old slot value
+movdqa xmmword ptr [rdx+20*4+rcx+0*8], xmm3
+pxor xmm0, xmmword ptr [rdx+20*4+r11+2*8]
+movdqa xmmword ptr [rdx+20*4+r11+2*8], xmm0
+movdqa xmm4, xmmword ptr [rdx+20*4+rcx+2*8]
+pxor xmm1, xmm4
+movdqa xmmword ptr [rdx+20*4+rcx+2*8], xmm1
+pxor xmm4, xmmword ptr [rdx+20*4+r11+0*8]
+movdqa xmmword ptr [rdx+20*4+r11+0*8], xmm4
+movdqa xmm3, xmmword ptr [rdx+3*16] ; reload xmm0..xmm3 from state words 0..15, which now hold the rotated words stored earlier in this iteration
+movdqa xmm2, xmmword ptr [rdx+2*16]
+movdqa xmm1, xmmword ptr [rdx+1*16]
+movdqa xmm0, xmmword ptr [rdx+0*16]
+movd xmm6, eax ; eax is captured here, before it is modified below
+movdqa xmm7, xmm3
+movss xmm7, xmm6
+movdqa xmm6, xmm2
+movss xmm6, xmm3
+movdqa xmm5, xmm1
+movss xmm5, xmm2
+movdqa xmm4, xmm0
+movss xmm4, xmm1
+pshufd xmm7, xmm7, 0*64+3*16+2*4+1 ; xmm7 = { xmm3[1], xmm3[2], xmm3[3], eax }
+pshufd xmm6, xmm6, 0*64+3*16+2*4+1 ; xmm6 = { xmm2[1], xmm2[2], xmm2[3], xmm3[0] }
+pshufd xmm5, xmm5, 0*64+3*16+2*4+1 ; xmm5 = { xmm1[1], xmm1[2], xmm1[3], xmm2[0] }
+pshufd xmm4, xmm4, 0*64+3*16+2*4+1 ; xmm4 = { xmm0[1], xmm0[2], xmm0[3], xmm1[0] }
+xor eax, 1
+movd ecx, xmm0
+xor eax, ecx
+movd ecx, xmm3
+xor eax, ecx ; eax = eax XOR 1 XOR xmm0[0] XOR xmm3[0]
+pxor xmm3, xmm2 ; XOR each vector with the next lower-numbered one ...
+pxor xmm2, xmm1
+pxor xmm1, xmm0
+pxor xmm0, xmm7 ; ... and with the dword-rotated copies built above
+pxor xmm3, xmm7
+pxor xmm2, xmm6
+pxor xmm1, xmm5
+pxor xmm0, xmm4
+lea rcx, [r10 + (32-4)*32]
+and rcx, 31*32 ; rcx = (r10 + 28*32), wrapped to a slot offset
+lea r11, [r10 + 16*32]
+and r11, 31*32 ; r11 = (r10 + 16*32), wrapped to a slot offset
+movdqa xmm4, xmmword ptr [rdx+20*4+rcx+0*16]
+movdqa xmm5, xmmword ptr [rdx+20*4+r11+0*16]
+movdqa xmm6, xmm4
+punpcklqdq xmm4, xmm5 ; xmm4 = low qwords of the two slots' first halves
+punpckhqdq xmm6, xmm5 ; xmm6 = high qwords of the two slots' first halves
+pxor xmm3, xmm4
+pxor xmm2, xmm6
+movdqa xmm4, xmmword ptr [rdx+20*4+rcx+1*16]
+movdqa xmm5, xmmword ptr [rdx+20*4+r11+1*16]
+movdqa xmm6, xmm4
+punpcklqdq xmm4, xmm5 ; same interleave for the second halves of the two slots
+punpckhqdq xmm6, xmm5
+pxor xmm1, xmm4
+pxor xmm0, xmm6
+add r10, 32 ; advance the running offset by one slot
+cmp r10, rdi
+jne label4 ; repeat until r10 reaches the end value computed before the loop
+mov [rdx+4*16], eax ; write the working state back: word 16 from eax, words 0..15 from xmm0..xmm3
+movdqa xmmword ptr [rdx+3*16], xmm3
+movdqa xmmword ptr [rdx+2*16], xmm2
+movdqa xmmword ptr [rdx+1*16], xmm1
+movdqa xmmword ptr [rdx+0*16], xmm0
+label5: ; epilogue; also reached directly when rcx*32 is zero
+movdqa xmm6, [rsp + 0h] ; reload the xmm6/xmm7 values saved in the prologue
+movdqa xmm7, [rsp + 10h]
+add rsp, 2*16
+pop rdi
+ret
+Panama_SSE2_Pull ENDP
+
+_TEXT ENDS
+END