/* -rw-r--r-- 24544 high-ctidh-20210523/fp2048.S */
/* DO NOT EDIT! generated by ./autogen */
.intel_syntax noprefix
#include "uintbig_namespace.h"
#include "fp_namespace.h"
/*
 * Read-only constants for the field arithmetic below.  The modulus itself is
 * not stored here; the code reads it from the external symbol uintbig_p.
 * Multi-word values are arrays of 64-bit words; the routines in this file
 * start their carry/borrow chains at offset 0, i.e. the word at the lowest
 * address is the least significant one.
 */
.section .rodata
/* Not referenced by the code shown in this file; presumably the bit length
   of the modulus -- confirm against the generator. */
.set pbits,2047
/* Size in bytes of one operand; fp_sub3 reserves this much stack scratch. */
.set pbytes,256
/* Number of 64-bit words per operand; drives fp_copy and the .rept loops. */
.set plimbs,32
/* Multiplied into the low accumulator word by MULSTEP to obtain the factor
   whose product with uintbig_p is then added to the accumulator. */
.inv_min_p_mod_r: /* -p^-1 mod 2^64 */
.quad 0x22249689c54c7495
/* 256 zero bytes: one operand with every word equal to zero. */
.global fp_0
fp_0:
.zero 256
/* With the 2^2048 scaling used by fp_mul3, this is the encoding of 1. */
.global fp_1
fp_1: /* 2^2048 mod p */
.quad 0x994d7dbe41f62437, 0x6aaf42d975b174b6, 0x3f037f5ba7c4a965, 0x5ccaed897fd53a00
.quad 0xd2973e879030fb33, 0x08c3a6b0fcf19681, 0x33301470a926eefd, 0x33e715b0a4a9b9e9
.quad 0x8737cc516cf9ace5, 0xf5464238325eccd4, 0x393cd9de4f760e82, 0x059880446fb9a315
.quad 0x8b19e3b333b22e4a, 0x65ac4ae7830805fa, 0xd71b975ca89c8fcd, 0x37314ebe2cf1f23b
.quad 0x565f6b8c9e61cfb9, 0x87712cf7de06573f, 0x6d8736050fb35ad2, 0xe3efa60224957edb
.quad 0x444a4fc8b855012d, 0xac7f2394665a0905, 0xcff83c43b74af366, 0x167df91c271503fd
.quad 0xd70947c16f7fc287, 0x65069931a3a5d5b7, 0xf713ec84671a7fce, 0x6c8a0b9c659af905
.quad 0x6600692af35042c7, 0x17670145e45b2b04, 0x38030a4d47b3b374, 0x355309fecf901ad2
/* Same scaling applied to 2. */
.global fp_2
fp_2: /* 2^2049 mod p */
.quad 0xbb0a256699e8ff2b, 0xf8ee46a6129e1054, 0xe85d7e8087758b41, 0xd8842a40d4f18755
.quad 0x40b63c91a5c79f77, 0x69c884f24e33b484, 0x221ada5c355ad84e, 0x23c5dd46d58c0720
.quad 0xe62cdcbdfe46936c, 0x91a3efd87587ddef, 0xdae351b164137731, 0xb7b92b4a5a067c86
.quad 0x99e7134ccdf516ac, 0x4292041c31bd6348, 0xa095b682dec2a4df, 0xd61db7bbbe348a8b
.quad 0xc98950481c398f5a, 0x915d68ed060ecb93, 0xaa3b7e0bcf4d2940, 0xbe84835a555cd2aa
.quad 0xf4ad64d458c65815, 0x927dfdaf997cbfb6, 0x3a988c9e010437ef, 0xdf25efec5b310950
.quad 0xf5c05218aed4c5e5, 0x9664bac92882f2ac, 0x95d927df9b3dd4e1, 0xfd421b1797beefb8
.quad 0xee00f56437bb467b, 0xe145ada314d4b9b4, 0xd8071809a74df80e, 0x271717528efae93f
/* File-local label; no code in this file refers to it.  NOTE(review):
   presumably the factor used to convert a plain integer into the scaled
   form via fp_mul3 -- confirm where it is consumed. */
.r_squared_mod_p: /* (2^2048)^2 mod p */
.quad 0x6e940162ecb00f8c, 0x24c744036302b024, 0xc892a6ce3f16637b, 0x37733ccc6ac611b0
.quad 0xe5361a2cfb50bcf7, 0xe06f5b5f3f269a1f, 0x67d1d25e92181152, 0xab5a12bd6902fcfc
.quad 0x82c060ea7daaebd0, 0x22029cf3781c2b9e, 0x0cfdf6fb51053d83, 0xd0af69954ce04fb3
.quad 0x015a5254e7ef0c6d, 0x834839d5541a461a, 0x6768d972de4e269a, 0x71a06dcb2a0a7a7f
.quad 0x0e11b7d51a8ef22f, 0x9d655dcdeb3d4934, 0x28dfab822c934ea8, 0x60a2ba69a0862d99
.quad 0xf1286b89b68ee540, 0xb16c9ae335599258, 0x4a30cb66a64be15f, 0xd021ee65d62b98ce
.quad 0xae101149f5e60533, 0x2ad923e56bee4dc3, 0x5ccb61e26b6f93b5, 0xa4ee09d81cc595af
.quad 0xdc3497d89520ca1f, 0x2ab20fe509190878, 0xcb8fab09772c236e, 0x2dc5652f8085f6a5
/*
 * Exported operation counters.  Each is one 64-bit word that the arithmetic
 * entry points bump with an ordinary (non-LOCK) read-modify-write add, so
 * concurrent callers can lose increments.
 */
.section .data
/* +1 on every entry to fp_mul3; fp_mul2, fp_sq1 and fp_sq2 all end up there. */
.global fp_mulsq_count
fp_mulsq_count:
.quad 0
/* +1 on every entry to fp_sq2 (fp_sq1 falls through into it). */
.global fp_sq_count
fp_sq_count:
.quad 0
/* +1 on every entry to fp_add3 or fp_sub3 (reached by fp_add2/fp_sub2 too). */
.global fp_addsub_count
fp_addsub_count:
.quad 0
.section .text
.p2align 4,,15
/*
 * fp_copy: copy one operand of plimbs 64-bit words.
 *   in:  rdi = destination, rsi = source
 *        (System V argument order; the C prototype is not visible here)
 *   out: destination words overwritten in ascending address order
 *   clobbers: rcx (left at 0), rsi and rdi (both advanced past the data);
 *             the direction flag is cleared
 */
.global fp_copy
fp_copy:
mov ecx, plimbs /* writing ecx zero-extends, so rcx = plimbs */
cld /* make the string move walk upwards */
rep movsq
ret
/*
 * fp_cmov: branch-free conditional move of one operand.
 *   in:  rdi = destination, rsi = source, dl = condition
 *   Only the low byte of rdx is looked at.  1 copies every word of the
 *   source over the destination, 0 leaves the destination as it was; any
 *   other byte value yields a partial mask, so callers must pass 0 or 1.
 *   Every destination word is written in both cases.
 *   clobbers: rax, rcx, flags
 */
.global fp_cmov
fp_cmov:
movzx eax, dl
neg rax /* 0 stays 0, 1 becomes all ones */
.set k, 0
.rept plimbs
mov rcx, [rsi + 8*k]
xor rcx, [rdi + 8*k] /* rcx = dst ^ src */
and rcx, rax /* keep the difference only when selected */
xor [rdi + 8*k], rcx /* dst ^= (dst ^ src) & mask */
.set k, k+1
.endr
ret
/*
 * fp_cswap: branch-free conditional swap of two operands.
 *   in:  rdi = first operand, rsi = second operand, dl = condition
 *   Only the low byte of rdx is looked at.  1 exchanges the two operands
 *   word by word, 0 leaves both as they were; callers must pass 0 or 1.
 *   Both operands are written in either case.
 *   clobbers: rax, rcx, flags
 */
.global fp_cswap
fp_cswap:
movzx eax, dl
neg rax /* 0 stays 0, 1 becomes all ones */
.set k, 0
.rept plimbs
mov rcx, [rdi + 8*k]
xor rcx, [rsi + 8*k] /* rcx = a ^ b */
and rcx, rax /* zero unless the swap is selected */
xor [rdi + 8*k], rcx /* a ^= delta */
xor [rsi + 8*k], rcx /* b ^= delta */
.set k, k+1
.endr
ret
/*
 * .reduce_once: file-local helper, entered by jmp from fp_add3 and fp_mul3,
 * so its ret returns to their caller.
 *   in:  rdi = address of a 32-word value x
 *   out: the value at that address becomes x - uintbig_p when the
 *        subtraction does not borrow; otherwise x is left unchanged.
 *        Selection is done with a mask, not a branch, and every word of x
 *        is rewritten in both cases.
 *   clobbers: rax, rcx, rdx, rsi, rdi, r8-r11, flags (rbp is saved/restored)
 *   stack:    192 bytes of scratch for words 0..23 of the difference;
 *             words 24..31 are kept in registers.
 * Word i always uses register number (i mod 8) of the cycle
 * rdi, rsi, rdx, rcx, r8, r9, r10, r11.  Nothing between the sub and the
 * final sbb may touch CF: only mov, sbb and assembler directives appear.
 */
.reduce_once:
push rbp
sub rsp, 192
mov rbp, rdi /* rdi is reused as a data register below */
/* word 0 starts the borrow chain */
mov rdi, [rbp + 0]
sub rdi, [rip + uintbig_p + 0]
movq [rsp + 0], rdi
/* words 1..23: continue the chain, park each difference word on the stack */
.set k, 1
.irp reg, rsi, rdx, rcx, r8, r9, r10, r11, rdi, rsi, rdx, rcx, r8, r9, r10, r11, rdi, rsi, rdx, rcx, r8, r9, r10, r11
mov \reg, [rbp + 8*k]
sbb \reg, [rip + uintbig_p + 8*k]
movq [rsp + 8*k], \reg
.set k, k+1
.endr
/* words 24..31: the difference stays in the eight registers */
.irp reg, rdi, rsi, rdx, rcx, r8, r9, r10, r11
mov \reg, [rbp + 8*k]
sbb \reg, [rip + uintbig_p + 8*k]
.set k, k+1
.endr
/* rax = all ones when there was no final borrow, else 0 */
setnc al
movzx rax, al
neg rax
/* cswap2 r, m: m becomes r when rax is all ones, is rewritten unchanged
   when rax is 0; r is destroyed. */
.macro cswap2, r, m
xor \r, \m
and \r, rax
xor \m, \r
.endm
/* high words first, while their differences are still in registers */
.set k, 24
.irp reg, rdi, rsi, rdx, rcx, r8, r9, r10, r11
cswap2 \reg, [rbp+8*k]
.set k, k+1
.endr
/* then the 24 parked words, reloaded one at a time */
.set k, 0
.irp reg, rdi, rsi, rdx, rcx, r8, r9, r10, r11, rdi, rsi, rdx, rcx, r8, r9, r10, r11, rdi, rsi, rdx, rcx, r8, r9, r10, r11
movq \reg, [rsp + 8*k]
cswap2 \reg, [rbp+8*k]
.set k, k+1
.endr
add rsp, 192
pop rbp
ret
/*
 * fp_add2 / fp_add3: addition followed by one conditional subtraction of
 * the modulus.
 *   fp_add2: rdi = x, rsi = y.  Sets rdx = rdi and falls into fp_add3, so
 *            x is both the output and the third argument.
 *   fp_add3: rdi = output, rsi and rdx = inputs.
 * Increments fp_addsub_count, calls the external uintbig_add3 with the
 * three pointers unchanged, then tail-jumps to .reduce_once with rdi
 * restored to the output pointer.  The value uintbig_add3 returns in rax
 * is not used.
 */
.global fp_add2
fp_add2:
mov rdx, rdi
.global fp_add3
fp_add3:
add qword ptr [rip + fp_addsub_count], 1
push rdi /* rdi is caller-saved: keep the output pointer across the call */
call uintbig_add3
pop rdi
jmp .reduce_once /* tail call; its ret returns to our caller */
/*
 * fp_sub2 / fp_sub3: subtraction followed by a masked add-back of the
 * modulus.
 *   fp_sub2: rdi = x, rsi = y.  Rearranged to (rdi, rsi, rdx) = (x, x, y)
 *            before falling into fp_sub3.
 *   fp_sub3: rdi = output, rsi and rdx = inputs, passed unchanged to the
 *            external uintbig_sub3.
 * The value uintbig_sub3 returns in rax is negated and used as a mask; this
 * assumes it is 0 or 1 (taken to be the borrow -- defined outside this
 * file, confirm there).  The mask is ANDed with every word of uintbig_p and
 * the result is added to the output, so the add-back happens without a
 * branch.
 *   returns:  rax = the mask
 *   clobbers: rcx, flags, plus whatever uintbig_sub3 clobbers
 *   stack:    pbytes of scratch below the return address
 */
.global fp_sub2
fp_sub2:
mov rdx, rsi /* third argument = y */
mov rsi, rdi /* second argument = x */
.global fp_sub3
fp_sub3:
add qword ptr [rip + fp_addsub_count], 1
push rdi /* keep the output pointer across the call */
call uintbig_sub3
pop rdi
neg rax /* 1 becomes all ones, 0 stays 0 */
sub rsp, pbytes
/* Pass 1: stage (uintbig_p AND mask) on the stack.  `and` rewrites CF, so
   the masking cannot be interleaved with the carry chain of pass 2. */
.set k, 0
.rept plimbs
mov rcx, [rip + uintbig_p + 8*k]
and rcx, rax
mov [rsp + 8*k], rcx
.set k, k+1
.endr
/* Pass 2: output += staged value; add opens the chain, adc continues it. */
mov rcx, [rsp + 0]
add [rdi + 0], rcx
.set k, 1
.rept plimbs-1
mov rcx, [rsp + 8*k]
adc [rdi + 8*k], rcx
.set k, k+1
.endr
add rsp, pbytes
ret
/* Montgomery arithmetic */
/*
 * fp_mul2 / fp_mul3: word-serial Montgomery multiplication, followed by one
 * conditional subtraction of the modulus (.reduce_once).
 *   fp_mul2: rdi = x, rsi = y.  Sets rdx = rdi and falls into fp_mul3, so
 *            x is both the output and the second factor.
 *   fp_mul3: rdi = output, rsi = factor a, rdx = factor b.
 * Needs the BMI2 (mulx) and ADX (adcx/adox) instruction set extensions.
 * Increments fp_mulsq_count on every entry.
 *
 * Stack frame after the prologue (272 bytes below the two pushes):
 *   [rsp+0 .. rsp+263]  33-word accumulator, used as a ring (see MULSTEP)
 *   [rsp+264]           saved output pointer
 * The output is written only after all rounds are done, so it may alias
 * either input.
 *
 * Register roles inside the rounds:
 *   rdi = a, rsi = b, rdx = current multiplier, r11 = accumulator word,
 *   rax = low product half (or zero), rbx/rcx = alternating high halves.
 * rbp is pushed and popped but never modified here; the push is kept so the
 * frame layout stays as it was.
 */
.global fp_mul2
fp_mul2:
mov rdx, rdi
.global fp_mul3
fp_mul3:
push rbp
push rbx
add qword ptr [rip + fp_mulsq_count], 1
sub rsp,272
mov [rsp+264],rdi
mov rdi,rsi
mov rsi,rdx
/* XXX: put directly into output */
/* clear all plimbs+1 accumulator words */
xor rax,rax
.set k, 0
.rept plimbs+1
mov [rsp+8*k],rax
.set k, k+1
.endr

/*
 * Assembly-time state shared by the helper macros below:
 *   mslot = byte offset from rsp of the accumulator word being updated
 *   mlimb = index of the source word being multiplied
 */

/* Step mslot to the next accumulator word, wrapping after word plimbs. */
.macro MUL_NEXT_SLOT
.set mslot, mslot+8
.if mslot == 8*plimbs+8
.set mslot, 0
.endif
.endm

/*
 * One inner row of "accumulator += rdx * uintbig_p" for word mlimb >= 1.
 * Adds the previous high half through the CF chain (adcx) and the new low
 * half through the OF chain (adox).  Odd words put their high half in rcx
 * and consume rbx, even words the other way round.
 */
.macro MUL_ROW_P
MUL_NEXT_SLOT
mov r11,[rsp+mslot]
.if mlimb & 1
mulx rcx, rax, [rip + uintbig_p + 8*mlimb]
adcx r11, rbx
.else
mulx rbx, rax, [rip + uintbig_p + 8*mlimb]
adcx r11, rcx
.endif
adox r11, rax
mov [rsp+mslot],r11
.set mlimb, mlimb+1
.endm

/* Same row shape for "accumulator += rdx * b", b being addressed by rsi. */
.macro MUL_ROW_B
MUL_NEXT_SLOT
mov r11,[rsp+mslot]
.if mlimb & 1
mulx rcx, rax, [rsi + 8*mlimb]
adcx r11, rbx
.else
mulx rbx, rax, [rsi + 8*mlimb]
adcx r11, rcx
.endif
adox r11, rax
mov [rsp+mslot],r11
.set mlimb, mlimb+1
.endm

/*
 * Fold both pending carries into the top accumulator word.  rax is zeroed
 * with mov because xor would wipe the CF and OF chains that are still live.
 * The last row (word plimbs-1, odd) left its high half in rcx.
 */
.macro MUL_ROW_TOP
MUL_NEXT_SLOT
mov r11,[rsp+mslot]
mov rax, 0
adcx r11, rcx
adox r11, rax
mov [rsp+mslot],r11
.endm

/*
 * MULSTEP k: round k of the multiplication.
 * In round k accumulator word j lives at byte offset 8*((k+1+j) mod 33).
 * The round leaves its lowest word equal to zero, and that slot becomes the
 * top word of round k+1, which is what shifts the accumulator down by one
 * word without moving any data.
 */
.macro MULSTEP, k
/* rdx = (acc[0] + low(a[k] * b[0])) * (-p^-1) mod 2^64 */
.set mslot, 8*\k+8
mov r11,[rsp+mslot]
mov rdx, [rsi + 0]
mulx rcx, rdx, [rdi + 8*\k]
add rdx, r11
mulx rcx, rdx, [rip + .inv_min_p_mod_r]
/* accumulator += rdx * uintbig_p */
xor rax, rax /* clear flags */
mulx rbx, rax, [rip + uintbig_p + 0]
adox r11, rax
mov [rsp+mslot],r11
.set mlimb, 1
.rept plimbs-1
MUL_ROW_P
.endr
MUL_ROW_TOP
/* accumulator += a[k] * b */
mov rdx, [rdi + 8*\k]
xor rax, rax /* clear flags */
.set mslot, 8*\k+8
mov r11,[rsp+mslot]
mulx rbx, rax, [rsi + 0]
adox r11, rax
mov [rsp+mslot],r11
.set mlimb, 1
.rept plimbs-1
MUL_ROW_B
.endr
MUL_ROW_TOP
.endm

/* one round per word of a */
.set mround, 0
.rept plimbs
MULSTEP mround
.set mround, mround+1
.endr

/* After the last round the result occupies offsets 0..248; copy it out. */
mov rdi,[rsp+264]
.set k, 0
.rept plimbs
mov r11,[rsp+8*k]
mov [rdi+8*k],r11
.set k, k+1
.endr
add rsp,272
pop rbx
pop rbp
jmp .reduce_once /* tail call with rdi = output pointer */
/*
 * fp_sq1 / fp_sq2: squaring, implemented as a multiplication of the input
 * by itself.
 *   fp_sq1: rdi = x.  Sets rsi = rdi and falls into fp_sq2 (x is squared
 *           in place).
 *   fp_sq2: rdi = output, rsi = input.  Sets rdx = rsi, increments
 *           fp_sq_count and tail-jumps to fp_mul3, which in turn increments
 *           fp_mulsq_count; a squaring is therefore counted in both.
 */
.global fp_sq1
fp_sq1:
mov rsi, rdi
.global fp_sq2
fp_sq2:
/* TODO implement optimized Montgomery squaring */
mov rdx, rsi
add qword ptr [rip + fp_sq_count], 1
jmp fp_mul3
#if defined(__ELF__)
/*
 * Declare that nothing in this object needs an executable stack.  An
 * assembly object without this marker makes GNU ld assume the opposite for
 * the whole link (and warn about it in recent binutils).  pushsection and
 * popsection leave the current section exactly as it was.
 */
.pushsection .note.GNU-stack,"",@progbits
.popsection
#endif