/*
// references:
// https://blog.cloudflare.com/go-crypto-bridging-the-performance-gap/
// https://github.com/vkrasnov/openssl/blob/vlad/inv_ord/crypto/ec/asm/ecp_nistz256-x86_64.pl
// https://go-review.googlesource.com/#/c/8968/
// https://groups.google.com/forum/#!msg/golang-codereviews/m5QTnSUZU6c/Q5RUAdefWUwJ
// https://github.com/openssl/openssl/pull/263/files
//
//
Portions Copyright 2015 The Go Authors. All rights reserved.
Use of this source code is governed by the following
BSD-style license (from the Go distribution's LICENSE file):

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

   * Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
   * Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
   * Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

##############################################################################
# #
# Portions Copyright (c) 2015 Intel Corporation #
# Portions Copyright (c) 2015 CloudFlare, Inc. #
# All rights reserved. #
# #
# This software is made available to you under your choice of the #
# Apache V.2.0 and/or BSD license below: #
# #
##############################################################################
# #
# Licensed under the Apache License, Version 2.0 (the "License"); #
# you may not use this file except in compliance with the License. #
# You may obtain a copy of the License at #
# #
# http://www.apache.org/licenses/LICENSE-2.0 #
# #
# Unless required by applicable law or agreed to in writing, software #
# distributed under the License is distributed on an "AS IS" BASIS, #
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
# See the License for the specific language governing permissions and #
# limitations under the License. #
# #
##############################################################################
# #
# Copyright (c) 2015 Intel Corporation #
# Copyright (c) 2015 CloudFlare, Inc. #
# All rights reserved. #
# #
# This software is made available to you under your choice of the #
# Apache V.2.0 and/or BSD license below: #
# #
##############################################################################
# #
# Licensed under the Apache License, Version 2.0 (the "License"); #
# you may not use this file except in compliance with the License. #
# You may obtain a copy of the License at #
# #
# http://www.apache.org/licenses/LICENSE-2.0 #
# #
# Unless required by applicable law or agreed to in writing, software #
# distributed under the License is distributed on an "AS IS" BASIS, #
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. #
# See the License for the specific language governing permissions and #
# limitations under the License. #
# #
##############################################################################
# #
# Redistribution and use in source and binary forms, with or without #
# modification, are permitted provided that the following conditions are #
# met: #
# #
# # Redistributions of source code must retain the above copyright #
# notice, this list of conditions and the following disclaimer. #
# #
# # Redistributions in binary form must reproduce the above copyright #
# notice, this list of conditions and the following disclaimer in the #
# documentation and/or other materials provided with the #
# distribution. #
# #
# # Neither the name of the copyright holders nor the names of its #
# contributors may be used to endorse or promote products derived from #
# this software without specific prior written permission. #
# #
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS #
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED #
# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR #
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR #
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, #
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR #
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF #
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS #
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
# #
##############################################################################
# #
# Developers and authors: #
# Shay Gueron (1, 2), and Vlad Krasnov (1, 3) #
# (1) Intel Corporation, Israel Development Center #
# (2) University of Haifa #
# (3) CloudFlare, Inc. #
# Reference: #
# S. Gueron and V. Krasnov, "Fast Prime Field Elliptic Curve Cryptography with #
# 256 Bit Primes" #
# https://eprint.iacr.org/2013/816.pdf #
##############################################################################

# Further optimization by <appro@openssl.org>:
#
#			this/original	with/without -DECP_NISTZ256_ASM(*)
# Opteron		+12-49%		+110-150%
# Bulldozer		+14-45%		+175-210%
# P4			+18-46%		n/a :-(
# Westmere		+12-34%		+80-87%
# Sandy Bridge		+9-35%		+110-120%
# Ivy Bridge		+9-35%		+110-125%
# Haswell		+8-37%		+140-160%
# Broadwell		+18-58%		+145-210%
# Atom			+15-50%		+130-180%
# VIA Nano		+43-160%	+300-480%
#
# (*) "without -DECP_NISTZ256_ASM" refers to build with
# "enable-ec_nistp_64_gcc_128";
#
# Ranges denote minimum and maximum improvement coefficients depending
# on benchmark. Lower coefficients are for ECDSA sign, relatively fastest
# server-side operation. Keep in mind that +100% means 2x improvement.
*/
// This file contains a constant-time, 64-bit assembly implementation of
// P-256. The optimizations performed here are described in detail in:
// S. Gueron and V. Krasnov, "Fast prime field elliptic-curve cryptography with
// 256-bit primes"
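//
// Unless noted otherwise, field elements are assumed to be in Montgomery
// form: a value a is stored as a*R mod p with R = 2^256, so p256Mul and
// p256Sqr return products scaled by R^-1, and p256FromMont strips the
// factor of R again.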

#include "textflag.h"

#define res_ptr DI
#define x_ptr SI
#define y_ptr CX

#define acc0 R8
#define acc1 R9
#define acc2 R10
#define acc3 R11
#define acc4 R12
#define acc5 R13
#define t0 R14
#define t1 R15

DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff
DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001
DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f
DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551
DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000
DATA p256one<>+0x00(SB)/8, $0x0000000000000001
DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
GLOBL p256const0<>(SB), 8, $8
GLOBL p256const1<>(SB), 8, $8
GLOBL p256ordK0<>(SB), 8, $8
GLOBL p256ord<>(SB), 8, $32
GLOBL p256one<>(SB), 8, $32
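
// For reference: p256const0 and p256const1 are the second and fourth 64-bit
// limbs of p = 2^256 - 2^224 + 2^192 + 2^96 - 1, p256ord holds the group
// order n, p256ordK0 is the Montgomery constant -n^-1 mod 2^64, and p256one
// is 2^256 mod p, i.e. 1 in Montgomery form.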

// ---------------------------------------
// func p256LittleToBig(res []byte, in []uint64)
TEXT ·p256LittleToBig(SB), NOSPLIT, $0
	JMP ·p256BigToLittle(SB)
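
	// The conversion is a pure limb/byte swap and so is its own inverse, and
	// the []byte and []uint64 slice headers share the same layout, so the FP
	// offsets in p256BigToLittle line up for either signature.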

// ---------------------------------------
// func p256BigToLittle(res []uint64, in []byte)
TEXT ·p256BigToLittle(SB), NOSPLIT, $0
	MOVQ res+0(FP), res_ptr
	MOVQ in+24(FP), x_ptr

	MOVQ (8*0)(x_ptr), acc0
	MOVQ (8*1)(x_ptr), acc1
	MOVQ (8*2)(x_ptr), acc2
	MOVQ (8*3)(x_ptr), acc3

	BSWAPQ acc0
	BSWAPQ acc1
	BSWAPQ acc2
	BSWAPQ acc3

	MOVQ acc3, (8*0)(res_ptr)
	MOVQ acc2, (8*1)(res_ptr)
	MOVQ acc1, (8*2)(res_ptr)
	MOVQ acc0, (8*3)(res_ptr)

	RET
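
// An equivalent Go sketch (for illustration only): the four 64-bit limbs are
// reversed and each limb is byte-swapped, i.e.
//
//	for i := 0; i < 4; i++ {
//		res[i] = binary.BigEndian.Uint64(in[8*(3-i):])
//	}
//
// with the roles of the byte and limb slices exchanged for p256LittleToBig.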

// ---------------------------------------
// func p256MovCond(res, a, b []uint64, cond int)
// If cond == 0 res=b, else res=a
TEXT ·p256MovCond(SB), NOSPLIT, $0
	MOVQ res+0(FP), res_ptr
	MOVQ a+24(FP), x_ptr
	MOVQ b+48(FP), y_ptr
	MOVL cond+72(FP), t0

	MOVL t0, X12
	PXOR X13, X13
	PSHUFB X13, X12
	PCMPEQB X13, X12

	MOVOU X12, X0
	PANDN (16*0)(x_ptr), X0
	MOVOU X12, X1
	PANDN (16*1)(x_ptr), X1
	MOVOU X12, X2
	PANDN (16*2)(x_ptr), X2
	MOVOU X12, X3
	PANDN (16*3)(x_ptr), X3
	MOVOU X12, X4
	PANDN (16*4)(x_ptr), X4
	MOVOU X12, X5
	PANDN (16*5)(x_ptr), X5

	MOVOU (16*0)(y_ptr), X6
	MOVOU (16*1)(y_ptr), X7
	MOVOU (16*2)(y_ptr), X8
	MOVOU (16*3)(y_ptr), X9
	MOVOU (16*4)(y_ptr), X10
	MOVOU (16*5)(y_ptr), X11

	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5

	MOVOU X0, (16*0)(res_ptr)
	MOVOU X1, (16*1)(res_ptr)
	MOVOU X2, (16*2)(res_ptr)
	MOVOU X3, (16*3)(res_ptr)
	MOVOU X4, (16*4)(res_ptr)
	MOVOU X5, (16*5)(res_ptr)

	RET
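
// In effect (illustrative sketch only): PCMPEQB turns cond into a full-width
// mask and every 16-byte chunk is selected without a branch, roughly
//
//	mask := uint64(0)
//	if cond == 0 {
//		mask = ^uint64(0)
//	}
//	for i := range res {
//		res[i] = (a[i] &^ mask) | (b[i] & mask)
//	}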

// ---------------------------------------
// func p256NegCond(val []uint64, cond int)
TEXT ·p256NegCond(SB), NOSPLIT, $0
	MOVQ val+0(FP), res_ptr
	MOVL cond+24(FP), t0

	// acc = poly
	MOVQ $-1, acc0
	MOVQ p256const0<>(SB), acc1
	MOVQ $0, acc2
	MOVQ p256const1<>(SB), acc3

	// Load the original value
	MOVQ (8*0)(res_ptr), acc5
	MOVQ (8*1)(res_ptr), x_ptr
	MOVQ (8*2)(res_ptr), y_ptr
	MOVQ (8*3)(res_ptr), t1

	// Speculatively subtract
	SUBQ acc5, acc0
	SBBQ x_ptr, acc1
	SBBQ y_ptr, acc2
	SBBQ t1, acc3

	// If condition is 0, keep original value
	TESTQ t0, t0
	CMOVQEQ acc5, acc0
	CMOVQEQ x_ptr, acc1
	CMOVQEQ y_ptr, acc2
	CMOVQEQ t1, acc3

	// Store result
	MOVQ acc0, (8*0)(res_ptr)
	MOVQ acc1, (8*1)(res_ptr)
	MOVQ acc2, (8*2)(res_ptr)
	MOVQ acc3, (8*3)(res_ptr)

	RET
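
// In effect: val is speculatively replaced by p - val (the field negation,
// assuming val is a reduced non-zero element) and the CMOVQEQs keep the
// original value when cond == 0, so no branch depends on cond.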

// ---------------------------------------
// func p256Sqr(res, in []uint64)
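// in is assumed to be in Montgomery form; the result is the Montgomery
// square in*in*R^-1 mod p.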
TEXT ·p256Sqr(SB), NOSPLIT, $0
	MOVQ res+0(FP), res_ptr
	MOVQ in+24(FP), x_ptr

	// y[1:] * y[0]
	MOVQ (8*0)(x_ptr), t0

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc1
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4

	// y[2:] * y[1]
	MOVQ (8*1)(x_ptr), t0

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, acc5

	// y[3] * y[2]
	MOVQ (8*2)(x_ptr), t0

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, y_ptr
	XORQ t1, t1

	// *2
	ADDQ acc1, acc1
	ADCQ acc2, acc2
	ADCQ acc3, acc3
	ADCQ acc4, acc4
	ADCQ acc5, acc5
	ADCQ y_ptr, y_ptr
	ADCQ $0, t1

	// Missing products
	MOVQ (8*0)(x_ptr), AX
	MULQ AX
	MOVQ AX, acc0
	MOVQ DX, t0

	MOVQ (8*1)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc1
	ADCQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t0

	MOVQ (8*2)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc3
	ADCQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t0

	MOVQ (8*3)(x_ptr), AX
	MULQ AX
	ADDQ t0, acc5
	ADCQ AX, y_ptr
	ADCQ DX, t1
	MOVQ t1, x_ptr
|
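	// Each of the four reduction steps below folds the lowest limb into the
	// rest of the accumulator (one limb of a Montgomery reduction mod p).
	// Since p == -1 mod 2^64, the Montgomery multiplier is the limb itself
	// (t = acc0), and adding t*p amounts to, roughly:
	//
	//	acc1 += t << 32            // low half of t * 2^96
	//	acc2 += t >> 32            // high half of t * 2^96 (plus carries)
	//	acc3, carry += t * 0xffffffff00000001
	//	// the low limb becomes zero and carry is the new top limb
	//
	// which is what the SHLQ/SHRQ/MULQ sequences compute.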
	// First reduction step
	MOVQ acc0, AX
	MOVQ acc0, t1
	SHLQ $32, acc0
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc0, acc1
	ADCQ t1, acc2
	ADCQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc0

	// Second reduction step
	MOVQ acc1, AX
	MOVQ acc1, t1
	SHLQ $32, acc1
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc1, acc2
	ADCQ t1, acc3
	ADCQ AX, acc0
	ADCQ $0, DX
	MOVQ DX, acc1

	// Third reduction step
	MOVQ acc2, AX
	MOVQ acc2, t1
	SHLQ $32, acc2
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc2, acc3
	ADCQ t1, acc0
	ADCQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, acc2

	// Last reduction step
	XORQ t0, t0
	MOVQ acc3, AX
	MOVQ acc3, t1
	SHLQ $32, acc3
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc3, acc0
	ADCQ t1, acc1
	ADCQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	// Add bits [511:256] of the sqr result
	ADCQ acc4, acc0
	ADCQ acc5, acc1
	ADCQ y_ptr, acc2
	ADCQ x_ptr, acc3
	ADCQ $0, t0

	MOVQ acc0, acc4
	MOVQ acc1, acc5
	MOVQ acc2, y_ptr
	MOVQ acc3, t1

	// Subtract p256
	SUBQ $-1, acc0
	SBBQ p256const0<>(SB), acc1
	SBBQ $0, acc2
	SBBQ p256const1<>(SB), acc3
	SBBQ $0, t0

	CMOVQCS acc4, acc0
	CMOVQCS acc5, acc1
	CMOVQCS y_ptr, acc2
	CMOVQCS t1, acc3

	MOVQ acc0, (8*0)(res_ptr)
	MOVQ acc1, (8*1)(res_ptr)
	MOVQ acc2, (8*2)(res_ptr)
	MOVQ acc3, (8*3)(res_ptr)

	RET

// ---------------------------------------
// func p256Mul(res, in1, in2 []uint64)
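// in1 and in2 are assumed to be in Montgomery form; the result is the
// Montgomery product in1*in2*R^-1 mod p, with the four multiplication passes
// interleaved with the same per-limb reduction as in p256Sqr.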
TEXT ·p256Mul(SB), NOSPLIT, $0
	MOVQ res+0(FP), res_ptr
	MOVQ in1+24(FP), x_ptr
	MOVQ in2+48(FP), y_ptr

	// x * y[0]
	MOVQ (8*0)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc0
	MOVQ DX, acc1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4
	XORQ acc5, acc5

	// First reduction step
	MOVQ acc0, AX
	MOVQ acc0, t1
	SHLQ $32, acc0
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc0, acc1
	ADCQ t1, acc2
	ADCQ AX, acc3
	ADCQ DX, acc4
	ADCQ $0, acc5
	XORQ acc0, acc0

	// x * y[1]
	MOVQ (8*1)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0

	// Second reduction step
	MOVQ acc1, AX
	MOVQ acc1, t1
	SHLQ $32, acc1
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc1, acc2
	ADCQ t1, acc3
	ADCQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0
	XORQ acc1, acc1

	// x * y[2]
	MOVQ (8*2)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ DX, acc0
	ADCQ $0, acc1

	// Third reduction step
	MOVQ acc2, AX
	MOVQ acc2, t1
	SHLQ $32, acc2
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc2, acc3
	ADCQ t1, acc4
	ADCQ AX, acc5
	ADCQ DX, acc0
	ADCQ $0, acc1
	XORQ acc2, acc2

	// x * y[3]
	MOVQ (8*3)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ DX, acc1
	ADCQ $0, acc2

	// Last reduction step
	MOVQ acc3, AX
	MOVQ acc3, t1
	SHLQ $32, acc3
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc3, acc4
	ADCQ t1, acc5
	ADCQ AX, acc0
	ADCQ DX, acc1
	ADCQ $0, acc2

	// Copy result [255:0]
	MOVQ acc4, x_ptr
	MOVQ acc5, acc3
	MOVQ acc0, t0
	MOVQ acc1, t1

	// Subtract p256
	SUBQ $-1, acc4
	SBBQ p256const0<>(SB), acc5
	SBBQ $0, acc0
	SBBQ p256const1<>(SB), acc1
	SBBQ $0, acc2

	CMOVQCS x_ptr, acc4
	CMOVQCS acc3, acc5
	CMOVQCS t0, acc0
	CMOVQCS t1, acc1

	MOVQ acc4, (8*0)(res_ptr)
	MOVQ acc5, (8*1)(res_ptr)
	MOVQ acc0, (8*2)(res_ptr)
	MOVQ acc1, (8*3)(res_ptr)

	RET

// ---------------------------------------
// func p256FromMont(res, in []uint64)
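// Converts out of Montgomery form: for an input a*R mod p the result is
// a mod p, obtained by running only the four reduction stages (an implicit
// multiplication by 1).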
TEXT ·p256FromMont(SB), NOSPLIT, $0
	MOVQ res+0(FP), res_ptr
	MOVQ in+24(FP), x_ptr

	MOVQ (8*0)(x_ptr), acc0
	MOVQ (8*1)(x_ptr), acc1
	MOVQ (8*2)(x_ptr), acc2
	MOVQ (8*3)(x_ptr), acc3
	XORQ acc4, acc4

	// Only reduce, no multiplications are needed
	// First stage
	MOVQ acc0, AX
	MOVQ acc0, t1
	SHLQ $32, acc0
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc0, acc1
	ADCQ t1, acc2
	ADCQ AX, acc3
	ADCQ DX, acc4
	XORQ acc5, acc5

	// Second stage
	MOVQ acc1, AX
	MOVQ acc1, t1
	SHLQ $32, acc1
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc1, acc2
	ADCQ t1, acc3
	ADCQ AX, acc4
	ADCQ DX, acc5
	XORQ acc0, acc0

	// Third stage
	MOVQ acc2, AX
	MOVQ acc2, t1
	SHLQ $32, acc2
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc2, acc3
	ADCQ t1, acc4
	ADCQ AX, acc5
	ADCQ DX, acc0
	XORQ acc1, acc1

	// Last stage
	MOVQ acc3, AX
	MOVQ acc3, t1
	SHLQ $32, acc3
	MULQ p256const1<>(SB)
	SHRQ $32, t1
	ADDQ acc3, acc4
	ADCQ t1, acc5
	ADCQ AX, acc0
	ADCQ DX, acc1

	MOVQ acc4, x_ptr
	MOVQ acc5, acc3
	MOVQ acc0, t0
	MOVQ acc1, t1

	SUBQ $-1, acc4
	SBBQ p256const0<>(SB), acc5
	SBBQ $0, acc0
	SBBQ p256const1<>(SB), acc1

	CMOVQCS x_ptr, acc4
	CMOVQCS acc3, acc5
	CMOVQCS t0, acc0
	CMOVQCS t1, acc1

	MOVQ acc4, (8*0)(res_ptr)
	MOVQ acc5, (8*1)(res_ptr)
	MOVQ acc0, (8*2)(res_ptr)
	MOVQ acc1, (8*3)(res_ptr)

	RET

// ---------------------------------------
// Constant time point access to arbitrary point table.
// Indexed from 1 to 15, with -1 offset
// (index 0 is implicitly point at infinity)
// func p256Select(p, r []uint64, idx int)
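// Every table entry is read and masked, so the memory access pattern does not
// depend on idx. Roughly (illustrative sketch; maskIfEqual is a hypothetical
// constant-time helper):
//
//	var out [12]uint64
//	for i := 1; i <= 16; i++ {
//		mask := maskIfEqual(i, idx) // all ones iff i == idx
//		for j := range out {
//			out[j] ^= table[i-1][j] & mask
//		}
//	}
//
// When idx == 0 nothing matches and out stays zero (the point at infinity).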
TEXT ·p256Select(SB), NOSPLIT, $0
	MOVL idx+48(FP), AX
	MOVQ r+24(FP), DI
	MOVQ p+0(FP), DX

	PXOR X15, X15     // X15 = 0
	PCMPEQL X14, X14  // X14 = -1
	PSUBL X14, X15    // X15 = 1
	MOVL AX, X14
	PSHUFD $0, X14, X14

	PXOR X0, X0
	PXOR X1, X1
	PXOR X2, X2
	PXOR X3, X3
	PXOR X4, X4
	PXOR X5, X5
	MOVQ $16, AX

	MOVOU X15, X13

loop_select:

	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12

	MOVOU (16*0)(DI), X6
	MOVOU (16*1)(DI), X7
	MOVOU (16*2)(DI), X8
	MOVOU (16*3)(DI), X9
	MOVOU (16*4)(DI), X10
	MOVOU (16*5)(DI), X11
	ADDQ $(16*6), DI

	PAND X12, X6
	PAND X12, X7
	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X6, X0
	PXOR X7, X1
	PXOR X8, X2
	PXOR X9, X3
	PXOR X10, X4
	PXOR X11, X5

	DECQ AX
	JNE loop_select

	MOVOU X0, (16*0)(DX)
	MOVOU X1, (16*1)(DX)
	MOVOU X2, (16*2)(DX)
	MOVOU X3, (16*3)(DX)
	MOVOU X4, (16*4)(DX)
	MOVOU X5, (16*5)(DX)

	RET

// ---------------------------------------
// Constant time point access to base point table.
// func p256SelectBase(p, r []uint64, idx int)
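// Same masking technique as p256Select, but over the precomputed base point
// table: each 64-byte entry appears to hold just an affine (x, y) pair, and
// two entries are scanned per loop iteration.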
TEXT ·p256SelectBase(SB), NOSPLIT, $0
	MOVL idx+48(FP), AX
	MOVQ r+24(FP), DI
	MOVQ p+0(FP), DX

	PXOR X15, X15     // X15 = 0
	PCMPEQL X14, X14  // X14 = -1
	PSUBL X14, X15    // X15 = 1
	MOVL AX, X14
	PSHUFD $0, X14, X14

	PXOR X0, X0
	PXOR X1, X1
	PXOR X2, X2
	PXOR X3, X3
	MOVQ $32, AX

	MOVOU X15, X13

loop_select_base:

	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12

	MOVOU (16*0)(DI), X4
	MOVOU (16*1)(DI), X5
	MOVOU (16*2)(DI), X6
	MOVOU (16*3)(DI), X7

	MOVOU (16*4)(DI), X8
	MOVOU (16*5)(DI), X9
	MOVOU (16*6)(DI), X10
	MOVOU (16*7)(DI), X11

	ADDQ $(16*8), DI

	PAND X12, X4
	PAND X12, X5
	PAND X12, X6
	PAND X12, X7

	MOVOU X13, X12
	PADDL X15, X13
	PCMPEQL X14, X12

	PAND X12, X8
	PAND X12, X9
	PAND X12, X10
	PAND X12, X11

	PXOR X4, X0
	PXOR X5, X1
	PXOR X6, X2
	PXOR X7, X3

	PXOR X8, X0
	PXOR X9, X1
	PXOR X10, X2
	PXOR X11, X3

	DECQ AX
	JNE loop_select_base

	MOVOU X0, (16*0)(DX)
	MOVOU X1, (16*1)(DX)
	MOVOU X2, (16*2)(DX)
	MOVOU X3, (16*3)(DX)

	RET

// ---------------------------------------
// func p256OrdMul(res, in1, in2 []uint64)
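// Montgomery multiplication modulo the group order n rather than the field
// prime: res = in1*in2*R^-1 mod n, with p256ordK0 = -n^-1 mod 2^64 supplying
// the per-limb reduction multiplier.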
TEXT ·p256OrdMul(SB), NOSPLIT, $0
	MOVQ res+0(FP), res_ptr
	MOVQ in1+24(FP), x_ptr
	MOVQ in2+48(FP), y_ptr

	// x * y[0]
	MOVQ (8*0)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	MOVQ AX, acc0
	MOVQ DX, acc1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, acc2

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, acc3

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, acc4
	XORQ acc5, acc5
|
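	// Unlike the reduction mod p, the order has no convenient form in its low
	// limbs, so each step first computes t0 = acc_i * p256ordK0 mod 2^64 and
	// then adds t0*n limb by limb with plain MULQ/ADDQ/ADCQ arithmetic.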
	// First reduction step
	MOVQ acc0, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc0
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc1
	ADCQ $0, DX
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x10(SB), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x18(SB), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ DX, acc4
	ADCQ $0, acc5

	// x * y[1]
	MOVQ (8*1)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0

	// Second reduction step
	MOVQ acc1, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc1
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc2
	ADCQ $0, DX
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x10(SB), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x18(SB), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ DX, acc5
	ADCQ $0, acc0

	// x * y[2]
	MOVQ (8*2)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ DX, acc0
	ADCQ $0, acc1

	// Third reduction step
	MOVQ acc2, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc2
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc3
	ADCQ $0, DX
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x10(SB), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x18(SB), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ DX, acc0
	ADCQ $0, acc1

	// x * y[3]
	MOVQ (8*3)(y_ptr), t0

	MOVQ (8*0)(x_ptr), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*1)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*2)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ (8*3)(x_ptr), AX
	MULQ t0
	ADDQ t1, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ DX, acc1
	ADCQ $0, acc2

	// Last reduction step
	MOVQ acc3, AX
	MULQ p256ordK0<>(SB)
	MOVQ AX, t0

	MOVQ p256ord<>+0x00(SB), AX
	MULQ t0
	ADDQ AX, acc3
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x08(SB), AX
	MULQ t0
	ADDQ t1, acc4
	ADCQ $0, DX
	ADDQ AX, acc4
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x10(SB), AX
	MULQ t0
	ADDQ t1, acc5
	ADCQ $0, DX
	ADDQ AX, acc5
	ADCQ $0, DX
	MOVQ DX, t1

	MOVQ p256ord<>+0x18(SB), AX
	MULQ t0
	ADDQ t1, acc0
	ADCQ $0, DX
	ADDQ AX, acc0
	ADCQ DX, acc1
	ADCQ $0, acc2

	// Copy result [255:0]
	MOVQ acc4, x_ptr
	MOVQ acc5, acc3
	MOVQ acc0, t0
	MOVQ acc1, t1

	// Subtract the group order (p256ord)
	SUBQ p256ord<>+0x00(SB), acc4
	SBBQ p256ord<>+0x08(SB), acc5
	SBBQ p256ord<>+0x10(SB), acc0
	SBBQ p256ord<>+0x18(SB), acc1
	SBBQ $0, acc2

	CMOVQCS x_ptr, acc4
	CMOVQCS acc3, acc5
	CMOVQCS t0, acc0
	CMOVQCS t1, acc1

	MOVQ acc4, (8*0)(res_ptr)
	MOVQ acc5, (8*1)(res_ptr)
	MOVQ acc0, (8*2)(res_ptr)
	MOVQ acc1, (8*3)(res_ptr)

	RET

// ---------------------------------------
// func p256OrdSqr(res, in []uint64, n int)
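// Montgomery squaring modulo the group order. The n argument requests n
// back-to-back squarings (see ordSqrLoop below), which saves call overhead
// for long squaring chains such as those used in inversion.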
TEXT ·p256OrdSqr(SB), NOSPLIT, $0
	MOVQ res+0(FP), res_ptr
	MOVQ in+24(FP), x_ptr
	MOVL n+48(FP), BX
| | | |
| ordSqrLoop: | | ordSqrLoop: |
| | | |
| // y[1:] * y[0] | | // y[1:] * y[0] |
| MOVQ (8*0)(x_ptr), t0 | | MOVQ (8*0)(x_ptr), t0 |
| | | |
| MOVQ (8*1)(x_ptr), AX | | MOVQ (8*1)(x_ptr), AX |
| MULQ t0 | | MULQ t0 |
| MOVQ AX, acc1 | | MOVQ AX, acc1 |
| MOVQ DX, acc2 | | MOVQ DX, acc2 |
| | | |
| MOVQ (8*2)(x_ptr), AX | | MOVQ (8*2)(x_ptr), AX |
| MULQ t0 | | MULQ t0 |
| ADDQ AX, acc2 | | ADDQ AX, acc2 |
| ADCQ $0, DX | | ADCQ $0, DX |
| MOVQ DX, acc3 | | MOVQ DX, acc3 |
| | | |
| MOVQ (8*3)(x_ptr), AX | | MOVQ (8*3)(x_ptr), AX |
| MULQ t0 | | MULQ t0 |
| ADDQ AX, acc3 | | ADDQ AX, acc3 |
| ADCQ $0, DX | | ADCQ $0, DX |
| MOVQ DX, acc4 | | MOVQ DX, acc4 |
| | | | 1237 | |
| // y[2:] * y[1] | | // y[2:] * y[1] |
| MOVQ (8*1)(x_ptr), t0 | | MOVQ (8*1)(x_ptr), t0 |
| | | |
| MOVQ (8*2)(x_ptr), AX | | MOVQ (8*2)(x_ptr), AX |
| MULQ t0 | | MULQ t0 |
| ADDQ AX, acc3 | | ADDQ AX, acc3 |
| ADCQ $0, DX | | ADCQ $0, DX |
| MOVQ DX, t1 | | MOVQ DX, t1 |
| | | |
| MOVQ (8*3)(x_ptr), AX | | MOVQ (8*3)(x_ptr), AX |
| MULQ t0 | | MULQ t0 |
| ADDQ t1, acc4 | | ADDQ t1, acc4 |
| ADCQ $0, DX | | ADCQ $0, DX |
| ADDQ AX, acc4 | | ADDQ AX, acc4 |
| ADCQ $0, DX | | ADCQ $0, DX |
| MOVQ DX, acc5 | | MOVQ DX, acc5 |
| | | | 1254 | |
| // y[3] * y[2] | | // y[3] * y[2] |
| MOVQ (8*2)(x_ptr), t0 | | MOVQ (8*2)(x_ptr), t0 |
| | | |
| MOVQ (8*3)(x_ptr), AX | | MOVQ (8*3)(x_ptr), AX |
| MULQ t0 | | MULQ t0 |
| ADDQ AX, acc5 | | ADDQ AX, acc5 |
| ADCQ $0, DX | | ADCQ $0, DX |
| MOVQ DX, y_ptr | | MOVQ DX, y_ptr |
| XORQ t1, t1 | | XORQ t1, t1 |
| | | | 1264 | |
| // *2 | | // *2 |
| ADDQ acc1, acc1 | | ADDQ acc1, acc1 |
| ADCQ acc2, acc2 | | ADCQ acc2, acc2 |
| ADCQ acc3, acc3 | | ADCQ acc3, acc3 |
| ADCQ acc4, acc4 | | ADCQ acc4, acc4 |
| ADCQ acc5, acc5 | | ADCQ acc5, acc5 |
| ADCQ y_ptr, y_ptr | | ADCQ y_ptr, y_ptr |
| ADCQ $0, t1 | | ADCQ $0, t1 |
| | | | 1273 | |
| // Missing products | | // Missing products |
| MOVQ (8*0)(x_ptr), AX | | MOVQ (8*0)(x_ptr), AX |
| MULQ AX | | MULQ AX |
| MOVQ AX, acc0 | | MOVQ AX, acc0 |
| MOVQ DX, t0 | | MOVQ DX, t0 |
| | | |
| MOVQ (8*1)(x_ptr), AX | | MOVQ (8*1)(x_ptr), AX |
| MULQ AX | | MULQ AX |
| ADDQ t0, acc1 | | ADDQ t0, acc1 |
| ADCQ AX, acc2 | | ADCQ AX, acc2 |
| ADCQ $0, DX | | ADCQ $0, DX |
| MOVQ DX, t0 | | MOVQ DX, t0 |
| | | |
| MOVQ (8*2)(x_ptr), AX | | MOVQ (8*2)(x_ptr), AX |
| MULQ AX | | MULQ AX |
| ADDQ t0, acc3 | | ADDQ t0, acc3 |
| ADCQ AX, acc4 | | ADCQ AX, acc4 |
| ADCQ $0, DX | | ADCQ $0, DX |
| MOVQ DX, t0 | | MOVQ DX, t0 |
| | | |
| MOVQ (8*3)(x_ptr), AX | | MOVQ (8*3)(x_ptr), AX |
| MULQ AX | | MULQ AX |
| ADDQ t0, acc5 | | ADDQ t0, acc5 |
| ADCQ AX, y_ptr | | ADCQ AX, y_ptr |
| ADCQ DX, t1 | | ADCQ DX, t1 |
| MOVQ t1, x_ptr | | MOVQ t1, x_ptr |
| | | | 1300 | |
| // First reduction step | | // First reduction step |
| MOVQ acc0, AX | | MOVQ acc0, AX |
| MULQ p256ordK0<>(SB) | | MULQ p256ordK0<>(SB) |
| MOVQ AX, t0 | | MOVQ AX, t0 |
| | | |
| MOVQ p256ord<>+0x00(SB), AX | | MOVQ p256ord<>+0x00(SB), AX |
| MULQ t0 | | MULQ t0 |
| ADDQ AX, acc0 | | ADDQ AX, acc0 |
| ADCQ $0, DX | | ADCQ $0, DX |
| MOVQ DX, t1 | | MOVQ DX, t1 |
| | | |
| MOVQ p256ord<>+0x08(SB), AX | | MOVQ p256ord<>+0x08(SB), AX |
| MULQ t0 | | MULQ t0 |
| ADDQ t1, acc1 | | ADDQ t1, acc1 |
| ADCQ $0, DX | | ADCQ $0, DX |
| ADDQ AX, acc1 | | ADDQ AX, acc1 |
| | | |
| MOVQ t0, t1 | | MOVQ t0, t1 |
| ADCQ DX, acc2 | | ADCQ DX, acc2 |
| ADCQ $0, t1 | | ADCQ $0, t1 |
| SUBQ t0, acc2 | | SUBQ t0, acc2 |
| SBBQ $0, t1 | | SBBQ $0, t1 |
| | | |
| MOVQ t0, AX | | MOVQ t0, AX |
| MOVQ t0, DX | | MOVQ t0, DX |
| MOVQ t0, acc0 | | MOVQ t0, acc0 |
| SHLQ $32, AX | | SHLQ $32, AX |
| SHRQ $32, DX | | SHRQ $32, DX |
| | | |
| ADDQ t1, acc3 | | ADDQ t1, acc3 |
| ADCQ $0, acc0 | | ADCQ $0, acc0 |
| SUBQ AX, acc3 | | SUBQ AX, acc3 |
| SBBQ DX, acc0 | | SBBQ DX, acc0 |
| | | | 1334 | |
| // Second reduction step | | // Second reduction step |
| MOVQ acc1, AX | | MOVQ acc1, AX |
| MULQ p256ordK0<>(SB) | | MULQ p256ordK0<>(SB) |
| MOVQ AX, t0 | | MOVQ AX, t0 |
| | | |
| MOVQ p256ord<>+0x00(SB), AX | | MOVQ p256ord<>+0x00(SB), AX |
| MULQ t0 | | MULQ t0 |
| ADDQ AX, acc1 | | ADDQ AX, acc1 |
| ADCQ $0, DX | | ADCQ $0, DX |
| MOVQ DX, t1 | | MOVQ DX, t1 |
| | | |
| MOVQ p256ord<>+0x08(SB), AX | | MOVQ p256ord<>+0x08(SB), AX |
| MULQ t0 | | MULQ t0 |
| ADDQ t1, acc2 | | ADDQ t1, acc2 |
| ADCQ $0, DX | | ADCQ $0, DX |
| ADDQ AX, acc2 | | ADDQ AX, acc2 |
| | | |
| MOVQ t0, t1 | | MOVQ t0, t1 |
| ADCQ DX, acc3 | | ADCQ DX, acc3 |
| ADCQ $0, t1 | | ADCQ $0, t1 |
| SUBQ t0, acc3 | | SUBQ t0, acc3 |
| SBBQ $0, t1 | | SBBQ $0, t1 |
| | | |
| MOVQ t0, AX | | MOVQ t0, AX |
| MOVQ t0, DX | | MOVQ t0, DX |
| MOVQ t0, acc1 | | MOVQ t0, acc1 |
| SHLQ $32, AX | | SHLQ $32, AX |
| SHRQ $32, DX | | SHRQ $32, DX |
| | | |
| ADDQ t1, acc0 | | ADDQ t1, acc0 |
| ADCQ $0, acc1 | | ADCQ $0, acc1 |
| SUBQ AX, acc0 | | SUBQ AX, acc0 |
| SBBQ DX, acc1 | | SBBQ DX, acc1 |
| | | | 1368 | |
| // Third reduction step | | // Third reduction step |
| MOVQ acc2, AX | | MOVQ acc2, AX |
| MULQ p256ordK0<>(SB) | | MULQ p256ordK0<>(SB) |
| MOVQ AX, t0 | | MOVQ AX, t0 |
| | | |
| MOVQ p256ord<>+0x00(SB), AX | | MOVQ p256ord<>+0x00(SB), AX |
| MULQ t0 | | MULQ t0 |
| ADDQ AX, acc2 | | ADDQ AX, acc2 |
| ADCQ $0, DX | | ADCQ $0, DX |
| MOVQ DX, t1 | | MOVQ DX, t1 |
| | | |
| MOVQ p256ord<>+0x08(SB), AX | | MOVQ p256ord<>+0x08(SB), AX |
| MULQ t0 | | MULQ t0 |
| ADDQ t1, acc3 | | ADDQ t1, acc3 |
| ADCQ $0, DX | | ADCQ $0, DX |
| ADDQ AX, acc3 | | ADDQ AX, acc3 |
| | | |
| MOVQ t0, t1 | | MOVQ t0, t1 |
| ADCQ DX, acc0 | | ADCQ DX, acc0 |
| ADCQ $0, t1 | | ADCQ $0, t1 |
| SUBQ t0, acc0 | | SUBQ t0, acc0 |
| SBBQ $0, t1 | | SBBQ $0, t1 |
| | | |
| MOVQ t0, AX | | MOVQ t0, AX |
| MOVQ t0, DX | | MOVQ t0, DX |
| MOVQ t0, acc2 | | MOVQ t0, acc2 |
| SHLQ $32, AX | | SHLQ $32, AX |
| SHRQ $32, DX | | SHRQ $32, DX |
| | | |
| ADDQ t1, acc1 | | ADDQ t1, acc1 |
| ADCQ $0, acc2 | | ADCQ $0, acc2 |
| SUBQ AX, acc1 | | SUBQ AX, acc1 |
| SBBQ DX, acc2 | | SBBQ DX, acc2 |
| | | | 1402 | |
| // Last reduction step | | // Last reduction step |
| MOVQ acc3, AX | | MOVQ acc3, AX |
| MULQ p256ordK0<>(SB) | | MULQ p256ordK0<>(SB) |
| MOVQ AX, t0 | | MOVQ AX, t0 |
| | | |
| MOVQ p256ord<>+0x00(SB), AX | | MOVQ p256ord<>+0x00(SB), AX |
| MULQ t0 | | MULQ t0 |
| ADDQ AX, acc3 | | ADDQ AX, acc3 |
| ADCQ $0, DX | | ADCQ $0, DX |
| MOVQ DX, t1 | | MOVQ DX, t1 |
| | | |
| MOVQ p256ord<>+0x08(SB), AX | | MOVQ p256ord<>+0x08(SB), AX |
| MULQ t0 | | MULQ t0 |
| ADDQ t1, acc0 | | ADDQ t1, acc0 |
| ADCQ $0, DX | | ADCQ $0, DX |
| ADDQ AX, acc0 | | ADDQ AX, acc0 |
| ADCQ $0, DX | | ADCQ $0, DX |
| MOVQ DX, t1 | | MOVQ DX, t1 |
| | | |
| MOVQ t0, t1 | | MOVQ t0, t1 |
| ADCQ DX, acc1 | | ADCQ DX, acc1 |
| ADCQ $0, t1 | | ADCQ $0, t1 |
| SUBQ t0, acc1 | | SUBQ t0, acc1 |
| SBBQ $0, t1 | | SBBQ $0, t1 |
| | | |
| MOVQ t0, AX | | MOVQ t0, AX |
| MOVQ t0, DX | | MOVQ t0, DX |
| MOVQ t0, acc3 | | MOVQ t0, acc3 |
| SHLQ $32, AX | | SHLQ $32, AX |
| SHRQ $32, DX | | SHRQ $32, DX |
| | | |
| ADDQ t1, acc2 | | ADDQ t1, acc2 |
| ADCQ $0, acc3 | | ADCQ $0, acc3 |
| SUBQ AX, acc2 | | SUBQ AX, acc2 |
| SBBQ DX, acc3 | | SBBQ DX, acc3 |
| XORQ t0, t0 | | XORQ t0, t0 |
| | | | 1439 | |
| // Add bits [511:256] of the sqr result | | // Add bits [511:256] of the sqr result |
| ADCQ acc4, acc0 | | ADCQ acc4, acc0 |
| ADCQ acc5, acc1 | | ADCQ acc5, acc1 |
| ADCQ y_ptr, acc2 | | ADCQ y_ptr, acc2 |
| ADCQ x_ptr, acc3 | | ADCQ x_ptr, acc3 |
| ADCQ $0, t0 | | ADCQ $0, t0 |
| | | |
| MOVQ acc0, acc4 | | MOVQ acc0, acc4 |
| MOVQ acc1, acc5 | | MOVQ acc1, acc5 |
| MOVQ acc2, y_ptr | | MOVQ acc2, y_ptr |
| MOVQ acc3, t1 | | MOVQ acc3, t1 |
| | | | 1451 | |
| // Subtract p256 | | // Subtract p256 |
| SUBQ p256ord<>+0x00(SB), acc0 | | SUBQ p256ord<>+0x00(SB), acc0 |
| 1397 | SBBQ p256ord<>+0x08(SB) ,acc1 | | 1454 | SBBQ p256ord<>+0x08(SB), acc1 |
| SBBQ p256ord<>+0x10(SB), acc2 | | SBBQ p256ord<>+0x10(SB), acc2 |
| SBBQ p256ord<>+0x18(SB), acc3 | | SBBQ p256ord<>+0x18(SB), acc3 |
| SBBQ $0, t0 | | SBBQ $0, t0 |
| | | |
| CMOVQCS acc4, acc0 | | CMOVQCS acc4, acc0 |
| CMOVQCS acc5, acc1 | | CMOVQCS acc5, acc1 |
| CMOVQCS y_ptr, acc2 | | CMOVQCS y_ptr, acc2 |
| CMOVQCS t1, acc3 | | CMOVQCS t1, acc3 |
| | | |
| MOVQ acc0, (8*0)(res_ptr) | | MOVQ acc0, (8*0)(res_ptr) |
| MOVQ acc1, (8*1)(res_ptr) | | MOVQ acc1, (8*1)(res_ptr) |
| MOVQ acc2, (8*2)(res_ptr) | | MOVQ acc2, (8*2)(res_ptr) |
| MOVQ acc3, (8*3)(res_ptr) | | MOVQ acc3, (8*3)(res_ptr) |
| MOVQ res_ptr, x_ptr | | MOVQ res_ptr, x_ptr |
| DECQ BX | | DECQ BX |
| 1413 | JNE ordSqrLoop | | 1470 | JNE ordSqrLoop |
| | | |
| RET | | RET |
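p256OrdSqr simply runs that squaring-plus-reduction body n times in a row: the BX counter and the ordSqrLoop label above, with the result pointer fed back in as the next input. Continuing the sketch started after p256OrdMul, the loop computes n successive Montgomery squarings (montOrdSqr is our name):

    // montOrdSqr mirrors ·p256OrdSqr: n successive Montgomery squarings,
    // each mapping x to x*x*R^-1 mod p256Order.
    func montOrdSqr(x *big.Int, n int) *big.Int {
        rInv := new(big.Int).ModInverse(new(big.Int).Lsh(big.NewInt(1), 256), p256Order)
        out := new(big.Int).Set(x)
        for i := 0; i < n; i++ {
            out.Mul(out, out)
            out.Mul(out, rInv)
            out.Mod(out, p256Order)
        }
        return out
    }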
| | | | 1473 | |
| 1416 | /* ---------------------------------------*/ | | 1474 | // --------------------------------------- |
| #undef res_ptr | | #undef res_ptr |
| #undef x_ptr | | #undef x_ptr |
| #undef y_ptr | | #undef y_ptr |
| | | |
| #undef acc0 | | #undef acc0 |
| #undef acc1 | | #undef acc1 |
| #undef acc2 | | #undef acc2 |
| #undef acc3 | | #undef acc3 |
| #undef acc4 | | #undef acc4 |
| #undef acc5 | | #undef acc5 |
| #undef t0 | | #undef t0 |
| #undef t1 | | #undef t1 |
| 1429 | /* ---------------------------------------*/ | | 1487 | // --------------------------------------- |
| #define mul0 AX | | #define mul0 AX |
| #define mul1 DX | | #define mul1 DX |
| #define acc0 BX | | #define acc0 BX |
| #define acc1 CX | | #define acc1 CX |
| #define acc2 R8 | | #define acc2 R8 |
| #define acc3 R9 | | #define acc3 R9 |
| #define acc4 R10 | | #define acc4 R10 |
| #define acc5 R11 | | #define acc5 R11 |
| #define acc6 R12 | | #define acc6 R12 |
| #define acc7 R13 | | #define acc7 R13 |
| #define t0 R14 | | #define t0 R14 |
| #define t1 R15 | | #define t1 R15 |
| #define t2 DI | | #define t2 DI |
| #define t3 SI | | #define t3 SI |
| #define hlp BP | | #define hlp BP |
| 1445 | /* ---------------------------------------*/ | | 1503 | // --------------------------------------- |
| 1446 | TEXT p256SubInternal(SB),NOSPLIT,$0 | | 1504 | TEXT p256SubInternal(SB), NOSPLIT, $0 |
| XORQ mul0, mul0 | | XORQ mul0, mul0 |
| SUBQ t0, acc4 | | SUBQ t0, acc4 |
| SBBQ t1, acc5 | | SBBQ t1, acc5 |
| SBBQ t2, acc6 | | SBBQ t2, acc6 |
| SBBQ t3, acc7 | | SBBQ t3, acc7 |
| SBBQ $0, mul0 | | SBBQ $0, mul0 |
| | | |
| MOVQ acc4, acc0 | | MOVQ acc4, acc0 |
| MOVQ acc5, acc1 | | MOVQ acc5, acc1 |
| MOVQ acc6, acc2 | | MOVQ acc6, acc2 |
| MOVQ acc7, acc3 | | MOVQ acc7, acc3 |
| | | |
| ADDQ $-1, acc4 | | ADDQ $-1, acc4 |
| ADCQ p256const0<>(SB), acc5 | | ADCQ p256const0<>(SB), acc5 |
| ADCQ $0, acc6 | | ADCQ $0, acc6 |
| ADCQ p256const1<>(SB), acc7 | | ADCQ p256const1<>(SB), acc7 |
| ADCQ $0, mul0 | | ADCQ $0, mul0 |
| | | |
| CMOVQNE acc0, acc4 | | CMOVQNE acc0, acc4 |
| CMOVQNE acc1, acc5 | | CMOVQNE acc1, acc5 |
| CMOVQNE acc2, acc6 | | CMOVQNE acc2, acc6 |
| CMOVQNE acc3, acc7 | | CMOVQNE acc3, acc7 |
| | | |
| RET | | RET |
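p256SubInternal subtracts the t registers from the acc registers modulo the field prime: it subtracts, speculatively adds p back, and a conditional move keeps the appropriate candidate so the control flow stays constant time. A simplified limb-level sketch follows; it selects purely on the borrow bit, which is slightly coarser than the ADCQ/CMOVQNE combination above, it adds math/bits to the sketch file's imports, and subModP and pLimbs are our names:

    // pLimbs holds p = 2^256 - 2^224 + 2^192 + 2^96 - 1 as little-endian 64-bit
    // limbs; p256const0 and p256const1 in the assembly are pLimbs[1] and pLimbs[3].
    var pLimbs = [4]uint64{0xffffffffffffffff, 0x00000000ffffffff, 0, 0xffffffff00000001}

    // subModP mirrors p256SubInternal: r = a - b, with p added back when the
    // subtraction borrows, chosen by a mask instead of a branch.
    func subModP(a, b [4]uint64) (r [4]uint64) {
        var borrow, carry uint64
        for i := 0; i < 4; i++ {
            r[i], borrow = bits.Sub64(a[i], b[i], borrow)
        }
        var plusP [4]uint64
        for i := 0; i < 4; i++ {
            plusP[i], carry = bits.Add64(r[i], pLimbs[i], carry)
        }
        mask := -borrow // all ones iff the subtraction borrowed
        for i := 0; i < 4; i++ {
            r[i] = (r[i] &^ mask) | (plusP[i] & mask)
        }
        return r
    }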
| | | | 1529 | |
| 1471 | /* ---------------------------------------*/ | | 1530 | // --------------------------------------- |
| 1472 | TEXT p256MulInternal(SB),NOSPLIT,$0 | | 1531 | TEXT p256MulInternal(SB), NOSPLIT, $0 |
| MOVQ acc4, mul0 | | MOVQ acc4, mul0 |
| MULQ t0 | | MULQ t0 |
| MOVQ mul0, acc0 | | MOVQ mul0, acc0 |
| MOVQ mul1, acc1 | | MOVQ mul1, acc1 |
| | | |
| MOVQ acc4, mul0 | | MOVQ acc4, mul0 |
| MULQ t1 | | MULQ t1 |
| ADDQ mul0, acc1 | | ADDQ mul0, acc1 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| MOVQ mul1, acc2 | | MOVQ mul1, acc2 |
| | | |
| MOVQ acc4, mul0 | | MOVQ acc4, mul0 |
| MULQ t2 | | MULQ t2 |
| ADDQ mul0, acc2 | | ADDQ mul0, acc2 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| MOVQ mul1, acc3 | | MOVQ mul1, acc3 |
| | | |
| MOVQ acc4, mul0 | | MOVQ acc4, mul0 |
| MULQ t3 | | MULQ t3 |
| ADDQ mul0, acc3 | | ADDQ mul0, acc3 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| MOVQ mul1, acc4 | | MOVQ mul1, acc4 |
| | | |
| MOVQ acc5, mul0 | | MOVQ acc5, mul0 |
| MULQ t0 | | MULQ t0 |
| ADDQ mul0, acc1 | | ADDQ mul0, acc1 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| MOVQ mul1, hlp | | MOVQ mul1, hlp |
| | | |
| MOVQ acc5, mul0 | | MOVQ acc5, mul0 |
| MULQ t1 | | MULQ t1 |
| ADDQ hlp, acc2 | | ADDQ hlp, acc2 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| ADDQ mul0, acc2 | | ADDQ mul0, acc2 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| MOVQ mul1, hlp | | MOVQ mul1, hlp |
| | | |
| MOVQ acc5, mul0 | | MOVQ acc5, mul0 |
| MULQ t2 | | MULQ t2 |
| ADDQ hlp, acc3 | | ADDQ hlp, acc3 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| ADDQ mul0, acc3 | | ADDQ mul0, acc3 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| MOVQ mul1, hlp | | MOVQ mul1, hlp |
| | | |
| MOVQ acc5, mul0 | | MOVQ acc5, mul0 |
| MULQ t3 | | MULQ t3 |
| ADDQ hlp, acc4 | | ADDQ hlp, acc4 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| ADDQ mul0, acc4 | | ADDQ mul0, acc4 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| MOVQ mul1, acc5 | | MOVQ mul1, acc5 |
| | | |
| MOVQ acc6, mul0 | | MOVQ acc6, mul0 |
| MULQ t0 | | MULQ t0 |
| ADDQ mul0, acc2 | | ADDQ mul0, acc2 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| MOVQ mul1, hlp | | MOVQ mul1, hlp |
| | | |
| MOVQ acc6, mul0 | | MOVQ acc6, mul0 |
| MULQ t1 | | MULQ t1 |
| ADDQ hlp, acc3 | | ADDQ hlp, acc3 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| ADDQ mul0, acc3 | | ADDQ mul0, acc3 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| MOVQ mul1, hlp | | MOVQ mul1, hlp |
| | | |
| MOVQ acc6, mul0 | | MOVQ acc6, mul0 |
| MULQ t2 | | MULQ t2 |
| ADDQ hlp, acc4 | | ADDQ hlp, acc4 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| ADDQ mul0, acc4 | | ADDQ mul0, acc4 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| MOVQ mul1, hlp | | MOVQ mul1, hlp |
| | | |
| MOVQ acc6, mul0 | | MOVQ acc6, mul0 |
| MULQ t3 | | MULQ t3 |
| ADDQ hlp, acc5 | | ADDQ hlp, acc5 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| ADDQ mul0, acc5 | | ADDQ mul0, acc5 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| MOVQ mul1, acc6 | | MOVQ mul1, acc6 |
| | | |
| MOVQ acc7, mul0 | | MOVQ acc7, mul0 |
| MULQ t0 | | MULQ t0 |
| ADDQ mul0, acc3 | | ADDQ mul0, acc3 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| MOVQ mul1, hlp | | MOVQ mul1, hlp |
| | | |
| MOVQ acc7, mul0 | | MOVQ acc7, mul0 |
| MULQ t1 | | MULQ t1 |
| ADDQ hlp, acc4 | | ADDQ hlp, acc4 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| ADDQ mul0, acc4 | | ADDQ mul0, acc4 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| MOVQ mul1, hlp | | MOVQ mul1, hlp |
| | | |
| MOVQ acc7, mul0 | | MOVQ acc7, mul0 |
| MULQ t2 | | MULQ t2 |
| ADDQ hlp, acc5 | | ADDQ hlp, acc5 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| ADDQ mul0, acc5 | | ADDQ mul0, acc5 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| MOVQ mul1, hlp | | MOVQ mul1, hlp |
| | | |
| MOVQ acc7, mul0 | | MOVQ acc7, mul0 |
| MULQ t3 | | MULQ t3 |
| ADDQ hlp, acc6 | | ADDQ hlp, acc6 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| ADDQ mul0, acc6 | | ADDQ mul0, acc6 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| MOVQ mul1, acc7 | | MOVQ mul1, acc7 |
| | | | 1644 | |
| // First reduction step | | // First reduction step |
| MOVQ acc0, mul0 | | MOVQ acc0, mul0 |
| MOVQ acc0, hlp | | MOVQ acc0, hlp |
| SHLQ $32, acc0 | | SHLQ $32, acc0 |
| MULQ p256const1<>(SB) | | MULQ p256const1<>(SB) |
| SHRQ $32, hlp | | SHRQ $32, hlp |
| ADDQ acc0, acc1 | | ADDQ acc0, acc1 |
| ADCQ hlp, acc2 | | ADCQ hlp, acc2 |
| ADCQ mul0, acc3 | | ADCQ mul0, acc3 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| MOVQ mul1, acc0 | | MOVQ mul1, acc0 |
| | | | 1656 | |
| // Second reduction step | | // Second reduction step |
| MOVQ acc1, mul0 | | MOVQ acc1, mul0 |
| MOVQ acc1, hlp | | MOVQ acc1, hlp |
| SHLQ $32, acc1 | | SHLQ $32, acc1 |
| MULQ p256const1<>(SB) | | MULQ p256const1<>(SB) |
| SHRQ $32, hlp | | SHRQ $32, hlp |
| ADDQ acc1, acc2 | | ADDQ acc1, acc2 |
| ADCQ hlp, acc3 | | ADCQ hlp, acc3 |
| ADCQ mul0, acc0 | | ADCQ mul0, acc0 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| MOVQ mul1, acc1 | | MOVQ mul1, acc1 |
| | | | 1668 | |
| // Third reduction step | | // Third reduction step |
| MOVQ acc2, mul0 | | MOVQ acc2, mul0 |
| MOVQ acc2, hlp | | MOVQ acc2, hlp |
| SHLQ $32, acc2 | | SHLQ $32, acc2 |
| MULQ p256const1<>(SB) | | MULQ p256const1<>(SB) |
| SHRQ $32, hlp | | SHRQ $32, hlp |
| ADDQ acc2, acc3 | | ADDQ acc2, acc3 |
| ADCQ hlp, acc0 | | ADCQ hlp, acc0 |
| ADCQ mul0, acc1 | | ADCQ mul0, acc1 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| MOVQ mul1, acc2 | | MOVQ mul1, acc2 |
| | | | 1680 | |
| // Last reduction step | | // Last reduction step |
| MOVQ acc3, mul0 | | MOVQ acc3, mul0 |
| MOVQ acc3, hlp | | MOVQ acc3, hlp |
| SHLQ $32, acc3 | | SHLQ $32, acc3 |
| MULQ p256const1<>(SB) | | MULQ p256const1<>(SB) |
| SHRQ $32, hlp | | SHRQ $32, hlp |
| ADDQ acc3, acc0 | | ADDQ acc3, acc0 |
| ADCQ hlp, acc1 | | ADCQ hlp, acc1 |
| ADCQ mul0, acc2 | | ADCQ mul0, acc2 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| MOVQ mul1, acc3 | | MOVQ mul1, acc3 |
| 1629 | BYTE $0x48; BYTE $0xc7; BYTE $0xc5; BYTE $0x00; BYTE $0x00; BYTE $0x00; BYTE $0x00 // MOVQ $0, BP | | 1692 | BYTE $0x48; BYTE $0xc7; BYTE $0xc5; BYTE $0x00; BYTE $0x00; BYTE $0x00; BYTE $0x00 // MOVQ $0, BP |
| | | | 1693 | |
| // Add bits [511:256] of the result | | // Add bits [511:256] of the result |
| ADCQ acc0, acc4 | | ADCQ acc0, acc4 |
| ADCQ acc1, acc5 | | ADCQ acc1, acc5 |
| ADCQ acc2, acc6 | | ADCQ acc2, acc6 |
| ADCQ acc3, acc7 | | ADCQ acc3, acc7 |
| ADCQ $0, hlp | | ADCQ $0, hlp |
| | | | 1700 | |
| // Copy result | | // Copy result |
| MOVQ acc4, acc0 | | MOVQ acc4, acc0 |
| MOVQ acc5, acc1 | | MOVQ acc5, acc1 |
| MOVQ acc6, acc2 | | MOVQ acc6, acc2 |
| MOVQ acc7, acc3 | | MOVQ acc7, acc3 |
| | | | 1706 | |
| // Subtract p256 | | // Subtract p256 |
| SUBQ $-1, acc4 | | SUBQ $-1, acc4 |
| 1643 | SBBQ p256const0<>(SB) ,acc5 | | 1709 | SBBQ p256const0<>(SB), acc5 |
| SBBQ $0, acc6 | | SBBQ $0, acc6 |
| SBBQ p256const1<>(SB), acc7 | | SBBQ p256const1<>(SB), acc7 |
| SBBQ $0, hlp | | SBBQ $0, hlp |
| | | | 1713 | |
| // If the result of the subtraction is negative, restore the previous result | | // If the result of the subtraction is negative, restore the previous result |
| CMOVQCS acc0, acc4 | | CMOVQCS acc0, acc4 |
| CMOVQCS acc1, acc5 | | CMOVQCS acc1, acc5 |
| CMOVQCS acc2, acc6 | | CMOVQCS acc2, acc6 |
| CMOVQCS acc3, acc7 | | CMOVQCS acc3, acc7 |
| | | |
| RET | | RET |
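p256MulInternal is a Montgomery multiplication modulo the field prime p, and p256SqrInternal below is the same computation with both operands equal. Because the low limb of p is all ones, -p^-1 mod 2^64 equals 1, so each "reduction step" above can use the low accumulator word directly as the Montgomery factor and form its multiple of p with two shifts plus a single MULQ by p256const1, the top limb of p. In terms of values, continuing the sketch file (p256P and montMul are our names):

    // p256P is the NIST P-256 field prime, 2^256 - 2^224 + 2^192 + 2^96 - 1.
    var p256P, _ = new(big.Int).SetString(
        "ffffffff00000001000000000000000000000000ffffffffffffffffffffffff", 16)

    // montMul mirrors p256MulInternal (and p256SqrInternal with a == b) for
    // Montgomery-domain inputs: it returns a * b * R^-1 mod p, with R = 2^256.
    func montMul(a, b *big.Int) *big.Int {
        rInv := new(big.Int).ModInverse(new(big.Int).Lsh(big.NewInt(1), 256), p256P)
        res := new(big.Int).Mul(a, b)
        res.Mul(res, rInv)
        return res.Mod(res, p256P)
    }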
| | | | 1721 | |
| 1654 | /* ---------------------------------------*/ | | 1722 | // --------------------------------------- |
| 1655 | TEXT p256SqrInternal(SB),NOSPLIT,$0 | | 1723 | TEXT p256SqrInternal(SB), NOSPLIT, $0 |
| | | |
| MOVQ acc4, mul0 | | MOVQ acc4, mul0 |
| MULQ acc5 | | MULQ acc5 |
| MOVQ mul0, acc1 | | MOVQ mul0, acc1 |
| MOVQ mul1, acc2 | | MOVQ mul1, acc2 |
| | | |
| MOVQ acc4, mul0 | | MOVQ acc4, mul0 |
| MULQ acc6 | | MULQ acc6 |
| ADDQ mul0, acc2 | | ADDQ mul0, acc2 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| MOVQ mul1, acc3 | | MOVQ mul1, acc3 |
| | | |
| MOVQ acc4, mul0 | | MOVQ acc4, mul0 |
| MULQ acc7 | | MULQ acc7 |
| ADDQ mul0, acc3 | | ADDQ mul0, acc3 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| MOVQ mul1, t0 | | MOVQ mul1, t0 |
| | | |
| MOVQ acc5, mul0 | | MOVQ acc5, mul0 |
| MULQ acc6 | | MULQ acc6 |
| ADDQ mul0, acc3 | | ADDQ mul0, acc3 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| MOVQ mul1, hlp | | MOVQ mul1, hlp |
| | | |
| MOVQ acc5, mul0 | | MOVQ acc5, mul0 |
| MULQ acc7 | | MULQ acc7 |
| ADDQ hlp, t0 | | ADDQ hlp, t0 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| ADDQ mul0, t0 | | ADDQ mul0, t0 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| MOVQ mul1, t1 | | MOVQ mul1, t1 |
| | | |
| MOVQ acc6, mul0 | | MOVQ acc6, mul0 |
| MULQ acc7 | | MULQ acc7 |
| ADDQ mul0, t1 | | ADDQ mul0, t1 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| MOVQ mul1, t2 | | MOVQ mul1, t2 |
| XORQ t3, t3 | | XORQ t3, t3 |
| | | | 1762 | |
| // *2 | | // *2 |
| ADDQ acc1, acc1 | | ADDQ acc1, acc1 |
| ADCQ acc2, acc2 | | ADCQ acc2, acc2 |
| ADCQ acc3, acc3 | | ADCQ acc3, acc3 |
| ADCQ t0, t0 | | ADCQ t0, t0 |
| ADCQ t1, t1 | | ADCQ t1, t1 |
| ADCQ t2, t2 | | ADCQ t2, t2 |
| ADCQ $0, t3 | | ADCQ $0, t3 |
| | | | 1771 | |
| // Missing products | | // Missing products |
| MOVQ acc4, mul0 | | MOVQ acc4, mul0 |
| MULQ mul0 | | MULQ mul0 |
| MOVQ mul0, acc0 | | MOVQ mul0, acc0 |
| MOVQ DX, acc4 | | MOVQ DX, acc4 |
| | | |
| MOVQ acc5, mul0 | | MOVQ acc5, mul0 |
| MULQ mul0 | | MULQ mul0 |
| ADDQ acc4, acc1 | | ADDQ acc4, acc1 |
| ADCQ mul0, acc2 | | ADCQ mul0, acc2 |
| ADCQ $0, DX | | ADCQ $0, DX |
| MOVQ DX, acc4 | | MOVQ DX, acc4 |
| | | |
| MOVQ acc6, mul0 | | MOVQ acc6, mul0 |
| MULQ mul0 | | MULQ mul0 |
| ADDQ acc4, acc3 | | ADDQ acc4, acc3 |
| ADCQ mul0, t0 | | ADCQ mul0, t0 |
| ADCQ $0, DX | | ADCQ $0, DX |
| MOVQ DX, acc4 | | MOVQ DX, acc4 |
| | | |
| MOVQ acc7, mul0 | | MOVQ acc7, mul0 |
| MULQ mul0 | | MULQ mul0 |
| ADDQ acc4, t1 | | ADDQ acc4, t1 |
| ADCQ mul0, t2 | | ADCQ mul0, t2 |
| ADCQ DX, t3 | | ADCQ DX, t3 |
| | | | 1797 | |
| // First reduction step | | // First reduction step |
| MOVQ acc0, mul0 | | MOVQ acc0, mul0 |
| MOVQ acc0, hlp | | MOVQ acc0, hlp |
| SHLQ $32, acc0 | | SHLQ $32, acc0 |
| MULQ p256const1<>(SB) | | MULQ p256const1<>(SB) |
| SHRQ $32, hlp | | SHRQ $32, hlp |
| ADDQ acc0, acc1 | | ADDQ acc0, acc1 |
| ADCQ hlp, acc2 | | ADCQ hlp, acc2 |
| ADCQ mul0, acc3 | | ADCQ mul0, acc3 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| MOVQ mul1, acc0 | | MOVQ mul1, acc0 |
| | | | 1809 | |
| // Second reduction step | | // Second reduction step |
| MOVQ acc1, mul0 | | MOVQ acc1, mul0 |
| MOVQ acc1, hlp | | MOVQ acc1, hlp |
| SHLQ $32, acc1 | | SHLQ $32, acc1 |
| MULQ p256const1<>(SB) | | MULQ p256const1<>(SB) |
| SHRQ $32, hlp | | SHRQ $32, hlp |
| ADDQ acc1, acc2 | | ADDQ acc1, acc2 |
| ADCQ hlp, acc3 | | ADCQ hlp, acc3 |
| ADCQ mul0, acc0 | | ADCQ mul0, acc0 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| MOVQ mul1, acc1 | | MOVQ mul1, acc1 |
| | | | 1821 | |
| // Third reduction step | | // Third reduction step |
| MOVQ acc2, mul0 | | MOVQ acc2, mul0 |
| MOVQ acc2, hlp | | MOVQ acc2, hlp |
| SHLQ $32, acc2 | | SHLQ $32, acc2 |
| MULQ p256const1<>(SB) | | MULQ p256const1<>(SB) |
| SHRQ $32, hlp | | SHRQ $32, hlp |
| ADDQ acc2, acc3 | | ADDQ acc2, acc3 |
| ADCQ hlp, acc0 | | ADCQ hlp, acc0 |
| ADCQ mul0, acc1 | | ADCQ mul0, acc1 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| MOVQ mul1, acc2 | | MOVQ mul1, acc2 |
| | | | 1833 | |
| // Last reduction step | | // Last reduction step |
| MOVQ acc3, mul0 | | MOVQ acc3, mul0 |
| MOVQ acc3, hlp | | MOVQ acc3, hlp |
| SHLQ $32, acc3 | | SHLQ $32, acc3 |
| MULQ p256const1<>(SB) | | MULQ p256const1<>(SB) |
| SHRQ $32, hlp | | SHRQ $32, hlp |
| ADDQ acc3, acc0 | | ADDQ acc3, acc0 |
| ADCQ hlp, acc1 | | ADCQ hlp, acc1 |
| ADCQ mul0, acc2 | | ADCQ mul0, acc2 |
| ADCQ $0, mul1 | | ADCQ $0, mul1 |
| MOVQ mul1, acc3 | | MOVQ mul1, acc3 |
| 1771 | BYTE $0x48; BYTE $0xc7; BYTE $0xc5; BYTE $0x00; BYTE $0x00; BYTE $0x00; BYTE $0x00 // MOVQ $0, BP | | 1845 | BYTE $0x48; BYTE $0xc7; BYTE $0xc5; BYTE $0x00; BYTE $0x00; BYTE $0x00; BYTE $0x00 // MOVQ $0, BP |
| | | | 1846 | |
| // Add bits [511:256] of the result | | // Add bits [511:256] of the result |
| ADCQ acc0, t0 | | ADCQ acc0, t0 |
| ADCQ acc1, t1 | | ADCQ acc1, t1 |
| ADCQ acc2, t2 | | ADCQ acc2, t2 |
| ADCQ acc3, t3 | | ADCQ acc3, t3 |
| ADCQ $0, hlp | | ADCQ $0, hlp |
| | | | 1853 | |
| // Copy result | | // Copy result |
| MOVQ t0, acc4 | | MOVQ t0, acc4 |
| MOVQ t1, acc5 | | MOVQ t1, acc5 |
| MOVQ t2, acc6 | | MOVQ t2, acc6 |
| MOVQ t3, acc7 | | MOVQ t3, acc7 |
| | | | 1859 | |
| // Subtract p256 | | // Subtract p256 |
| SUBQ $-1, acc4 | | SUBQ $-1, acc4 |
| 1785 | SBBQ p256const0<>(SB) ,acc5 | | 1862 | SBBQ p256const0<>(SB), acc5 |
| SBBQ $0, acc6 | | SBBQ $0, acc6 |
| SBBQ p256const1<>(SB), acc7 | | SBBQ p256const1<>(SB), acc7 |
| SBBQ $0, hlp | | SBBQ $0, hlp |
| | | | 1866 | |
| // If the result of the subtraction is negative, restore the previous result | | // If the result of the subtraction is negative, restore the previous result |
| CMOVQCS t0, acc4 | | CMOVQCS t0, acc4 |
| CMOVQCS t1, acc5 | | CMOVQCS t1, acc5 |
| CMOVQCS t2, acc6 | | CMOVQCS t2, acc6 |
| CMOVQCS t3, acc7 | | CMOVQCS t3, acc7 |
| | | |
| RET | | RET |
| | | | 1874 | |
| 1796 | /* ---------------------------------------*/ | | 1875 | // --------------------------------------- |
| 1797 | #define p256MulBy2Inline\ | | 1876 | #define p256MulBy2Inline \ |
| 1798 | XORQ mul0, mul0;\ | | 1877 | XORQ mul0, mul0; \ |
| 1799 | ADDQ acc4, acc4;\ | | 1878 | ADDQ acc4, acc4; \ |
| 1800 | ADCQ acc5, acc5;\ | | 1879 | ADCQ acc5, acc5; \ |
| 1801 | ADCQ acc6, acc6;\ | | 1880 | ADCQ acc6, acc6; \ |
| 1802 | ADCQ acc7, acc7;\ | | 1881 | ADCQ acc7, acc7; \ |
| 1803 | ADCQ $0, mul0;\ | | 1882 | ADCQ $0, mul0; \ |
| 1804 | MOVQ acc4, t0;\ | | 1883 | MOVQ acc4, t0; \ |
| 1805 | MOVQ acc5, t1;\ | | 1884 | MOVQ acc5, t1; \ |
| 1806 | MOVQ acc6, t2;\ | | 1885 | MOVQ acc6, t2; \ |
| 1807 | MOVQ acc7, t3;\ | | 1886 | MOVQ acc7, t3; \ |
| 1808 | SUBQ $-1, t0;\ | | 1887 | SUBQ $-1, t0; \ |
| 1809 | SBBQ p256const0<>(SB), t1;\ | | 1888 | SBBQ p256const0<>(SB), t1; \ |
| 1810 | SBBQ $0, t2;\ | | 1889 | SBBQ $0, t2; \ |
| 1811 | SBBQ p256const1<>(SB), t3;\ | | 1890 | SBBQ p256const1<>(SB), t3; \ |
| 1812 | SBBQ $0, mul0;\ | | 1891 | SBBQ $0, mul0; \ |
| 1813 | CMOVQCS acc4, t0;\ | | 1892 | CMOVQCS acc4, t0; \ |
| 1814 | CMOVQCS acc5, t1;\ | | 1893 | CMOVQCS acc5, t1; \ |
| 1815 | CMOVQCS acc6, t2;\ | | 1894 | CMOVQCS acc6, t2; \ |
| 1816 | CMOVQCS acc7, t3; | | 1895 | CMOVQCS acc7, t3 |
| | | | 1896 | |
| 1817 | /* ---------------------------------------*/ | | 1897 | // --------------------------------------- |
| #define p256AddInline \ | | #define p256AddInline \ |
| 1819 | XORQ mul0, mul0;\ | | 1899 | XORQ mul0, mul0; \ |
| 1820 | ADDQ t0, acc4;\ | | 1900 | ADDQ t0, acc4; \ |
| 1821 | ADCQ t1, acc5;\ | | 1901 | ADCQ t1, acc5; \ |
| 1822 | ADCQ t2, acc6;\ | | 1902 | ADCQ t2, acc6; \ |
| 1823 | ADCQ t3, acc7;\ | | 1903 | ADCQ t3, acc7; \ |
| 1824 | ADCQ $0, mul0;\ | | 1904 | ADCQ $0, mul0; \ |
| 1825 | MOVQ acc4, t0;\ | | 1905 | MOVQ acc4, t0; \ |
| 1826 | MOVQ acc5, t1;\ | | 1906 | MOVQ acc5, t1; \ |
| 1827 | MOVQ acc6, t2;\ | | 1907 | MOVQ acc6, t2; \ |
| 1828 | MOVQ acc7, t3;\ | | 1908 | MOVQ acc7, t3; \ |
| 1829 | SUBQ $-1, t0;\ | | 1909 | SUBQ $-1, t0; \ |
| 1830 | SBBQ p256const0<>(SB), t1;\ | | 1910 | SBBQ p256const0<>(SB), t1; \ |
| 1831 | SBBQ $0, t2;\ | | 1911 | SBBQ $0, t2; \ |
| 1832 | SBBQ p256const1<>(SB), t3;\ | | 1912 | SBBQ p256const1<>(SB), t3; \ |
| 1833 | SBBQ $0, mul0;\ | | 1913 | SBBQ $0, mul0; \ |
| 1834 | CMOVQCS acc4, t0;\ | | 1914 | CMOVQCS acc4, t0; \ |
| 1835 | CMOVQCS acc5, t1;\ | | 1915 | CMOVQCS acc5, t1; \ |
| 1836 | CMOVQCS acc6, t2;\ | | 1916 | CMOVQCS acc6, t2; \ |
| 1837 | CMOVQCS acc7, t3; | | 1917 | CMOVQCS acc7, t3 |
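p256MulBy2Inline and p256AddInline follow the same pattern: add (or double), record the carry out of the top limb, subtract p from a copy, and keep the subtracted value only when the full 257-bit sum was at least p. A limb-level sketch of the addition, reusing pLimbs from the p256SubInternal sketch (doubling is the same call with b equal to a; addModP is our name):

    // addModP mirrors p256AddInline: t = a + b, reduced by one conditional
    // subtraction of p that also consumes the carry out of the top limb.
    func addModP(a, b [4]uint64) (t [4]uint64) {
        var carry, borrow uint64
        for i := 0; i < 4; i++ {
            t[i], carry = bits.Add64(a[i], b[i], carry)
        }
        var red [4]uint64
        for i := 0; i < 4; i++ {
            red[i], borrow = bits.Sub64(t[i], pLimbs[i], borrow)
        }
        _, borrow = bits.Sub64(carry, 0, borrow) // did the subtraction borrow past the carry bit?
        mask := -borrow                          // all ones iff a + b < p
        for i := 0; i < 4; i++ {
            t[i] = (red[i] &^ mask) | (t[i] & mask)
        }
        return t
    }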
| | | | 1918 | |
| 1838 | /* ---------------------------------------*/ | | 1919 | // --------------------------------------- |
| #define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7 | | #define LDacc(src) MOVQ src(8*0), acc4; MOVQ src(8*1), acc5; MOVQ src(8*2), acc6; MOVQ src(8*3), acc7 |
| #define LDt(src) MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3 | | #define LDt(src) MOVQ src(8*0), t0; MOVQ src(8*1), t1; MOVQ src(8*2), t2; MOVQ src(8*3), t3 |
| #define ST(dst) MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3) | | #define ST(dst) MOVQ acc4, dst(8*0); MOVQ acc5, dst(8*1); MOVQ acc6, dst(8*2); MOVQ acc7, dst(8*3) |
| #define STt(dst) MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3) | | #define STt(dst) MOVQ t0, dst(8*0); MOVQ t1, dst(8*1); MOVQ t2, dst(8*2); MOVQ t3, dst(8*3) |
| #define acc2t MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3 | | #define acc2t MOVQ acc4, t0; MOVQ acc5, t1; MOVQ acc6, t2; MOVQ acc7, t3 |
| #define t2acc MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7 | | #define t2acc MOVQ t0, acc4; MOVQ t1, acc5; MOVQ t2, acc6; MOVQ t3, acc7 |
| 1845 | /* ---------------------------------------*/ | | 1926 | // --------------------------------------- |
| #define x1in(off) (32*0 + off)(SP) | | #define x1in(off) (32*0 + off)(SP) |
| #define y1in(off) (32*1 + off)(SP) | | #define y1in(off) (32*1 + off)(SP) |
| #define z1in(off) (32*2 + off)(SP) | | #define z1in(off) (32*2 + off)(SP) |
| #define x2in(off) (32*3 + off)(SP) | | #define x2in(off) (32*3 + off)(SP) |
| #define y2in(off) (32*4 + off)(SP) | | #define y2in(off) (32*4 + off)(SP) |
| #define xout(off) (32*5 + off)(SP) | | #define xout(off) (32*5 + off)(SP) |
| #define yout(off) (32*6 + off)(SP) | | #define yout(off) (32*6 + off)(SP) |
| #define zout(off) (32*7 + off)(SP) | | #define zout(off) (32*7 + off)(SP) |
| #define s2(off) (32*8 + off)(SP) | | #define s2(off) (32*8 + off)(SP) |
| #define z1sqr(off) (32*9 + off)(SP) | | #define z1sqr(off) (32*9 + off)(SP) |
| #define h(off) (32*10 + off)(SP) | | #define h(off) (32*10 + off)(SP) |
| #define r(off) (32*11 + off)(SP) | | #define r(off) (32*11 + off)(SP) |
| #define hsqr(off) (32*12 + off)(SP) | | #define hsqr(off) (32*12 + off)(SP) |
| #define rsqr(off) (32*13 + off)(SP) | | #define rsqr(off) (32*13 + off)(SP) |
| #define hcub(off) (32*14 + off)(SP) | | #define hcub(off) (32*14 + off)(SP) |
| #define rptr (32*15)(SP) | | #define rptr (32*15)(SP) |
| #define sel_save (32*15 + 8)(SP) | | #define sel_save (32*15 + 8)(SP) |
| #define zero_save (32*15 + 8 + 4)(SP) | | #define zero_save (32*15 + 8 + 4)(SP) |
| | | |
| // func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int) | | // func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int) |
| 1866 | TEXT ·p256PointAddAffineAsm(SB),0,$512-96 | | 1947 | TEXT ·p256PointAddAffineAsm(SB), 0, $512-96 |
| // Move input to stack in order to free registers | | // Move input to stack in order to free registers |
| MOVQ res+0(FP), AX | | MOVQ res+0(FP), AX |
| MOVQ in1+24(FP), BX | | MOVQ in1+24(FP), BX |
| MOVQ in2+48(FP), CX | | MOVQ in2+48(FP), CX |
| MOVQ sign+72(FP), DX | | MOVQ sign+72(FP), DX |
| MOVQ sel+80(FP), t1 | | MOVQ sel+80(FP), t1 |
| MOVQ zero+88(FP), t2 | | MOVQ zero+88(FP), t2 |
| | | |
| MOVOU (16*0)(BX), X0 | | MOVOU (16*0)(BX), X0 |
| MOVOU (16*1)(BX), X1 | | MOVOU (16*1)(BX), X1 |
| MOVOU (16*2)(BX), X2 | | MOVOU (16*2)(BX), X2 |
| MOVOU (16*3)(BX), X3 | | MOVOU (16*3)(BX), X3 |
| MOVOU (16*4)(BX), X4 | | MOVOU (16*4)(BX), X4 |
| MOVOU (16*5)(BX), X5 | | MOVOU (16*5)(BX), X5 |
| | | |
| MOVOU X0, x1in(16*0) | | MOVOU X0, x1in(16*0) |
| MOVOU X1, x1in(16*1) | | MOVOU X1, x1in(16*1) |
| MOVOU X2, y1in(16*0) | | MOVOU X2, y1in(16*0) |
| MOVOU X3, y1in(16*1) | | MOVOU X3, y1in(16*1) |
| MOVOU X4, z1in(16*0) | | MOVOU X4, z1in(16*0) |
| MOVOU X5, z1in(16*1) | | MOVOU X5, z1in(16*1) |
| | | |
| MOVOU (16*0)(CX), X0 | | MOVOU (16*0)(CX), X0 |
| MOVOU (16*1)(CX), X1 | | MOVOU (16*1)(CX), X1 |
| | | |
| MOVOU X0, x2in(16*0) | | MOVOU X0, x2in(16*0) |
| MOVOU X1, x2in(16*1) | | MOVOU X1, x2in(16*1) |
| | | | 1975 | |
| // Store pointer to result | | // Store pointer to result |
| MOVQ mul0, rptr | | MOVQ mul0, rptr |
| MOVL t1, sel_save | | MOVL t1, sel_save |
| MOVL t2, zero_save | | MOVL t2, zero_save |
| | | | 1980 | |
| // Negate y2in based on sign | | // Negate y2in based on sign |
| MOVQ (16*2 + 8*0)(CX), acc4 | | MOVQ (16*2 + 8*0)(CX), acc4 |
| MOVQ (16*2 + 8*1)(CX), acc5 | | MOVQ (16*2 + 8*1)(CX), acc5 |
| MOVQ (16*2 + 8*2)(CX), acc6 | | MOVQ (16*2 + 8*2)(CX), acc6 |
| MOVQ (16*2 + 8*3)(CX), acc7 | | MOVQ (16*2 + 8*3)(CX), acc7 |
| MOVQ $-1, acc0 | | MOVQ $-1, acc0 |
| MOVQ p256const0<>(SB), acc1 | | MOVQ p256const0<>(SB), acc1 |
| MOVQ $0, acc2 | | MOVQ $0, acc2 |
| MOVQ p256const1<>(SB), acc3 | | MOVQ p256const1<>(SB), acc3 |
| XORQ mul0, mul0 | | XORQ mul0, mul0 |
| | | | 1991 | |
| // Speculatively subtract | | // Speculatively subtract |
| SUBQ acc4, acc0 | | SUBQ acc4, acc0 |
| SBBQ acc5, acc1 | | SBBQ acc5, acc1 |
| SBBQ acc6, acc2 | | SBBQ acc6, acc2 |
| SBBQ acc7, acc3 | | SBBQ acc7, acc3 |
| SBBQ $0, mul0 | | SBBQ $0, mul0 |
| MOVQ acc0, t0 | | MOVQ acc0, t0 |
| MOVQ acc1, t1 | | MOVQ acc1, t1 |
| MOVQ acc2, t2 | | MOVQ acc2, t2 |
| MOVQ acc3, t3 | | MOVQ acc3, t3 |
| | | | 2002 | |
| // Add in case the operand was > p256 | | // Add in case the operand was > p256 |
| 1919 | ADDQ $-1, acc0 | | 2004 | ADDQ $-1, acc0 |
| 1920 | ADCQ p256const0<>(SB), acc1 | | 2005 | ADCQ p256const0<>(SB), acc1 |
| 1921 | ADCQ $0, acc2 | | 2006 | ADCQ $0, acc2 |
| 1922 | ADCQ p256const1<>(SB), acc3 | | 2007 | ADCQ p256const1<>(SB), acc3 |
| 1923 | ADCQ $0, mul0 | | 2008 | ADCQ $0, mul0 |
| CMOVQNE t0, acc0 | | CMOVQNE t0, acc0 |
| CMOVQNE t1, acc1 | | CMOVQNE t1, acc1 |
| CMOVQNE t2, acc2 | | CMOVQNE t2, acc2 |
| CMOVQNE t3, acc3 | | CMOVQNE t3, acc3 |
| | | | 2013 | |
| // If condition is 0, keep original value | | // If condition is 0, keep original value |
| 1929 | TESTQ DX, DX | | 2015 | TESTQ DX, DX |
| CMOVQEQ acc4, acc0 | | CMOVQEQ acc4, acc0 |
| CMOVQEQ acc5, acc1 | | CMOVQEQ acc5, acc1 |
| CMOVQEQ acc6, acc2 | | CMOVQEQ acc6, acc2 |
| CMOVQEQ acc7, acc3 | | CMOVQEQ acc7, acc3 |
| | | | 2020 | |
| // Store result | | // Store result |
| MOVQ acc0, y2in(8*0) | | MOVQ acc0, y2in(8*0) |
| MOVQ acc1, y2in(8*1) | | MOVQ acc1, y2in(8*1) |
| MOVQ acc2, y2in(8*2) | | MOVQ acc2, y2in(8*2) |
| MOVQ acc3, y2in(8*3) | | MOVQ acc3, y2in(8*3) |
| | | | 2026 | |
| // Begin point add | | // Begin point add |
| LDacc (z1in) | | LDacc (z1in) |
| 1941 | CALL p256SqrInternal(SB) // z1ˆ2 | | 2029 | CALL p256SqrInternal(SB) // z1ˆ2 |
| ST (z1sqr) | | ST (z1sqr) |
| | | |
| LDt (x2in) | | LDt (x2in) |
| 1945 | CALL p256MulInternal(SB) // x2 * z1ˆ2 | | 2033 | CALL p256MulInternal(SB) // x2 * z1ˆ2 |
| | | |
| LDt (x1in) | | LDt (x1in) |
| 1948 | CALL p256SubInternal(SB) // h = u2 - u1 | | 2036 | CALL p256SubInternal(SB) // h = u2 - u1 |
| ST (h) | | ST (h) |
| | | |
| LDt (z1in) | | LDt (z1in) |
| 1952 | CALL p256MulInternal(SB) // z3 = h * z1 | | 2040 | CALL p256MulInternal(SB) // z3 = h * z1 |
| ST (zout) | | ST (zout) |
| | | |
| LDacc (z1sqr) | | LDacc (z1sqr) |
| 1956 | CALL p256MulInternal(SB) // z1ˆ3 | | 2044 | CALL p256MulInternal(SB) // z1ˆ3 |
| | | |
| LDt (y2in) | | LDt (y2in) |
| 1959 | CALL p256MulInternal(SB) // s2 = y2 * z1ˆ3 | | 2047 | CALL p256MulInternal(SB) // s2 = y2 * z1ˆ3 |
| ST (s2) | | ST (s2) |
| | | |
| LDt (y1in) | | LDt (y1in) |
| 1963 | CALL p256SubInternal(SB) // r = s2 - s1 | | 2051 | CALL p256SubInternal(SB) // r = s2 - s1 |
| ST (r) | | ST (r) |
| | | |
| 1966 | CALL p256SqrInternal(SB) // rsqr = rˆ2 | | 2054 | CALL p256SqrInternal(SB) // rsqr = rˆ2 |
| ST (rsqr) | | ST (rsqr) |
| | | |
| LDacc (h) | | LDacc (h) |
| 1970 | CALL p256SqrInternal(SB) // hsqr = hˆ2 | | 2058 | CALL p256SqrInternal(SB) // hsqr = hˆ2 |
| ST (hsqr) | | ST (hsqr) |
| | | |
| LDt (h) | | LDt (h) |
| 1974 | CALL p256MulInternal(SB) // hcub = hˆ3 | | 2062 | CALL p256MulInternal(SB) // hcub = hˆ3 |
| ST (hcub) | | ST (hcub) |
| | | |
| LDt (y1in) | | LDt (y1in) |
| 1978 | CALL p256MulInternal(SB) // y1 * hˆ3 | | 2066 | CALL p256MulInternal(SB) // y1 * hˆ3 |
| ST (s2) | | ST (s2) |
| | | |
| LDacc (x1in) | | LDacc (x1in) |
| LDt (hsqr) | | LDt (hsqr) |
| 1983 | CALL p256MulInternal(SB) // u1 * hˆ2 | | 2071 | CALL p256MulInternal(SB) // u1 * hˆ2 |
| ST (h) | | ST (h) |
| | | |
| 1986 | p256MulBy2Inline // u1 * hˆ2 * 2, inline | | 2074 | p256MulBy2Inline // u1 * hˆ2 * 2, inline |
| LDacc (rsqr) | | LDacc (rsqr) |
| 1988 | CALL p256SubInternal(SB) // rˆ2 - u1 * hˆ2 * 2 | | 2076 | CALL p256SubInternal(SB) // rˆ2 - u1 * hˆ2 * 2 |
| | | |
| LDt (hcub) | | LDt (hcub) |
| CALL p256SubInternal(SB) | | CALL p256SubInternal(SB) |
| ST (xout) | | ST (xout) |
| | | |
| MOVQ acc4, t0 | | MOVQ acc4, t0 |
| MOVQ acc5, t1 | | MOVQ acc5, t1 |
| MOVQ acc6, t2 | | MOVQ acc6, t2 |
| MOVQ acc7, t3 | | MOVQ acc7, t3 |
| LDacc (h) | | LDacc (h) |
| CALL p256SubInternal(SB) | | CALL p256SubInternal(SB) |
| | | |
| LDt (r) | | LDt (r) |
| CALL p256MulInternal(SB) | | CALL p256MulInternal(SB) |
| | | |
| LDt (s2) | | LDt (s2) |
| CALL p256SubInternal(SB) | | CALL p256SubInternal(SB) |
| ST (yout) | | ST (yout) |
| | | | 2095 | |
| // Load stored values from stack | | // Load stored values from stack |
| MOVQ rptr, AX | | MOVQ rptr, AX |
| MOVL sel_save, BX | | MOVL sel_save, BX |
| MOVL zero_save, CX | | MOVL zero_save, CX |
| | | | 2100 | |
| // The result is not valid if (sel == 0), conditional choose | | // The result is not valid if (sel == 0), conditional choose |
| MOVOU xout(16*0), X0 | | MOVOU xout(16*0), X0 |
| MOVOU xout(16*1), X1 | | MOVOU xout(16*1), X1 |
| MOVOU yout(16*0), X2 | | MOVOU yout(16*0), X2 |
| MOVOU yout(16*1), X3 | | MOVOU yout(16*1), X3 |
| MOVOU zout(16*0), X4 | | MOVOU zout(16*0), X4 |
| MOVOU zout(16*1), X5 | | MOVOU zout(16*1), X5 |
| | | |
| MOVL BX, X6 | | MOVL BX, X6 |
| MOVL CX, X7 | | MOVL CX, X7 |
| | | |
| 2022 | PXOR X8, X8 | | 2112 | PXOR X8, X8 |
| PCMPEQL X9, X9 | | PCMPEQL X9, X9 |
| | | |
| PSHUFD $0, X6, X6 | | PSHUFD $0, X6, X6 |
| PSHUFD $0, X7, X7 | | PSHUFD $0, X7, X7 |
| | | |
| PCMPEQL X8, X6 | | PCMPEQL X8, X6 |
| PCMPEQL X8, X7 | | PCMPEQL X8, X7 |
| | | |
| MOVOU X6, X15 | | MOVOU X6, X15 |
| PANDN X9, X15 | | PANDN X9, X15 |
| | | |
| MOVOU x1in(16*0), X9 | | MOVOU x1in(16*0), X9 |
| MOVOU x1in(16*1), X10 | | MOVOU x1in(16*1), X10 |
| MOVOU y1in(16*0), X11 | | MOVOU y1in(16*0), X11 |
| MOVOU y1in(16*1), X12 | | MOVOU y1in(16*1), X12 |
| MOVOU z1in(16*0), X13 | | MOVOU z1in(16*0), X13 |
| MOVOU z1in(16*1), X14 | | MOVOU z1in(16*1), X14 |
| | | |
| PAND X15, X0 | | PAND X15, X0 |
| PAND X15, X1 | | PAND X15, X1 |
| PAND X15, X2 | | PAND X15, X2 |
| PAND X15, X3 | | PAND X15, X3 |
| PAND X15, X4 | | PAND X15, X4 |
| PAND X15, X5 | | PAND X15, X5 |
| | | |
| PAND X6, X9 | | PAND X6, X9 |
| PAND X6, X10 | | PAND X6, X10 |
| PAND X6, X11 | | PAND X6, X11 |
| PAND X6, X12 | | PAND X6, X12 |
| PAND X6, X13 | | PAND X6, X13 |
| PAND X6, X14 | | PAND X6, X14 |
| | | |
| PXOR X9, X0 | | PXOR X9, X0 |
| PXOR X10, X1 | | PXOR X10, X1 |
| PXOR X11, X2 | | PXOR X11, X2 |
| PXOR X12, X3 | | PXOR X12, X3 |
| PXOR X13, X4 | | PXOR X13, X4 |
| PXOR X14, X5 | | PXOR X14, X5 |
| | | | 2151 | |
| // Similarly if zero == 0 | | // Similarly if zero == 0 |
| PCMPEQL X9, X9 | | PCMPEQL X9, X9 |
| 2063 | MOVOU X7, X15 | | 2154 | MOVOU X7, X15 |
| 2064 | PANDN X9, X15 | | 2155 | PANDN X9, X15 |
| | | |
| MOVOU x2in(16*0), X9 | | MOVOU x2in(16*0), X9 |
| MOVOU x2in(16*1), X10 | | MOVOU x2in(16*1), X10 |
| MOVOU y2in(16*0), X11 | | MOVOU y2in(16*0), X11 |
| MOVOU y2in(16*1), X12 | | MOVOU y2in(16*1), X12 |
| MOVOU p256one<>+0x00(SB), X13 | | MOVOU p256one<>+0x00(SB), X13 |
| MOVOU p256one<>+0x10(SB), X14 | | MOVOU p256one<>+0x10(SB), X14 |
| | | |
| PAND X15, X0 | | PAND X15, X0 |
| PAND X15, X1 | | PAND X15, X1 |
| PAND X15, X2 | | PAND X15, X2 |
| PAND X15, X3 | | PAND X15, X3 |
| PAND X15, X4 | | PAND X15, X4 |
| PAND X15, X5 | | PAND X15, X5 |
| | | |
| PAND X7, X9 | | PAND X7, X9 |
| PAND X7, X10 | | PAND X7, X10 |
| PAND X7, X11 | | PAND X7, X11 |
| PAND X7, X12 | | PAND X7, X12 |
| PAND X7, X13 | | PAND X7, X13 |
| PAND X7, X14 | | PAND X7, X14 |
| | | |
| PXOR X9, X0 | | PXOR X9, X0 |
| PXOR X10, X1 | | PXOR X10, X1 |
| PXOR X11, X2 | | PXOR X11, X2 |
| PXOR X12, X3 | | PXOR X12, X3 |
| PXOR X13, X4 | | PXOR X13, X4 |
| PXOR X14, X5 | | PXOR X14, X5 |
| | | | 2184 | |
| // Finally output the result | | // Finally output the result |
| MOVOU X0, (16*0)(AX) | | MOVOU X0, (16*0)(AX) |
| MOVOU X1, (16*1)(AX) | | MOVOU X1, (16*1)(AX) |
| MOVOU X2, (16*2)(AX) | | MOVOU X2, (16*2)(AX) |
| MOVOU X3, (16*3)(AX) | | MOVOU X3, (16*3)(AX) |
| MOVOU X4, (16*4)(AX) | | MOVOU X4, (16*4)(AX) |
| MOVOU X5, (16*5)(AX) | | MOVOU X5, (16*5)(AX) |
| 2100 | MOVQ $0, rptr | | 2192 | MOVQ $0, rptr |
| | | |
| RET | | RET |
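The CALL sequence in p256PointAddAffineAsm is the standard mixed addition of a Jacobian point (x1, y1, z1) and an affine point (x2, y2): with z2 = 1, u1 = x1 and s1 = y1, so only u2, s2, h and r need computing. The PAND/PXOR block at the end is a constant-time select: when sel == 0 the routine returns the first input unchanged, and when zero == 0 it returns (x2, y2, 1) instead, because the addition formulas do not cover the point at infinity. Sketching the algebra on ordinary residues, while the assembly performs the same steps in the Montgomery domain (modP and addAffine are our names, reusing p256P from an earlier sketch):

    // modP reduces into [0, p) so the formula sketches stay readable.
    func modP(v *big.Int) *big.Int { return v.Mod(v, p256P) }

    // addAffine mirrors the CALL sequence above: mixed Jacobian + affine addition.
    func addAffine(x1, y1, z1, x2, y2 *big.Int) (x3, y3, z3 *big.Int) {
        z1z1 := modP(new(big.Int).Mul(z1, z1))                             // z1^2
        u2 := modP(new(big.Int).Mul(x2, z1z1))                             // x2 * z1^2
        h := modP(new(big.Int).Sub(u2, x1))                                // h = u2 - u1
        z3 = modP(new(big.Int).Mul(h, z1))                                 // z3 = h * z1
        s2 := modP(new(big.Int).Mul(y2, modP(new(big.Int).Mul(z1z1, z1)))) // s2 = y2 * z1^3
        r := modP(new(big.Int).Sub(s2, y1))                                // r = s2 - s1
        hh := modP(new(big.Int).Mul(h, h))                                 // h^2
        hhh := modP(new(big.Int).Mul(hh, h))                               // h^3
        v := modP(new(big.Int).Mul(x1, hh))                                // u1 * h^2
        x3 = modP(new(big.Int).Sub(modP(new(big.Int).Mul(r, r)),
            new(big.Int).Add(hhh, new(big.Int).Lsh(v, 1))))                // r^2 - h^3 - 2*u1*h^2
        y3 = modP(new(big.Int).Sub(modP(new(big.Int).Mul(r, new(big.Int).Sub(v, x3))),
            modP(new(big.Int).Mul(y1, hhh))))                              // r*(u1*h^2 - x3) - y1*h^3
        return
    }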
| | | | 2195 | |
| #undef x1in | | #undef x1in |
| #undef y1in | | #undef y1in |
| #undef z1in | | #undef z1in |
| #undef x2in | | #undef x2in |
| #undef y2in | | #undef y2in |
| #undef xout | | #undef xout |
| #undef yout | | #undef yout |
| #undef zout | | #undef zout |
| #undef s2 | | #undef s2 |
| #undef z1sqr | | #undef z1sqr |
| #undef h | | #undef h |
| #undef r | | #undef r |
| #undef hsqr | | #undef hsqr |
| #undef rsqr | | #undef rsqr |
| #undef hcub | | #undef hcub |
| #undef rptr | | #undef rptr |
| #undef sel_save | | #undef sel_save |
| #undef zero_save | | #undef zero_save |
| 2121 | /* ---------------------------------------*/ | | 2214 | // --------------------------------------- |
| #define x1in(off) (32*0 + off)(SP) | | #define x1in(off) (32*0 + off)(SP) |
| #define y1in(off) (32*1 + off)(SP) | | #define y1in(off) (32*1 + off)(SP) |
| #define z1in(off) (32*2 + off)(SP) | | #define z1in(off) (32*2 + off)(SP) |
| #define x2in(off) (32*3 + off)(SP) | | #define x2in(off) (32*3 + off)(SP) |
| #define y2in(off) (32*4 + off)(SP) | | #define y2in(off) (32*4 + off)(SP) |
| #define z2in(off) (32*5 + off)(SP) | | #define z2in(off) (32*5 + off)(SP) |
| | | |
| #define xout(off) (32*6 + off)(SP) | | #define xout(off) (32*6 + off)(SP) |
| #define yout(off) (32*7 + off)(SP) | | #define yout(off) (32*7 + off)(SP) |
| #define zout(off) (32*8 + off)(SP) | | #define zout(off) (32*8 + off)(SP) |
| | | |
| #define u1(off) (32*9 + off)(SP) | | #define u1(off) (32*9 + off)(SP) |
| #define u2(off) (32*10 + off)(SP) | | #define u2(off) (32*10 + off)(SP) |
| #define s1(off) (32*11 + off)(SP) | | #define s1(off) (32*11 + off)(SP) |
| #define s2(off) (32*12 + off)(SP) | | #define s2(off) (32*12 + off)(SP) |
| #define z1sqr(off) (32*13 + off)(SP) | | #define z1sqr(off) (32*13 + off)(SP) |
| #define z2sqr(off) (32*14 + off)(SP) | | #define z2sqr(off) (32*14 + off)(SP) |
| #define h(off) (32*15 + off)(SP) | | #define h(off) (32*15 + off)(SP) |
| #define r(off) (32*16 + off)(SP) | | #define r(off) (32*16 + off)(SP) |
| #define hsqr(off) (32*17 + off)(SP) | | #define hsqr(off) (32*17 + off)(SP) |
| #define rsqr(off) (32*18 + off)(SP) | | #define rsqr(off) (32*18 + off)(SP) |
| #define hcub(off) (32*19 + off)(SP) | | #define hcub(off) (32*19 + off)(SP) |
| #define rptr (32*20)(SP) | | #define rptr (32*20)(SP) |
| | | |
| 2146 | //func p256PointAddAsm(res, in1, in2 []uint64) | | 2239 | // func p256PointAddAsm(res, in1, in2 []uint64) |
| 2147 | TEXT ·p256PointAddAsm(SB),0,$672-76 | | 2240 | TEXT ·p256PointAddAsm(SB), 0, $672-76 |
| // Move input to stack in order to free registers | | // Move input to stack in order to free registers |
| MOVQ res+0(FP), AX | | MOVQ res+0(FP), AX |
| MOVQ in1+24(FP), BX | | MOVQ in1+24(FP), BX |
| MOVQ in2+48(FP), CX | | MOVQ in2+48(FP), CX |
| | | |
| MOVOU (16*0)(BX), X0 | | MOVOU (16*0)(BX), X0 |
| MOVOU (16*1)(BX), X1 | | MOVOU (16*1)(BX), X1 |
| MOVOU (16*2)(BX), X2 | | MOVOU (16*2)(BX), X2 |
| MOVOU (16*3)(BX), X3 | | MOVOU (16*3)(BX), X3 |
| MOVOU (16*4)(BX), X4 | | MOVOU (16*4)(BX), X4 |
| MOVOU (16*5)(BX), X5 | | MOVOU (16*5)(BX), X5 |
| | | |
| MOVOU X0, x1in(16*0) | | MOVOU X0, x1in(16*0) |
| MOVOU X1, x1in(16*1) | | MOVOU X1, x1in(16*1) |
| MOVOU X2, y1in(16*0) | | MOVOU X2, y1in(16*0) |
| MOVOU X3, y1in(16*1) | | MOVOU X3, y1in(16*1) |
| MOVOU X4, z1in(16*0) | | MOVOU X4, z1in(16*0) |
| MOVOU X5, z1in(16*1) | | MOVOU X5, z1in(16*1) |
| | | |
| MOVOU (16*0)(CX), X0 | | MOVOU (16*0)(CX), X0 |
| MOVOU (16*1)(CX), X1 | | MOVOU (16*1)(CX), X1 |
| MOVOU (16*2)(CX), X2 | | MOVOU (16*2)(CX), X2 |
| MOVOU (16*3)(CX), X3 | | MOVOU (16*3)(CX), X3 |
| MOVOU (16*4)(CX), X4 | | MOVOU (16*4)(CX), X4 |
| MOVOU (16*5)(CX), X5 | | MOVOU (16*5)(CX), X5 |
| | | |
| MOVOU X0, x2in(16*0) | | MOVOU X0, x2in(16*0) |
| MOVOU X1, x2in(16*1) | | MOVOU X1, x2in(16*1) |
| MOVOU X2, y2in(16*0) | | MOVOU X2, y2in(16*0) |
| MOVOU X3, y2in(16*1) | | MOVOU X3, y2in(16*1) |
| MOVOU X4, z2in(16*0) | | MOVOU X4, z2in(16*0) |
| MOVOU X5, z2in(16*1) | | MOVOU X5, z2in(16*1) |
| | | | 2273 | |
| // Store pointer to result | | // Store pointer to result |
| MOVQ AX, rptr | | MOVQ AX, rptr |
| | | | 2276 | |
| // Begin point add | | // Begin point add |
| LDacc (z2in) | | LDacc (z2in) |
| 2184 | CALL p256SqrInternal(SB) // z2ˆ2 | | 2279 | CALL p256SqrInternal(SB) // z2ˆ2 |
| ST (z2sqr) | | ST (z2sqr) |
| LDt (z2in) | | LDt (z2in) |
| 2187 | CALL p256MulInternal(SB) // z2ˆ3 | | 2282 | CALL p256MulInternal(SB) // z2ˆ3 |
| LDt (y1in) | | LDt (y1in) |
| 2189 | CALL p256MulInternal(SB) // s1 = z2ˆ3*y1 | | 2284 | CALL p256MulInternal(SB) // s1 = z2ˆ3*y1 |
| ST (s1) | | ST (s1) |
| | | |
| LDacc (z1in) | | LDacc (z1in) |
| 2193 | CALL p256SqrInternal(SB) // z1ˆ2 | | 2288 | CALL p256SqrInternal(SB) // z1ˆ2 |
| ST (z1sqr) | | ST (z1sqr) |
| LDt (z1in) | | LDt (z1in) |
| 2196 | CALL p256MulInternal(SB) // z1ˆ3 | | 2291 | CALL p256MulInternal(SB) // z1ˆ3 |
| LDt (y2in) | | LDt (y2in) |
| 2198 | CALL p256MulInternal(SB) // s2 = z1ˆ3*y2 | | 2293 | CALL p256MulInternal(SB) // s2 = z1ˆ3*y2 |
| ST (s2) | | ST (s2) |
| | | |
| LDt (s1) | | LDt (s1) |
| 2202 | CALL p256SubInternal(SB) // r = s2 - s1 | | 2297 | CALL p256SubInternal(SB) // r = s2 - s1 |
| ST (r) | | ST (r) |
| | | |
| LDacc (z2sqr) | | LDacc (z2sqr) |
| LDt (x1in) | | LDt (x1in) |
| 2207 | CALL p256MulInternal(SB) // u1 = x1 * z2ˆ2 | | 2302 | CALL p256MulInternal(SB) // u1 = x1 * z2ˆ2 |
| ST (u1) | | ST (u1) |
| LDacc (z1sqr) | | LDacc (z1sqr) |
| LDt (x2in) | | LDt (x2in) |
| 2211 | CALL p256MulInternal(SB) // u2 = x2 * z1ˆ2 | | 2306 | CALL p256MulInternal(SB) // u2 = x2 * z1ˆ2 |
| ST (u2) | | ST (u2) |
| | | |
| LDt (u1) | | LDt (u1) |
| 2215 | CALL p256SubInternal(SB) // h = u2 - u1 | | 2310 | CALL p256SubInternal(SB) // h = u2 - u1 |
| ST (h) | | ST (h) |
| | | |
| LDacc (r) | | LDacc (r) |
| 2219 | CALL p256SqrInternal(SB) // rsqr = rˆ2 | | 2314 | CALL p256SqrInternal(SB) // rsqr = rˆ2 |
| ST (rsqr) | | ST (rsqr) |
| | | |
| LDacc (h) | | LDacc (h) |
| 2223 | CALL p256SqrInternal(SB) // hsqr = hˆ2 | | 2318 | CALL p256SqrInternal(SB) // hsqr = hˆ2 |
| ST (hsqr) | | ST (hsqr) |
| | | |
| LDt (h) | | LDt (h) |
| 2227 | CALL p256MulInternal(SB) // hcub = hˆ3 | | 2322 | CALL p256MulInternal(SB) // hcub = hˆ3 |
| ST (hcub) | | ST (hcub) |
| | | |
| LDt (s1) | | LDt (s1) |
| CALL p256MulInternal(SB) | | CALL p256MulInternal(SB) |
| ST (s2) | | ST (s2) |
| | | |
| LDacc (z1in) | | LDacc (z1in) |
| LDt (z2in) | | LDt (z2in) |
| 2236 | CALL p256MulInternal(SB) // z1 * z2 | | 2331 | CALL p256MulInternal(SB) // z1 * z2 |
| LDt (h) | | LDt (h) |
| 2238 | CALL p256MulInternal(SB) // z1 * z2 * h | | 2333 | CALL p256MulInternal(SB) // z1 * z2 * h |
| ST (zout) | | ST (zout) |
| | | |
| LDacc (hsqr) | | LDacc (hsqr) |
| LDt (u1) | | LDt (u1) |
| 2243 | CALL p256MulInternal(SB) // hˆ2 * u1 | | 2338 | CALL p256MulInternal(SB) // hˆ2 * u1 |
| ST (u2) | | ST (u2) |
| | | |
| 2246 | p256MulBy2Inline // u1 * hˆ2 * 2, inline | | 2341 | p256MulBy2Inline // u1 * hˆ2 * 2, inline |
| LDacc (rsqr) | | LDacc (rsqr) |
| 2248 | CALL p256SubInternal(SB) // rˆ2 - u1 * hˆ2 * 2 | | 2343 | CALL p256SubInternal(SB) // rˆ2 - u1 * hˆ2 * 2 |
| | | |
| LDt (hcub) | | LDt (hcub) |
| CALL p256SubInternal(SB) | | CALL p256SubInternal(SB) |
| ST (xout) | | ST (xout) |
| | | |
| MOVQ acc4, t0 | | MOVQ acc4, t0 |
| MOVQ acc5, t1 | | MOVQ acc5, t1 |
| MOVQ acc6, t2 | | MOVQ acc6, t2 |
| MOVQ acc7, t3 | | MOVQ acc7, t3 |
| LDacc (u2) | | LDacc (u2) |
| CALL p256SubInternal(SB) | | CALL p256SubInternal(SB) |
| | | |
| LDt (r) | | LDt (r) |
| CALL p256MulInternal(SB) | | CALL p256MulInternal(SB) |
| | | |
| LDt (s2) | | LDt (s2) |
| CALL p256SubInternal(SB) | | CALL p256SubInternal(SB) |
| ST (yout) | | ST (yout) |
| | | |
| MOVOU xout(16*0), X0 | | MOVOU xout(16*0), X0 |
| MOVOU xout(16*1), X1 | | MOVOU xout(16*1), X1 |
| MOVOU yout(16*0), X2 | | MOVOU yout(16*0), X2 |
| MOVOU yout(16*1), X3 | | MOVOU yout(16*1), X3 |
| MOVOU zout(16*0), X4 | | MOVOU zout(16*0), X4 |
| MOVOU zout(16*1), X5 | | MOVOU zout(16*1), X5 |
| | | | 2369 | |
| // Finally output the result | | // Finally output the result |
| 2275 | MOVQ rptr, AX | | 2371 | MOVQ rptr, AX |
| 2276 | MOVQ $0, rptr | | 2372 | MOVQ $0, rptr |
| MOVOU X0, (16*0)(AX) | | MOVOU X0, (16*0)(AX) |
| MOVOU X1, (16*1)(AX) | | MOVOU X1, (16*1)(AX) |
| MOVOU X2, (16*2)(AX) | | MOVOU X2, (16*2)(AX) |
| MOVOU X3, (16*3)(AX) | | MOVOU X3, (16*3)(AX) |
| MOVOU X4, (16*4)(AX) | | MOVOU X4, (16*4)(AX) |
| MOVOU X5, (16*5)(AX) | | MOVOU X5, (16*5)(AX) |
| | | |
| RET | | RET |
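p256PointAddAsm is the full Jacobian-Jacobian version of the same addition. Like the routine above, the formula sequence does not cover the doubling or infinity cases, which the caller has to handle. Continuing the sketch with the modP helper (addJacobian is our name):

    // addJacobian mirrors the CALL sequence in p256PointAddAsm.
    func addJacobian(x1, y1, z1, x2, y2, z2 *big.Int) (x3, y3, z3 *big.Int) {
        z1z1 := modP(new(big.Int).Mul(z1, z1))                             // z1^2
        z2z2 := modP(new(big.Int).Mul(z2, z2))                             // z2^2
        u1 := modP(new(big.Int).Mul(x1, z2z2))                             // u1 = x1 * z2^2
        u2 := modP(new(big.Int).Mul(x2, z1z1))                             // u2 = x2 * z1^2
        s1 := modP(new(big.Int).Mul(y1, modP(new(big.Int).Mul(z2z2, z2)))) // s1 = y1 * z2^3
        s2 := modP(new(big.Int).Mul(y2, modP(new(big.Int).Mul(z1z1, z1)))) // s2 = y2 * z1^3
        h := modP(new(big.Int).Sub(u2, u1))                                // h = u2 - u1
        r := modP(new(big.Int).Sub(s2, s1))                                // r = s2 - s1
        hh := modP(new(big.Int).Mul(h, h))                                 // h^2
        hhh := modP(new(big.Int).Mul(hh, h))                               // h^3
        v := modP(new(big.Int).Mul(u1, hh))                                // u1 * h^2
        z3 = modP(new(big.Int).Mul(modP(new(big.Int).Mul(z1, z2)), h))     // z3 = z1 * z2 * h
        x3 = modP(new(big.Int).Sub(modP(new(big.Int).Mul(r, r)),
            new(big.Int).Add(hhh, new(big.Int).Lsh(v, 1))))                // r^2 - h^3 - 2*u1*h^2
        y3 = modP(new(big.Int).Sub(modP(new(big.Int).Mul(r, new(big.Int).Sub(v, x3))),
            modP(new(big.Int).Mul(s1, hhh))))                              // r*(v - x3) - s1*h^3
        return
    }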
| | | | 2381 | |
| #undef x1in | | #undef x1in |
| #undef y1in | | #undef y1in |
| #undef z1in | | #undef z1in |
| #undef x2in | | #undef x2in |
| #undef y2in | | #undef y2in |
| #undef z2in | | #undef z2in |
| #undef xout | | #undef xout |
| #undef yout | | #undef yout |
| #undef zout | | #undef zout |
| #undef s1 | | #undef s1 |
| #undef s2 | | #undef s2 |
| #undef u1 | | #undef u1 |
| #undef u2 | | #undef u2 |
| #undef z1sqr | | #undef z1sqr |
| #undef z2sqr | | #undef z2sqr |
| #undef h | | #undef h |
| #undef r | | #undef r |
| #undef hsqr | | #undef hsqr |
| #undef rsqr | | #undef rsqr |
| #undef hcub | | #undef hcub |
| #undef rptr | | #undef rptr |
| 2306 | /* ---------------------------------------*/ | | 2403 | // --------------------------------------- |
| #define x(off) (32*0 + off)(SP) | | #define x(off) (32*0 + off)(SP) |
| #define y(off) (32*1 + off)(SP) | | #define y(off) (32*1 + off)(SP) |
| #define z(off) (32*2 + off)(SP) | | #define z(off) (32*2 + off)(SP) |
| | | |
| #define s(off) (32*3 + off)(SP) | | #define s(off) (32*3 + off)(SP) |
| #define m(off) (32*4 + off)(SP) | | #define m(off) (32*4 + off)(SP) |
| #define zsqr(off) (32*5 + off)(SP) | | #define zsqr(off) (32*5 + off)(SP) |
| #define tmp(off) (32*6 + off)(SP) | | #define tmp(off) (32*6 + off)(SP) |
| #define rptr (32*7)(SP) | | #define rptr (32*7)(SP) |
| | | |
| 2317 | //func p256PointDoubleAsm(res, in []uint64) | | 2414 | // func p256PointDoubleAsm(res, in []uint64) |
| 2318 | TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$256-48 | | 2415 | TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $256-48 |
| // Move input to stack in order to free registers | | // Move input to stack in order to free registers |
| MOVQ res+0(FP), AX | | MOVQ res+0(FP), AX |
| MOVQ in+24(FP), BX | | MOVQ in+24(FP), BX |
| | | |
| MOVOU (16*0)(BX), X0 | | MOVOU (16*0)(BX), X0 |
| MOVOU (16*1)(BX), X1 | | MOVOU (16*1)(BX), X1 |
| MOVOU (16*2)(BX), X2 | | MOVOU (16*2)(BX), X2 |
| MOVOU (16*3)(BX), X3 | | MOVOU (16*3)(BX), X3 |
| MOVOU (16*4)(BX), X4 | | MOVOU (16*4)(BX), X4 |
| MOVOU (16*5)(BX), X5 | | MOVOU (16*5)(BX), X5 |
| | | |
| MOVOU X0, x(16*0) | | MOVOU X0, x(16*0) |
| MOVOU X1, x(16*1) | | MOVOU X1, x(16*1) |
| MOVOU X2, y(16*0) | | MOVOU X2, y(16*0) |
| MOVOU X3, y(16*1) | | MOVOU X3, y(16*1) |
| MOVOU X4, z(16*0) | | MOVOU X4, z(16*0) |
| MOVOU X5, z(16*1) | | MOVOU X5, z(16*1) |
| | | | 2433 | |
| // Store pointer to result | | // Store pointer to result |
| MOVQ AX, rptr | | MOVQ AX, rptr |
| | | | 2436 | |
| // Begin point double | | // Begin point double |
| LDacc (z) | | LDacc (z) |
| CALL p256SqrInternal(SB) | | CALL p256SqrInternal(SB) |
| ST (zsqr) | | ST (zsqr) |
| | | |
| LDt (x) | | LDt (x) |
| p256AddInline | | p256AddInline |
| STt (m) | | STt (m) |
| | | |
| LDacc (z) | | LDacc (z) |
| LDt (y) | | LDt (y) |
| CALL p256MulInternal(SB) | | CALL p256MulInternal(SB) |
| p256MulBy2Inline | | p256MulBy2Inline |
| MOVQ rptr, AX | | MOVQ rptr, AX |
| | | | 2451 | |
| // Store z | | // Store z |
| MOVQ t0, (16*4 + 8*0)(AX) | | MOVQ t0, (16*4 + 8*0)(AX) |
| MOVQ t1, (16*4 + 8*1)(AX) | | MOVQ t1, (16*4 + 8*1)(AX) |
| MOVQ t2, (16*4 + 8*2)(AX) | | MOVQ t2, (16*4 + 8*2)(AX) |
| MOVQ t3, (16*4 + 8*3)(AX) | | MOVQ t3, (16*4 + 8*3)(AX) |
| | | |
| LDacc (x) | | LDacc (x) |
| LDt (zsqr) | | LDt (zsqr) |
| CALL p256SubInternal(SB) | | CALL p256SubInternal(SB) |
| LDt (m) | | LDt (m) |
| CALL p256MulInternal(SB) | | CALL p256MulInternal(SB) |
| ST (m) | | ST (m) |
| | | | 2464 | |
| // Multiply by 3 | | // Multiply by 3 |
| p256MulBy2Inline | | p256MulBy2Inline |
| LDacc (m) | | LDacc (m) |
| p256AddInline | | p256AddInline |
| STt (m) | | STt (m) |
| | | | 2470 | |
| //////////////////////// | | //////////////////////// |
| LDacc (y) | | LDacc (y) |
| p256MulBy2Inline | | p256MulBy2Inline |
| t2acc | | t2acc |
| 2373 | CALL p256SqrInternal(SB) | | 2475 | CALL p256SqrInternal(SB) |
| ST (s) | | ST (s) |
| 2375 | CALL p256SqrInternal(SB) | | 2477 | CALL p256SqrInternal(SB) |
| | | | 2478 | |
| // Divide by 2 | | // Divide by 2 |
| XORQ mul0, mul0 | | XORQ mul0, mul0 |
| MOVQ acc4, t0 | | MOVQ acc4, t0 |
| MOVQ acc5, t1 | | MOVQ acc5, t1 |
| MOVQ acc6, t2 | | MOVQ acc6, t2 |
| MOVQ acc7, t3 | | MOVQ acc7, t3 |
| | | |
| 2383 | ADDQ $-1, acc4 | | 2486 | ADDQ $-1, acc4 |
| 2384 | ADCQ p256const0<>(SB), acc5 | | 2487 | ADCQ p256const0<>(SB), acc5 |
| 2385 | ADCQ $0, acc6 | | 2488 | ADCQ $0, acc6 |
| 2386 | ADCQ p256const1<>(SB), acc7 | | 2489 | ADCQ p256const1<>(SB), acc7 |
| 2387 | ADCQ $0, mul0 | | 2490 | ADCQ $0, mul0 |
| TESTQ $1, t0 | | TESTQ $1, t0 |
| | | |
| CMOVQEQ t0, acc4 | | CMOVQEQ t0, acc4 |
| CMOVQEQ t1, acc5 | | CMOVQEQ t1, acc5 |
| CMOVQEQ t2, acc6 | | CMOVQEQ t2, acc6 |
| CMOVQEQ t3, acc7 | | CMOVQEQ t3, acc7 |
| 2394 | ANDQ t0, mul0 | | 2497 | ANDQ t0, mul0 |
| | | |
| SHRQ $1, acc4:acc5 | | SHRQ $1, acc4:acc5 |
| SHRQ $1, acc5:acc6 | | SHRQ $1, acc5:acc6 |
| SHRQ $1, acc6:acc7 | | SHRQ $1, acc6:acc7 |
| SHRQ $1, acc7:mul0 | | SHRQ $1, acc7:mul0 |
| ST (y) | | ST (y) |
| | | | 2504 | |
| ///////////////////////// | | ///////////////////////// |
| LDacc (x) | | LDacc (x) |
| LDt (s) | | LDt (s) |
| CALL p256MulInternal(SB) | | CALL p256MulInternal(SB) |
| ST (s) | | ST (s) |
| p256MulBy2Inline | | p256MulBy2Inline |
| STt (tmp) | | STt (tmp) |
| | | |
| LDacc (m) | | LDacc (m) |
| CALL p256SqrInternal(SB) | | CALL p256SqrInternal(SB) |
| LDt (tmp) | | LDt (tmp) |
| CALL p256SubInternal(SB) | | CALL p256SubInternal(SB) |
| | | |
| MOVQ rptr, AX | | MOVQ rptr, AX |
| | | | 2519 | |
| // Store x | | // Store x |
| MOVQ acc4, (16*0 + 8*0)(AX) | | MOVQ acc4, (16*0 + 8*0)(AX) |
| MOVQ acc5, (16*0 + 8*1)(AX) | | MOVQ acc5, (16*0 + 8*1)(AX) |
| MOVQ acc6, (16*0 + 8*2)(AX) | | MOVQ acc6, (16*0 + 8*2)(AX) |
| MOVQ acc7, (16*0 + 8*3)(AX) | | MOVQ acc7, (16*0 + 8*3)(AX) |
| | | |
| acc2t | | acc2t |
| LDacc (s) | | LDacc (s) |
| 2423 | CALL p256SubInternal(SB) | | 2528 | CALL p256SubInternal(SB) |
| | | |
| LDt (m) | | LDt (m) |
| CALL p256MulInternal(SB) | | CALL p256MulInternal(SB) |
| | | |
| LDt (y) | | LDt (y) |
| CALL p256SubInternal(SB) | | CALL p256SubInternal(SB) |
| MOVQ rptr, AX | | MOVQ rptr, AX |
| | | | 2536 | |
| // Store y | | // Store y |
| MOVQ acc4, (16*2 + 8*0)(AX) | | MOVQ acc4, (16*2 + 8*0)(AX) |
| MOVQ acc5, (16*2 + 8*1)(AX) | | MOVQ acc5, (16*2 + 8*1)(AX) |
| MOVQ acc6, (16*2 + 8*2)(AX) | | MOVQ acc6, (16*2 + 8*2)(AX) |
| MOVQ acc7, (16*2 + 8*3)(AX) | | MOVQ acc7, (16*2 + 8*3)(AX) |
| | | | 2542 | |
| /////////////////////// | | /////////////////////// |
| MOVQ $0, rptr | | MOVQ $0, rptr |
| | | |
| RET | | RET |
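p256PointDoubleAsm uses the a = -3 doubling: m = 3*(x - z^2)*(x + z^2) stands in for 3*x^2 + a*z^4, s = 4*x*y^2, x3 = m^2 - 2*s, z3 = 2*y*z and y3 = m*(s - x3) - 8*y^4. The "Divide by 2" block above obtains 8*y^4 from ((2y)^2)^2 = 16*y^4 by adding p once when the value is odd and then shifting the 257-bit result right by one. A value-level sketch, again on ordinary residues and reusing modP and p256P (doubleJacobian is our name):

    // doubleJacobian mirrors p256PointDoubleAsm for a Jacobian point (x, y, z).
    func doubleJacobian(x, y, z *big.Int) (x3, y3, z3 *big.Int) {
        zz := modP(new(big.Int).Mul(z, z))                                 // z^2
        m := modP(new(big.Int).Mul(
            new(big.Int).Sub(x, zz), new(big.Int).Add(x, zz)))             // (x - z^2)(x + z^2)
        m = modP(m.Mul(m, big.NewInt(3)))                                  // m = 3*(x^2 - z^4)
        z3 = modP(new(big.Int).Lsh(modP(new(big.Int).Mul(y, z)), 1))       // z3 = 2*y*z
        s := modP(new(big.Int).Lsh(
            modP(new(big.Int).Mul(x, modP(new(big.Int).Mul(y, y)))), 2))   // s = 4*x*y^2
        y4 := modP(new(big.Int).Lsh(
            new(big.Int).Exp(y, big.NewInt(4), p256P), 3))                 // 8*y^4
        x3 = modP(new(big.Int).Sub(modP(new(big.Int).Mul(m, m)),
            new(big.Int).Lsh(s, 1)))                                       // x3 = m^2 - 2*s
        y3 = modP(new(big.Int).Sub(
            modP(new(big.Int).Mul(m, new(big.Int).Sub(s, x3))), y4))       // y3 = m*(s - x3) - 8*y^4
        return
    }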
| | | | 2547 | |
| 2440 | /* ---------------------------------------*/ | | 2548 | // --------------------------------------- |
| | | |