From 16d858373513f19df4ad4957c4bb88d502260b79 Mon Sep 17 00:00:00 2001 From: Kongqun Yang Date: Tue, 2 Aug 2016 10:41:03 -0700 Subject: [PATCH] Update boringssl and curl(v7.50.0) Change-Id: I89b40cf03d1aab9a13b0df449e540ab73d03451e --- DEPS | 8 +- packager/media/base/test/fake_prng.cc | 2 +- .../third_party/boringssl/BUILD.generated.gni | 459 ++++ .../boringssl/BUILD.generated_tests.gni | 597 +++++ packager/third_party/boringssl/BUILD.gn | 251 +- .../third_party/boringssl/README.chromium | 7 +- packager/third_party/boringssl/boringssl.gyp | 148 +- packager/third_party/boringssl/boringssl.gypi | 69 +- .../third_party/boringssl/boringssl_nacl.gyp | 4 - .../third_party/boringssl/boringssl_tests.gyp | 32 +- .../boringssl/boringssl_tests.gypi | 175 +- .../boringssl/boringssl_unittest.cc | 46 + packager/third_party/boringssl/err_data.c | 1310 +++++----- .../linux-aarch64/crypto/aes/aesv8-armx64.S | 10 +- .../linux-aarch64/crypto/bn/armv8-mont.S | 1407 ++++++++++ .../crypto/chacha/chacha-armv8.S | 1971 ++++++++++++++ .../crypto/modes/ghashv8-armx64.S | 29 +- .../linux-aarch64/crypto/sha/sha1-armv8.S | 5 +- .../linux-aarch64/crypto/sha/sha256-armv8.S | 5 +- .../linux-aarch64/crypto/sha/sha512-armv8.S | 5 +- .../linux-arm/crypto/aes/aes-armv4.S | 4 +- .../linux-arm/crypto/aes/aesv8-armx32.S | 10 +- .../linux-arm/crypto/aes/bsaes-armv7.S | 4 +- .../linux-arm/crypto/bn/armv4-mont.S | 6 +- .../linux-arm/crypto/chacha/chacha-armv4.S | 1471 +++++++++++ .../linux-arm/crypto/modes/ghash-armv4.S | 7 +- .../linux-arm/crypto/modes/ghashv8-armx32.S | 29 +- .../linux-arm/crypto/sha/sha1-armv4-large.S | 5 +- .../linux-arm/crypto/sha/sha256-armv4.S | 6 +- .../linux-arm/crypto/sha/sha512-armv4.S | 8 +- .../linux-x86/crypto/chacha/chacha-x86.S | 969 +++++++ .../boringssl/linux-x86/crypto/rc4/rc4-586.S | 35 - .../boringssl/linux-x86/crypto/sha/sha1-586.S | 1365 ++++++++-- .../linux-x86/crypto/sha/sha256-586.S | 1394 ++++++++-- .../linux-x86_64/crypto/aes/aes-x86_64.S | 70 +- .../linux-x86_64/crypto/aes/aesni-x86_64.S | 82 +- .../linux-x86_64/crypto/aes/bsaes-x86_64.S | 158 +- .../linux-x86_64/crypto/aes/vpaes-x86_64.S | 20 +- .../linux-x86_64/crypto/bn/rsaz-x86_64.S | 220 +- .../linux-x86_64/crypto/bn/x86_64-mont.S | 112 +- .../linux-x86_64/crypto/bn/x86_64-mont5.S | 918 +++++-- .../crypto/chacha/chacha-x86_64.S | 1585 ++++++++++++ .../linux-x86_64/crypto/ec/p256-x86_64-asm.S | 1789 +++++++++++++ .../linux-x86_64/crypto/md5/md5-x86_64.S | 34 +- .../linux-x86_64/crypto/modes/ghash-x86_64.S | 34 +- .../linux-x86_64/crypto/sha/sha1-x86_64.S | 1121 ++++++++ .../linux-x86_64/crypto/sha/sha256-x86_64.S | 1062 ++++++++ .../linux-x86_64/crypto/sha/sha512-x86_64.S | 2241 ++++++++++++++++ .../mac-x86/crypto/chacha/chacha-x86.S | 969 +++++++ .../boringssl/mac-x86/crypto/rc4/rc4-586.S | 33 - .../boringssl/mac-x86/crypto/sha/sha1-586.S | 1359 ++++++++-- .../boringssl/mac-x86/crypto/sha/sha256-586.S | 1394 ++++++++-- .../mac-x86_64/crypto/aes/aes-x86_64.S | 70 +- .../mac-x86_64/crypto/aes/aesni-x86_64.S | 82 +- .../mac-x86_64/crypto/aes/bsaes-x86_64.S | 158 +- .../mac-x86_64/crypto/aes/vpaes-x86_64.S | 20 +- .../mac-x86_64/crypto/bn/rsaz-x86_64.S | 220 +- .../mac-x86_64/crypto/bn/x86_64-mont.S | 112 +- .../mac-x86_64/crypto/bn/x86_64-mont5.S | 912 +++++-- .../mac-x86_64/crypto/chacha/chacha-x86_64.S | 1584 ++++++++++++ .../mac-x86_64/crypto/ec/p256-x86_64-asm.S | 1788 +++++++++++++ .../mac-x86_64/crypto/md5/md5-x86_64.S | 34 +- .../mac-x86_64/crypto/modes/ghash-x86_64.S | 34 +- .../mac-x86_64/crypto/sha/sha1-x86_64.S | 1121 
++++++++ .../mac-x86_64/crypto/sha/sha256-x86_64.S | 1062 ++++++++ .../mac-x86_64/crypto/sha/sha512-x86_64.S | 2241 ++++++++++++++++ .../third_party/boringssl/roll_boringssl.py | 126 + .../win-x86/crypto/chacha/chacha-x86.asm | 977 +++++++ .../boringssl/win-x86/crypto/rc4/rc4-586.asm | 29 - .../boringssl/win-x86/crypto/sha/sha1-586.asm | 1355 ++++++++-- .../win-x86/crypto/sha/sha256-586.asm | 1394 ++++++++-- .../win-x86_64/crypto/bn/rsaz-x86_64.asm | 293 ++- .../win-x86_64/crypto/bn/x86_64-mont.asm | 112 +- .../win-x86_64/crypto/bn/x86_64-mont5.asm | 948 +++++-- .../crypto/chacha/chacha-x86_64.asm | 1689 ++++++++++++ .../win-x86_64/crypto/ec/p256-x86_64-asm.asm | 1925 ++++++++++++++ .../win-x86_64/crypto/sha/sha1-x86_64.asm | 1152 +++++++++ .../win-x86_64/crypto/sha/sha256-x86_64.asm | 1088 ++++++++ .../win-x86_64/crypto/sha/sha512-x86_64.asm | 2301 +++++++++++++++++ packager/third_party/curl/curl.gyp | 28 +- 80 files changed, 44262 insertions(+), 3647 deletions(-) create mode 100644 packager/third_party/boringssl/BUILD.generated.gni create mode 100644 packager/third_party/boringssl/BUILD.generated_tests.gni create mode 100644 packager/third_party/boringssl/linux-aarch64/crypto/bn/armv8-mont.S create mode 100644 packager/third_party/boringssl/linux-aarch64/crypto/chacha/chacha-armv8.S create mode 100644 packager/third_party/boringssl/linux-arm/crypto/chacha/chacha-armv4.S create mode 100644 packager/third_party/boringssl/linux-x86/crypto/chacha/chacha-x86.S create mode 100644 packager/third_party/boringssl/linux-x86_64/crypto/chacha/chacha-x86_64.S create mode 100644 packager/third_party/boringssl/linux-x86_64/crypto/ec/p256-x86_64-asm.S create mode 100644 packager/third_party/boringssl/mac-x86/crypto/chacha/chacha-x86.S create mode 100644 packager/third_party/boringssl/mac-x86_64/crypto/chacha/chacha-x86_64.S create mode 100644 packager/third_party/boringssl/mac-x86_64/crypto/ec/p256-x86_64-asm.S create mode 100755 packager/third_party/boringssl/roll_boringssl.py create mode 100644 packager/third_party/boringssl/win-x86/crypto/chacha/chacha-x86.asm create mode 100644 packager/third_party/boringssl/win-x86_64/crypto/chacha/chacha-x86_64.asm create mode 100644 packager/third_party/boringssl/win-x86_64/crypto/ec/p256-x86_64-asm.asm diff --git a/DEPS b/DEPS index e803f4066e..7aaea3d4c5 100644 --- a/DEPS +++ b/DEPS @@ -9,9 +9,7 @@ vars = { "chromium_git": "https://chromium.googlesource.com", "googlesource_git": "https://%s.googlesource.com", - "curl_url": "https://github.com/bagder/curl.git", - # TODO(kqyang): Replace with an official release. - "curl_rev": "26ddc536b0ab5fc62d6503c82c34dd3dbf112dc3", + "curl_url": "https://github.com/curl/curl.git", } deps = { @@ -33,10 +31,10 @@ deps = { # Make sure the version matches the one in # src/packager/third_party/boringssl, which contains perl generated files. 
"src/packager/third_party/boringssl/src": - (Var("googlesource_git") % "boringssl") + "/boringssl@209b2562235f7dab66b8260624e7b3c5b00d14a6", + (Var("googlesource_git") % "boringssl") + "/boringssl@3cab5572b1fcf5a8f6018529dc30dc8d21b2a4bd", "src/packager/third_party/curl/source": - Var("curl_url") + "@" + Var("curl_rev"), + Var("curl_url") + "@curl-7_50_0", "src/packager/third_party/gflags": Var("chromium_git") + "/external/webrtc/trunk/third_party/gflags@cc7e9a4b374ff7b6a1cae4d76161113ea985b624", diff --git a/packager/media/base/test/fake_prng.cc b/packager/media/base/test/fake_prng.cc index c9a284e48d..ab49cbb0e6 100644 --- a/packager/media/base/test/fake_prng.cc +++ b/packager/media/base/test/fake_prng.cc @@ -19,7 +19,7 @@ FILE* g_rand_source_fp = NULL; const char kFakePrngDataFile[] = "fake_prng_data.bin"; // RAND_bytes and RAND_pseudorand implementation. -int FakeBytes(uint8_t* buf, int num) { +int FakeBytes(uint8_t* buf, size_t num) { DCHECK(buf); DCHECK(g_rand_source_fp); diff --git a/packager/third_party/boringssl/BUILD.generated.gni b/packager/third_party/boringssl/BUILD.generated.gni new file mode 100644 index 0000000000..cdbe0c95ab --- /dev/null +++ b/packager/third_party/boringssl/BUILD.generated.gni @@ -0,0 +1,459 @@ +# Copyright (c) 2016 The Chromium Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +# This file is created by generate_build_files.py. Do not edit manually. + +crypto_sources = [ + "err_data.c", + "src/crypto/aes/aes.c", + "src/crypto/aes/mode_wrappers.c", + "src/crypto/asn1/a_bitstr.c", + "src/crypto/asn1/a_bool.c", + "src/crypto/asn1/a_bytes.c", + "src/crypto/asn1/a_d2i_fp.c", + "src/crypto/asn1/a_dup.c", + "src/crypto/asn1/a_enum.c", + "src/crypto/asn1/a_gentm.c", + "src/crypto/asn1/a_i2d_fp.c", + "src/crypto/asn1/a_int.c", + "src/crypto/asn1/a_mbstr.c", + "src/crypto/asn1/a_object.c", + "src/crypto/asn1/a_octet.c", + "src/crypto/asn1/a_print.c", + "src/crypto/asn1/a_strnid.c", + "src/crypto/asn1/a_time.c", + "src/crypto/asn1/a_type.c", + "src/crypto/asn1/a_utctm.c", + "src/crypto/asn1/a_utf8.c", + "src/crypto/asn1/asn1_lib.c", + "src/crypto/asn1/asn1_par.c", + "src/crypto/asn1/asn_pack.c", + "src/crypto/asn1/f_enum.c", + "src/crypto/asn1/f_int.c", + "src/crypto/asn1/f_string.c", + "src/crypto/asn1/t_bitst.c", + "src/crypto/asn1/tasn_dec.c", + "src/crypto/asn1/tasn_enc.c", + "src/crypto/asn1/tasn_fre.c", + "src/crypto/asn1/tasn_new.c", + "src/crypto/asn1/tasn_typ.c", + "src/crypto/asn1/tasn_utl.c", + "src/crypto/asn1/x_bignum.c", + "src/crypto/asn1/x_long.c", + "src/crypto/base64/base64.c", + "src/crypto/bio/bio.c", + "src/crypto/bio/bio_mem.c", + "src/crypto/bio/buffer.c", + "src/crypto/bio/connect.c", + "src/crypto/bio/fd.c", + "src/crypto/bio/file.c", + "src/crypto/bio/hexdump.c", + "src/crypto/bio/pair.c", + "src/crypto/bio/printf.c", + "src/crypto/bio/socket.c", + "src/crypto/bio/socket_helper.c", + "src/crypto/bn/add.c", + "src/crypto/bn/asm/x86_64-gcc.c", + "src/crypto/bn/bn.c", + "src/crypto/bn/bn_asn1.c", + "src/crypto/bn/cmp.c", + "src/crypto/bn/convert.c", + "src/crypto/bn/ctx.c", + "src/crypto/bn/div.c", + "src/crypto/bn/exponentiation.c", + "src/crypto/bn/gcd.c", + "src/crypto/bn/generic.c", + "src/crypto/bn/kronecker.c", + "src/crypto/bn/montgomery.c", + "src/crypto/bn/mul.c", + "src/crypto/bn/prime.c", + "src/crypto/bn/random.c", + "src/crypto/bn/rsaz_exp.c", + "src/crypto/bn/shift.c", + "src/crypto/bn/sqrt.c", + "src/crypto/buf/buf.c", + 
"src/crypto/bytestring/asn1_compat.c", + "src/crypto/bytestring/ber.c", + "src/crypto/bytestring/cbb.c", + "src/crypto/bytestring/cbs.c", + "src/crypto/chacha/chacha.c", + "src/crypto/cipher/aead.c", + "src/crypto/cipher/cipher.c", + "src/crypto/cipher/derive_key.c", + "src/crypto/cipher/e_aes.c", + "src/crypto/cipher/e_chacha20poly1305.c", + "src/crypto/cipher/e_des.c", + "src/crypto/cipher/e_null.c", + "src/crypto/cipher/e_rc2.c", + "src/crypto/cipher/e_rc4.c", + "src/crypto/cipher/e_ssl3.c", + "src/crypto/cipher/e_tls.c", + "src/crypto/cipher/tls_cbc.c", + "src/crypto/cmac/cmac.c", + "src/crypto/conf/conf.c", + "src/crypto/cpu-aarch64-linux.c", + "src/crypto/cpu-arm-linux.c", + "src/crypto/cpu-arm.c", + "src/crypto/cpu-intel.c", + "src/crypto/crypto.c", + "src/crypto/curve25519/curve25519.c", + "src/crypto/curve25519/spake25519.c", + "src/crypto/curve25519/x25519-x86_64.c", + "src/crypto/des/des.c", + "src/crypto/dh/check.c", + "src/crypto/dh/dh.c", + "src/crypto/dh/dh_asn1.c", + "src/crypto/dh/params.c", + "src/crypto/digest/digest.c", + "src/crypto/digest/digests.c", + "src/crypto/dsa/dsa.c", + "src/crypto/dsa/dsa_asn1.c", + "src/crypto/ec/ec.c", + "src/crypto/ec/ec_asn1.c", + "src/crypto/ec/ec_key.c", + "src/crypto/ec/ec_montgomery.c", + "src/crypto/ec/oct.c", + "src/crypto/ec/p224-64.c", + "src/crypto/ec/p256-64.c", + "src/crypto/ec/p256-x86_64.c", + "src/crypto/ec/simple.c", + "src/crypto/ec/util-64.c", + "src/crypto/ec/wnaf.c", + "src/crypto/ecdh/ecdh.c", + "src/crypto/ecdsa/ecdsa.c", + "src/crypto/ecdsa/ecdsa_asn1.c", + "src/crypto/engine/engine.c", + "src/crypto/err/err.c", + "src/crypto/evp/digestsign.c", + "src/crypto/evp/evp.c", + "src/crypto/evp/evp_asn1.c", + "src/crypto/evp/evp_ctx.c", + "src/crypto/evp/p_dsa_asn1.c", + "src/crypto/evp/p_ec.c", + "src/crypto/evp/p_ec_asn1.c", + "src/crypto/evp/p_rsa.c", + "src/crypto/evp/p_rsa_asn1.c", + "src/crypto/evp/pbkdf.c", + "src/crypto/evp/print.c", + "src/crypto/evp/sign.c", + "src/crypto/ex_data.c", + "src/crypto/hkdf/hkdf.c", + "src/crypto/hmac/hmac.c", + "src/crypto/lhash/lhash.c", + "src/crypto/md4/md4.c", + "src/crypto/md5/md5.c", + "src/crypto/mem.c", + "src/crypto/modes/cbc.c", + "src/crypto/modes/cfb.c", + "src/crypto/modes/ctr.c", + "src/crypto/modes/gcm.c", + "src/crypto/modes/ofb.c", + "src/crypto/newhope/error_correction.c", + "src/crypto/newhope/newhope.c", + "src/crypto/newhope/ntt.c", + "src/crypto/newhope/poly.c", + "src/crypto/newhope/precomp.c", + "src/crypto/newhope/reduce.c", + "src/crypto/obj/obj.c", + "src/crypto/obj/obj_xref.c", + "src/crypto/pem/pem_all.c", + "src/crypto/pem/pem_info.c", + "src/crypto/pem/pem_lib.c", + "src/crypto/pem/pem_oth.c", + "src/crypto/pem/pem_pk8.c", + "src/crypto/pem/pem_pkey.c", + "src/crypto/pem/pem_x509.c", + "src/crypto/pem/pem_xaux.c", + "src/crypto/pkcs8/p5_pbe.c", + "src/crypto/pkcs8/p5_pbev2.c", + "src/crypto/pkcs8/p8_pkey.c", + "src/crypto/pkcs8/pkcs8.c", + "src/crypto/poly1305/poly1305.c", + "src/crypto/poly1305/poly1305_arm.c", + "src/crypto/poly1305/poly1305_vec.c", + "src/crypto/rand/deterministic.c", + "src/crypto/rand/rand.c", + "src/crypto/rand/urandom.c", + "src/crypto/rand/windows.c", + "src/crypto/rc4/rc4.c", + "src/crypto/refcount_c11.c", + "src/crypto/refcount_lock.c", + "src/crypto/rsa/blinding.c", + "src/crypto/rsa/padding.c", + "src/crypto/rsa/rsa.c", + "src/crypto/rsa/rsa_asn1.c", + "src/crypto/rsa/rsa_impl.c", + "src/crypto/sha/sha1.c", + "src/crypto/sha/sha256.c", + "src/crypto/sha/sha512.c", + "src/crypto/stack/stack.c", + "src/crypto/thread.c", + 
"src/crypto/thread_none.c", + "src/crypto/thread_pthread.c", + "src/crypto/thread_win.c", + "src/crypto/time_support.c", + "src/crypto/x509/a_digest.c", + "src/crypto/x509/a_sign.c", + "src/crypto/x509/a_strex.c", + "src/crypto/x509/a_verify.c", + "src/crypto/x509/algorithm.c", + "src/crypto/x509/asn1_gen.c", + "src/crypto/x509/by_dir.c", + "src/crypto/x509/by_file.c", + "src/crypto/x509/i2d_pr.c", + "src/crypto/x509/pkcs7.c", + "src/crypto/x509/rsa_pss.c", + "src/crypto/x509/t_crl.c", + "src/crypto/x509/t_req.c", + "src/crypto/x509/t_x509.c", + "src/crypto/x509/t_x509a.c", + "src/crypto/x509/x509.c", + "src/crypto/x509/x509_att.c", + "src/crypto/x509/x509_cmp.c", + "src/crypto/x509/x509_d2.c", + "src/crypto/x509/x509_def.c", + "src/crypto/x509/x509_ext.c", + "src/crypto/x509/x509_lu.c", + "src/crypto/x509/x509_obj.c", + "src/crypto/x509/x509_r2x.c", + "src/crypto/x509/x509_req.c", + "src/crypto/x509/x509_set.c", + "src/crypto/x509/x509_trs.c", + "src/crypto/x509/x509_txt.c", + "src/crypto/x509/x509_v3.c", + "src/crypto/x509/x509_vfy.c", + "src/crypto/x509/x509_vpm.c", + "src/crypto/x509/x509cset.c", + "src/crypto/x509/x509name.c", + "src/crypto/x509/x509rset.c", + "src/crypto/x509/x509spki.c", + "src/crypto/x509/x509type.c", + "src/crypto/x509/x_algor.c", + "src/crypto/x509/x_all.c", + "src/crypto/x509/x_attrib.c", + "src/crypto/x509/x_crl.c", + "src/crypto/x509/x_exten.c", + "src/crypto/x509/x_info.c", + "src/crypto/x509/x_name.c", + "src/crypto/x509/x_pkey.c", + "src/crypto/x509/x_pubkey.c", + "src/crypto/x509/x_req.c", + "src/crypto/x509/x_sig.c", + "src/crypto/x509/x_spki.c", + "src/crypto/x509/x_val.c", + "src/crypto/x509/x_x509.c", + "src/crypto/x509/x_x509a.c", + "src/crypto/x509v3/pcy_cache.c", + "src/crypto/x509v3/pcy_data.c", + "src/crypto/x509v3/pcy_lib.c", + "src/crypto/x509v3/pcy_map.c", + "src/crypto/x509v3/pcy_node.c", + "src/crypto/x509v3/pcy_tree.c", + "src/crypto/x509v3/v3_akey.c", + "src/crypto/x509v3/v3_akeya.c", + "src/crypto/x509v3/v3_alt.c", + "src/crypto/x509v3/v3_bcons.c", + "src/crypto/x509v3/v3_bitst.c", + "src/crypto/x509v3/v3_conf.c", + "src/crypto/x509v3/v3_cpols.c", + "src/crypto/x509v3/v3_crld.c", + "src/crypto/x509v3/v3_enum.c", + "src/crypto/x509v3/v3_extku.c", + "src/crypto/x509v3/v3_genn.c", + "src/crypto/x509v3/v3_ia5.c", + "src/crypto/x509v3/v3_info.c", + "src/crypto/x509v3/v3_int.c", + "src/crypto/x509v3/v3_lib.c", + "src/crypto/x509v3/v3_ncons.c", + "src/crypto/x509v3/v3_pci.c", + "src/crypto/x509v3/v3_pcia.c", + "src/crypto/x509v3/v3_pcons.c", + "src/crypto/x509v3/v3_pku.c", + "src/crypto/x509v3/v3_pmaps.c", + "src/crypto/x509v3/v3_prn.c", + "src/crypto/x509v3/v3_purp.c", + "src/crypto/x509v3/v3_skey.c", + "src/crypto/x509v3/v3_sxnet.c", + "src/crypto/x509v3/v3_utl.c", +] + +ssl_sources = [ + "src/ssl/custom_extensions.c", + "src/ssl/d1_both.c", + "src/ssl/d1_lib.c", + "src/ssl/d1_meth.c", + "src/ssl/d1_pkt.c", + "src/ssl/d1_srtp.c", + "src/ssl/dtls_record.c", + "src/ssl/handshake_client.c", + "src/ssl/handshake_server.c", + "src/ssl/pqueue/pqueue.c", + "src/ssl/s3_both.c", + "src/ssl/s3_enc.c", + "src/ssl/s3_lib.c", + "src/ssl/s3_meth.c", + "src/ssl/s3_pkt.c", + "src/ssl/ssl_aead_ctx.c", + "src/ssl/ssl_asn1.c", + "src/ssl/ssl_buffer.c", + "src/ssl/ssl_cert.c", + "src/ssl/ssl_cipher.c", + "src/ssl/ssl_ecdh.c", + "src/ssl/ssl_file.c", + "src/ssl/ssl_lib.c", + "src/ssl/ssl_rsa.c", + "src/ssl/ssl_session.c", + "src/ssl/ssl_stat.c", + "src/ssl/t1_enc.c", + "src/ssl/t1_lib.c", + "src/ssl/tls_record.c", +] + +crypto_sources_linux_aarch64 = [ + 
"linux-aarch64/crypto/aes/aesv8-armx64.S", + "linux-aarch64/crypto/bn/armv8-mont.S", + "linux-aarch64/crypto/chacha/chacha-armv8.S", + "linux-aarch64/crypto/modes/ghashv8-armx64.S", + "linux-aarch64/crypto/sha/sha1-armv8.S", + "linux-aarch64/crypto/sha/sha256-armv8.S", + "linux-aarch64/crypto/sha/sha512-armv8.S", +] + +crypto_sources_linux_arm = [ + "linux-arm/crypto/aes/aes-armv4.S", + "linux-arm/crypto/aes/aesv8-armx32.S", + "linux-arm/crypto/aes/bsaes-armv7.S", + "linux-arm/crypto/bn/armv4-mont.S", + "linux-arm/crypto/chacha/chacha-armv4.S", + "linux-arm/crypto/modes/ghash-armv4.S", + "linux-arm/crypto/modes/ghashv8-armx32.S", + "linux-arm/crypto/sha/sha1-armv4-large.S", + "linux-arm/crypto/sha/sha256-armv4.S", + "linux-arm/crypto/sha/sha512-armv4.S", + "src/crypto/curve25519/asm/x25519-asm-arm.S", + "src/crypto/poly1305/poly1305_arm_asm.S", +] + +crypto_sources_linux_x86 = [ + "linux-x86/crypto/aes/aes-586.S", + "linux-x86/crypto/aes/aesni-x86.S", + "linux-x86/crypto/aes/vpaes-x86.S", + "linux-x86/crypto/bn/bn-586.S", + "linux-x86/crypto/bn/co-586.S", + "linux-x86/crypto/bn/x86-mont.S", + "linux-x86/crypto/chacha/chacha-x86.S", + "linux-x86/crypto/md5/md5-586.S", + "linux-x86/crypto/modes/ghash-x86.S", + "linux-x86/crypto/rc4/rc4-586.S", + "linux-x86/crypto/sha/sha1-586.S", + "linux-x86/crypto/sha/sha256-586.S", + "linux-x86/crypto/sha/sha512-586.S", +] + +crypto_sources_linux_x86_64 = [ + "linux-x86_64/crypto/aes/aes-x86_64.S", + "linux-x86_64/crypto/aes/aesni-x86_64.S", + "linux-x86_64/crypto/aes/bsaes-x86_64.S", + "linux-x86_64/crypto/aes/vpaes-x86_64.S", + "linux-x86_64/crypto/bn/rsaz-avx2.S", + "linux-x86_64/crypto/bn/rsaz-x86_64.S", + "linux-x86_64/crypto/bn/x86_64-mont.S", + "linux-x86_64/crypto/bn/x86_64-mont5.S", + "linux-x86_64/crypto/chacha/chacha-x86_64.S", + "linux-x86_64/crypto/ec/p256-x86_64-asm.S", + "linux-x86_64/crypto/md5/md5-x86_64.S", + "linux-x86_64/crypto/modes/aesni-gcm-x86_64.S", + "linux-x86_64/crypto/modes/ghash-x86_64.S", + "linux-x86_64/crypto/rand/rdrand-x86_64.S", + "linux-x86_64/crypto/rc4/rc4-x86_64.S", + "linux-x86_64/crypto/sha/sha1-x86_64.S", + "linux-x86_64/crypto/sha/sha256-x86_64.S", + "linux-x86_64/crypto/sha/sha512-x86_64.S", + "src/crypto/curve25519/asm/x25519-asm-x86_64.S", +] + +crypto_sources_mac_x86 = [ + "mac-x86/crypto/aes/aes-586.S", + "mac-x86/crypto/aes/aesni-x86.S", + "mac-x86/crypto/aes/vpaes-x86.S", + "mac-x86/crypto/bn/bn-586.S", + "mac-x86/crypto/bn/co-586.S", + "mac-x86/crypto/bn/x86-mont.S", + "mac-x86/crypto/chacha/chacha-x86.S", + "mac-x86/crypto/md5/md5-586.S", + "mac-x86/crypto/modes/ghash-x86.S", + "mac-x86/crypto/rc4/rc4-586.S", + "mac-x86/crypto/sha/sha1-586.S", + "mac-x86/crypto/sha/sha256-586.S", + "mac-x86/crypto/sha/sha512-586.S", +] + +crypto_sources_mac_x86_64 = [ + "mac-x86_64/crypto/aes/aes-x86_64.S", + "mac-x86_64/crypto/aes/aesni-x86_64.S", + "mac-x86_64/crypto/aes/bsaes-x86_64.S", + "mac-x86_64/crypto/aes/vpaes-x86_64.S", + "mac-x86_64/crypto/bn/rsaz-avx2.S", + "mac-x86_64/crypto/bn/rsaz-x86_64.S", + "mac-x86_64/crypto/bn/x86_64-mont.S", + "mac-x86_64/crypto/bn/x86_64-mont5.S", + "mac-x86_64/crypto/chacha/chacha-x86_64.S", + "mac-x86_64/crypto/ec/p256-x86_64-asm.S", + "mac-x86_64/crypto/md5/md5-x86_64.S", + "mac-x86_64/crypto/modes/aesni-gcm-x86_64.S", + "mac-x86_64/crypto/modes/ghash-x86_64.S", + "mac-x86_64/crypto/rand/rdrand-x86_64.S", + "mac-x86_64/crypto/rc4/rc4-x86_64.S", + "mac-x86_64/crypto/sha/sha1-x86_64.S", + "mac-x86_64/crypto/sha/sha256-x86_64.S", + "mac-x86_64/crypto/sha/sha512-x86_64.S", + 
"src/crypto/curve25519/asm/x25519-asm-x86_64.S", +] + +crypto_sources_win_x86 = [ + "win-x86/crypto/aes/aes-586.asm", + "win-x86/crypto/aes/aesni-x86.asm", + "win-x86/crypto/aes/vpaes-x86.asm", + "win-x86/crypto/bn/bn-586.asm", + "win-x86/crypto/bn/co-586.asm", + "win-x86/crypto/bn/x86-mont.asm", + "win-x86/crypto/chacha/chacha-x86.asm", + "win-x86/crypto/md5/md5-586.asm", + "win-x86/crypto/modes/ghash-x86.asm", + "win-x86/crypto/rc4/rc4-586.asm", + "win-x86/crypto/sha/sha1-586.asm", + "win-x86/crypto/sha/sha256-586.asm", + "win-x86/crypto/sha/sha512-586.asm", +] + +crypto_sources_win_x86_64 = [ + "win-x86_64/crypto/aes/aes-x86_64.asm", + "win-x86_64/crypto/aes/aesni-x86_64.asm", + "win-x86_64/crypto/aes/bsaes-x86_64.asm", + "win-x86_64/crypto/aes/vpaes-x86_64.asm", + "win-x86_64/crypto/bn/rsaz-avx2.asm", + "win-x86_64/crypto/bn/rsaz-x86_64.asm", + "win-x86_64/crypto/bn/x86_64-mont.asm", + "win-x86_64/crypto/bn/x86_64-mont5.asm", + "win-x86_64/crypto/chacha/chacha-x86_64.asm", + "win-x86_64/crypto/ec/p256-x86_64-asm.asm", + "win-x86_64/crypto/md5/md5-x86_64.asm", + "win-x86_64/crypto/modes/aesni-gcm-x86_64.asm", + "win-x86_64/crypto/modes/ghash-x86_64.asm", + "win-x86_64/crypto/rand/rdrand-x86_64.asm", + "win-x86_64/crypto/rc4/rc4-x86_64.asm", + "win-x86_64/crypto/sha/sha1-x86_64.asm", + "win-x86_64/crypto/sha/sha256-x86_64.asm", + "win-x86_64/crypto/sha/sha512-x86_64.asm", +] + +fuzzers = [ + "cert", + "client", + "pkcs8", + "privkey", + "read_pem", + "server", + "spki", +] diff --git a/packager/third_party/boringssl/BUILD.generated_tests.gni b/packager/third_party/boringssl/BUILD.generated_tests.gni new file mode 100644 index 0000000000..8b7ea3cc59 --- /dev/null +++ b/packager/third_party/boringssl/BUILD.generated_tests.gni @@ -0,0 +1,597 @@ +# Copyright (c) 2016 The Chromium Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +# This file is created by generate_build_files.py. Do not edit manually. 
+ +_test_support_sources = [ + "src/crypto/test/file_test.cc", + "src/crypto/test/file_test.h", + "src/crypto/test/malloc.cc", + "src/crypto/test/scoped_types.h", + "src/crypto/test/test_util.cc", + "src/crypto/test/test_util.h", + "src/ssl/test/async_bio.h", + "src/ssl/test/packeted_bio.h", + "src/ssl/test/scoped_types.h", + "src/ssl/test/test_config.h", +] + +template("create_tests") { + executable("boringssl_aes_test") { + sources = [ + "src/crypto/aes/aes_test.cc", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_asn1_test") { + sources = [ + "src/crypto/asn1/asn1_test.cc", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_base64_test") { + sources = [ + "src/crypto/base64/base64_test.cc", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_bio_test") { + sources = [ + "src/crypto/bio/bio_test.cc", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_bn_test") { + sources = [ + "src/crypto/bn/bn_test.cc", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_bytestring_test") { + sources = [ + "src/crypto/bytestring/bytestring_test.cc", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_chacha_test") { + sources = [ + "src/crypto/chacha/chacha_test.cc", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_aead_test") { + sources = [ + "src/crypto/cipher/aead_test.cc", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_cipher_test") { + sources = [ + "src/crypto/cipher/cipher_test.cc", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_cmac_test") { + sources = [ + "src/crypto/cmac/cmac_test.cc", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_constant_time_test") { + sources = [ + "src/crypto/constant_time_test.c", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_ed25519_test") { + sources = [ + "src/crypto/curve25519/ed25519_test.cc", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= 
invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_spake25519_test") { + sources = [ + "src/crypto/curve25519/spake25519_test.cc", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_x25519_test") { + sources = [ + "src/crypto/curve25519/x25519_test.cc", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_dh_test") { + sources = [ + "src/crypto/dh/dh_test.cc", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_digest_test") { + sources = [ + "src/crypto/digest/digest_test.cc", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_dsa_test") { + sources = [ + "src/crypto/dsa/dsa_test.c", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_ec_test") { + sources = [ + "src/crypto/ec/ec_test.cc", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_example_mul") { + sources = [ + "src/crypto/ec/example_mul.c", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_ecdsa_test") { + sources = [ + "src/crypto/ecdsa/ecdsa_test.cc", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_err_test") { + sources = [ + "src/crypto/err/err_test.cc", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_evp_extra_test") { + sources = [ + "src/crypto/evp/evp_extra_test.cc", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_evp_test") { + sources = [ + "src/crypto/evp/evp_test.cc", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_pbkdf_test") { + sources = [ + "src/crypto/evp/pbkdf_test.cc", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_hkdf_test") { + sources = [ + "src/crypto/hkdf/hkdf_test.c", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + 
+ executable("boringssl_hmac_test") { + sources = [ + "src/crypto/hmac/hmac_test.cc", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_lhash_test") { + sources = [ + "src/crypto/lhash/lhash_test.c", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_gcm_test") { + sources = [ + "src/crypto/modes/gcm_test.c", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_newhope_statistical_test") { + sources = [ + "src/crypto/newhope/newhope_statistical_test.cc", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_newhope_test") { + sources = [ + "src/crypto/newhope/newhope_test.cc", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_newhope_vectors_test") { + sources = [ + "src/crypto/newhope/newhope_vectors_test.cc", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_obj_test") { + sources = [ + "src/crypto/obj/obj_test.cc", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_pkcs12_test") { + sources = [ + "src/crypto/pkcs8/pkcs12_test.cc", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_pkcs8_test") { + sources = [ + "src/crypto/pkcs8/pkcs8_test.cc", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_poly1305_test") { + sources = [ + "src/crypto/poly1305/poly1305_test.cc", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_refcount_test") { + sources = [ + "src/crypto/refcount_test.c", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_rsa_test") { + sources = [ + "src/crypto/rsa/rsa_test.cc", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_thread_test") { + sources = [ + "src/crypto/thread_test.c", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + 
executable("boringssl_pkcs7_test") { + sources = [ + "src/crypto/x509/pkcs7_test.c", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_x509_test") { + sources = [ + "src/crypto/x509/x509_test.cc", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_tab_test") { + sources = [ + "src/crypto/x509v3/tab_test.c", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_v3name_test") { + sources = [ + "src/crypto/x509v3/v3name_test.c", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_pqueue_test") { + sources = [ + "src/ssl/pqueue/pqueue_test.c", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + executable("boringssl_ssl_test") { + sources = [ + "src/ssl/ssl_test.cc", + ] + sources += _test_support_sources + if (defined(invoker.configs_exclude)) { + configs -= invoker.configs_exclude + } + configs += invoker.configs + deps = invoker.deps + } + + group(target_name) { + deps = [ + ":boringssl_aead_test", + ":boringssl_aes_test", + ":boringssl_asn1_test", + ":boringssl_base64_test", + ":boringssl_bio_test", + ":boringssl_bn_test", + ":boringssl_bytestring_test", + ":boringssl_chacha_test", + ":boringssl_cipher_test", + ":boringssl_cmac_test", + ":boringssl_constant_time_test", + ":boringssl_dh_test", + ":boringssl_digest_test", + ":boringssl_dsa_test", + ":boringssl_ec_test", + ":boringssl_ecdsa_test", + ":boringssl_ed25519_test", + ":boringssl_err_test", + ":boringssl_evp_extra_test", + ":boringssl_evp_test", + ":boringssl_example_mul", + ":boringssl_gcm_test", + ":boringssl_hkdf_test", + ":boringssl_hmac_test", + ":boringssl_lhash_test", + ":boringssl_newhope_statistical_test", + ":boringssl_newhope_test", + ":boringssl_newhope_vectors_test", + ":boringssl_obj_test", + ":boringssl_pbkdf_test", + ":boringssl_pkcs12_test", + ":boringssl_pkcs7_test", + ":boringssl_pkcs8_test", + ":boringssl_poly1305_test", + ":boringssl_pqueue_test", + ":boringssl_refcount_test", + ":boringssl_rsa_test", + ":boringssl_spake25519_test", + ":boringssl_ssl_test", + ":boringssl_tab_test", + ":boringssl_thread_test", + ":boringssl_v3name_test", + ":boringssl_x25519_test", + ":boringssl_x509_test", + ] + } +} diff --git a/packager/third_party/boringssl/BUILD.gn b/packager/third_party/boringssl/BUILD.gn index a8fe16edb9..fd7c960989 100644 --- a/packager/third_party/boringssl/BUILD.gn +++ b/packager/third_party/boringssl/BUILD.gn @@ -2,26 +2,51 @@ # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. +import("//build/config/android/config.gni") +import("//build/config/sanitizers/sanitizers.gni") +import("//build_overrides/build.gni") +import("//testing/libfuzzer/fuzzer_test.gni") +import("BUILD.generated.gni") +import("BUILD.generated_tests.gni") + # Config for us and everybody else depending on BoringSSL. 
-config("openssl_config") { - include_dirs = [] - include_dirs += [ "src/include" ] +config("external_config") { + include_dirs = [ "src/include" ] if (is_component_build) { defines = [ "BORINGSSL_SHARED_LIBRARY" ] } } -# Config internal to this build file. -config("openssl_internal_config") { +# Config internal to this build file, shared by boringssl and boringssl_fuzzer. +config("internal_config") { visibility = [ ":*" ] # Only targets in this file can depend on this. + defines = [ + "BORINGSSL_IMPLEMENTATION", + "BORINGSSL_NO_STATIC_INITIALIZER", + "OPENSSL_SMALL", + ] + configs = [ + # TODO(davidben): Fix size_t truncations in BoringSSL. + # https://crbug.com/429039 + "//build/config/compiler:no_size_t_to_int_warning", + ] + if (is_posix) { + cflags_c = [ "-std=c99" ] + defines += [ "_XOPEN_SOURCE=700" ] + } } -# The list of BoringSSL files is kept in boringssl.gypi. -gypi_values = - exec_script("//build/gypi_to_gn.py", - [ rebase_path("//third_party/boringssl/boringssl.gypi") ], - "scope", - [ "//third_party/boringssl/boringssl.gypi" ]) +config("no_asm_config") { + visibility = [ ":*" ] # Only targets in this file can depend on this. + defines = [ "OPENSSL_NO_ASM" ] +} + +config("fuzzer_config") { + visibility = [ ":*" ] # Only targets in this file can depend on this. + defines = [ "BORINGSSL_UNSAFE_FUZZER_MODE" ] +} + +all_sources = crypto_sources + ssl_sources # Windows' assembly is built with Yasm. The other platforms use the platform # assembler. @@ -29,75 +54,159 @@ if (is_win && !is_msan) { import("//third_party/yasm/yasm_assemble.gni") yasm_assemble("boringssl_asm") { if (current_cpu == "x64") { - sources = gypi_values.boringssl_win_x86_64_sources + sources = crypto_sources_win_x86_64 } else if (current_cpu == "x86") { - sources = gypi_values.boringssl_win_x86_sources + sources = crypto_sources_win_x86 + } + } +} else { + source_set("boringssl_asm") { + visibility = [ ":*" ] # Only targets in this file can depend on this. + + defines = [] + sources = [] + asmflags = [] + include_dirs = [ "src/include" ] + + if ((current_cpu == "arm" || current_cpu == "arm64") && is_clang) { + if (current_cpu == "arm") { + # TODO(hans) Enable integrated-as (crbug.com/124610). + asmflags += [ "-fno-integrated-as" ] + } + if (is_android) { + rebased_android_toolchain_root = + rebase_path(android_toolchain_root, root_build_dir) + + # Else /usr/bin/as gets picked up. + asmflags += [ "-B${rebased_android_toolchain_root}/bin" ] + } + } + + if (is_msan) { + public_configs = [ ":no_asm_config" ] + } else if (current_cpu == "x64") { + if (is_mac) { + sources += crypto_sources_mac_x86_64 + } else if (is_linux || is_android) { + sources += crypto_sources_linux_x86_64 + } else { + public_configs = [ ":no_asm_config" ] + } + } else if (current_cpu == "x86") { + if (is_mac) { + sources += crypto_sources_mac_x86 + } else if (is_linux || is_android) { + sources += crypto_sources_linux_x86 + } else { + public_configs = [ ":no_asm_config" ] + } + } else if (current_cpu == "arm" && (is_linux || is_android)) { + sources += crypto_sources_linux_arm + } else if (current_cpu == "arm64" && (is_linux || is_android)) { + sources += crypto_sources_linux_aarch64 + + # TODO(davidben): Remove explicit arch flag once https://crbug.com/576858 + # is fixed. 
+ asmflags += [ "-march=armv8-a+crypto" ] + } else { + public_configs = [ ":no_asm_config" ] } } } component("boringssl") { - sources = gypi_values.boringssl_crypto_sources - sources += gypi_values.boringssl_ssl_sources - - public_configs = [ ":openssl_config" ] - - cflags = [] - defines = [ - "BORINGSSL_IMPLEMENTATION", - "BORINGSSL_NO_STATIC_INITIALIZER", + sources = all_sources + deps = [ + ":boringssl_asm", ] - deps = [] - if (is_component_build) { - defines += [ "BORINGSSL_SHARED_LIBRARY" ] - } + + public_configs = [ ":external_config" ] + configs += [ ":internal_config" ] configs -= [ "//build/config/compiler:chromium_code" ] - configs += [ - "//build/config/compiler:no_chromium_code", + configs += [ "//build/config/compiler:no_chromium_code" ] - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - "//build/config/compiler:no_size_t_to_int_warning", - ] - - # Also gets the include dirs from :openssl_config - include_dirs = [ - "src/include", - - # This is for arm_arch.h, which is needed by some asm files. Since the - # asm files are generated and kept in a different directory, they - # cannot use relative paths to find this file. - "src/crypto", - ] - - if (is_msan) { - defines += [ "OPENSSL_NO_ASM" ] - } else if (current_cpu == "x64") { - if (is_mac || is_ios) { - sources += gypi_values.boringssl_mac_x86_64_sources - } else if (is_linux || is_android) { - sources += gypi_values.boringssl_linux_x86_64_sources - } else if (is_win) { - deps += [ ":boringssl_asm" ] - } else { - defines += [ "OPENSSL_NO_ASM" ] - } - } else if (current_cpu == "x86") { - if (is_mac || is_ios) { - sources += gypi_values.boringssl_mac_x86_sources - } else if (is_linux || is_android) { - sources += gypi_values.boringssl_linux_x86_sources - } else if (is_win) { - deps += [ ":boringssl_asm" ] - } else { - defines += [ "OPENSSL_NO_ASM" ] - } - } else if (current_cpu == "arm" && (is_linux || is_android)) { - sources += gypi_values.boringssl_linux_arm_sources - } else if (current_cpu == "arm64" && (is_linux || is_android)) { - sources += gypi_values.boringssl_linux_aarch64_sources - } else { - defines += [ "OPENSSL_NO_ASM" ] + if (is_nacl) { + deps += [ "//native_client_sdk/src/libraries/nacl_io" ] + } +} + +if (build_with_chromium) { + create_tests("boringssl_tests") { + configs_exclude = [ "//build/config/compiler:chromium_code" ] + configs = [ + ":internal_config", + "//build/config/compiler:no_chromium_code", + ] + deps = [ + ":boringssl", + "//build/win:default_exe_manifest", + ] + } + + if (!is_ios) { + test("boringssl_unittests") { + deps = [ + ":boringssl_tests", + "//base", + "//base/test:run_all_unittests", + "//base/test:test_support", + "//testing/gtest", + ] + sources = [ + "boringssl_unittest.cc", + ] + } + } + + # The same as boringssl, but builds with BORINGSSL_UNSAFE_FUZZER_MODE. + component("boringssl_fuzzer") { + visibility = [ ":*" ] # Only targets in this file can depend on this. 
+ + sources = all_sources + deps = [ + ":boringssl_asm", + ] + + public_configs = [ + ":external_config", + ":fuzzer_config", + ] + configs += [ ":internal_config" ] + + configs -= [ "//build/config/compiler:chromium_code" ] + configs += [ "//build/config/compiler:no_chromium_code" ] + + if (is_nacl) { + deps += [ "//native_client_sdk/src/libraries/nacl_io" ] + } + } + + foreach(fuzzer, fuzzers) { + fuzzer_test("boringssl_${fuzzer}_fuzzer") { + sources = [ + "src/fuzz/${fuzzer}.cc", + ] + deps = [ + ":boringssl_fuzzer", + ] + seed_corpus = "src/fuzz/${fuzzer}_corpus" + + if ("cert" == fuzzer) { + libfuzzer_options = [ "max_len=3072" ] + } else if ("client" == fuzzer) { + libfuzzer_options = [ "max_len=20000" ] + } else if ("pkcs8" == fuzzer) { + libfuzzer_options = [ "max_len=2048" ] + } else if ("privkey" == fuzzer) { + libfuzzer_options = [ "max_len=2048" ] + } else if ("read_pem" == fuzzer) { + libfuzzer_options = [ "max_len=512" ] + } else if ("server" == fuzzer) { + libfuzzer_options = [ "max_len=4096" ] + } else if ("spki" == fuzzer) { + libfuzzer_options = [ "max_len=1024" ] + } + } } } diff --git a/packager/third_party/boringssl/README.chromium b/packager/third_party/boringssl/README.chromium index 8c89faa633..8abcfe695d 100644 --- a/packager/third_party/boringssl/README.chromium +++ b/packager/third_party/boringssl/README.chromium @@ -2,7 +2,7 @@ Name: boringssl URL: https://boringssl.googlesource.com/boringssl Version: git License: BSDish -License File: NOTICE +License File: src/LICENSE License Android Compatible: yes Security Critical: yes @@ -13,6 +13,7 @@ https://www.imperialviolet.org/2014/06/20/boringssl.html Note: when rolling DEPS forward, remember to run cd third_party/boringssl - python src/util/generate_build_files.py chromium + python src/util/generate_build_files.py gn gyp -from a system with both Perl and Go installed. +from a system with both Perl and Go installed. Alternatively, use the +roll_boringssl.py script. diff --git a/packager/third_party/boringssl/boringssl.gyp b/packager/third_party/boringssl/boringssl.gyp index 5118fc7cc2..75d0829685 100644 --- a/packager/third_party/boringssl/boringssl.gyp +++ b/packager/third_party/boringssl/boringssl.gyp @@ -3,21 +3,35 @@ # found in the LICENSE file. { + 'includes': [ + 'boringssl.gypi', + ], + 'target_defaults': { + 'conditions': [ + ['os_posix == 1', { + 'cflags_c': [ '-std=c99' ], + 'defines': [ '_XOPEN_SOURCE=700' ], + }], + ], + }, 'targets': [ { - 'target_name': 'boringssl', + 'target_name': 'boringssl_nacl_win64', 'type': '<(component)', - 'includes': [ - 'boringssl.gypi', - ], 'sources': [ '<@(boringssl_crypto_sources)', - '<@(boringssl_ssl_sources)', ], 'defines': [ 'BORINGSSL_IMPLEMENTATION', 'BORINGSSL_NO_STATIC_INITIALIZER', + 'OPENSSL_NO_ASM', + 'OPENSSL_SMALL', ], + 'configurations': { + 'Common_Base': { + 'msvs_target_platform': 'x64', + }, + }, # TODO(davidben): Fix size_t truncations in BoringSSL. 
# https://crbug.com/429039 'msvs_disabled_warnings': [ 4267, ], @@ -27,12 +41,88 @@ 'BORINGSSL_SHARED_LIBRARY', ], }], + ], + 'include_dirs': [ + 'src/include', + ], + 'direct_dependent_settings': { + 'include_dirs': [ + 'src/include', + ], + 'conditions': [ + ['component == "shared_library"', { + 'defines': [ + 'BORINGSSL_SHARED_LIBRARY', + ], + }], + ], + }, + }, + { + 'target_name': 'boringssl', + 'type': '<(component)', + 'sources': [ + '<@(boringssl_crypto_sources)', + '<@(boringssl_ssl_sources)', + ], + 'defines': [ + 'BORINGSSL_IMPLEMENTATION', + 'BORINGSSL_NO_STATIC_INITIALIZER', + 'OPENSSL_SMALL', + ], + 'dependencies': [ 'boringssl_asm' ], + # TODO(davidben): Fix size_t truncations in BoringSSL. + # https://crbug.com/429039 + 'msvs_disabled_warnings': [ 4267, ], + 'conditions': [ + ['component == "shared_library"', { + 'defines': [ + 'BORINGSSL_SHARED_LIBRARY', + ], + }], + ], + 'include_dirs': [ + 'src/include', + ], + 'direct_dependent_settings': { + 'include_dirs': [ + 'src/include', + ], + 'conditions': [ + ['component == "shared_library"', { + 'defines': [ + 'BORINGSSL_SHARED_LIBRARY', + ], + }], + ], + }, + }, + { + # boringssl_asm is a separate target to allow for ASM-specific cflags. + 'target_name': 'boringssl_asm', + 'type': 'static_library', + 'include_dirs': [ + 'src/include', + ], + 'conditions': [ ['target_arch == "arm" and msan == 0', { 'conditions': [ ['OS == "linux" or OS == "android"', { 'sources': [ '<@(boringssl_linux_arm_sources)' ], }, { - 'defines': [ 'OPENSSL_NO_ASM' ], + 'direct_dependent_settings': { + 'defines': [ 'OPENSSL_NO_ASM' ], + }, + }], + ], + }], + ['target_arch == "arm" and clang == 1', { + # TODO(hans) Enable integrated-as (crbug.com/124610). + 'cflags': [ '-fno-integrated-as' ], + 'conditions': [ + ['OS == "android"', { + # Else /usr/bin/as gets picked up. + 'cflags': [ '-B<(android_toolchain)' ], }], ], }], @@ -40,14 +130,19 @@ 'conditions': [ ['OS == "linux" or OS == "android"', { 'sources': [ '<@(boringssl_linux_aarch64_sources)' ], + # TODO(davidben): Remove explicit arch flag once + # https://crbug.com/576858 is fixed. 
+ 'cflags': [ '-march=armv8-a+crypto' ], }, { - 'defines': [ 'OPENSSL_NO_ASM' ], + 'direct_dependent_settings': { + 'defines': [ 'OPENSSL_NO_ASM' ], + }, }], ], }], ['target_arch == "ia32" and msan == 0', { 'conditions': [ - ['OS == "mac" or OS == "ios"', { + ['OS == "mac"', { 'sources': [ '<@(boringssl_mac_x86_sources)' ], }], ['OS == "linux" or OS == "android"', { @@ -64,14 +159,16 @@ '../yasm/yasm_compile.gypi', ], }], - ['OS != "mac" and OS != "ios" and OS != "linux" and OS != "win" and OS != "android"', { - 'defines': [ 'OPENSSL_NO_ASM' ], + ['OS != "mac" and OS != "linux" and OS != "win" and OS != "android"', { + 'direct_dependent_settings': { + 'defines': [ 'OPENSSL_NO_ASM' ], + }, }], ] }], ['target_arch == "x64" and msan == 0', { 'conditions': [ - ['OS == "mac" or OS == "ios"', { + ['OS == "mac"', { 'sources': [ '<@(boringssl_mac_x86_64_sources)' ], }], ['OS == "linux" or OS == "android"', { @@ -88,34 +185,19 @@ '../yasm/yasm_compile.gypi', ], }], - ['OS != "mac" and OS != "ios" and OS != "linux" and OS != "win" and OS != "android"', { - 'defines': [ 'OPENSSL_NO_ASM' ], + ['OS != "mac" and OS != "linux" and OS != "win" and OS != "android"', { + 'direct_dependent_settings': { + 'defines': [ 'OPENSSL_NO_ASM' ], + }, }], ] }], ['msan == 1 or (target_arch != "arm" and target_arch != "ia32" and target_arch != "x64" and target_arch != "arm64")', { - 'defines': [ 'OPENSSL_NO_ASM' ], + 'direct_dependent_settings': { + 'defines': [ 'OPENSSL_NO_ASM' ], + }, }], ], - 'include_dirs': [ - 'src/include', - # This is for arm_arch.h, which is needed by some asm files. Since the - # asm files are generated and kept in a different directory, they - # cannot use relative paths to find this file. - 'src/crypto', - ], - 'direct_dependent_settings': { - 'include_dirs': [ - 'src/include', - ], - 'conditions': [ - ['component == "shared_library"', { - 'defines': [ - 'BORINGSSL_SHARED_LIBRARY', - ], - }], - ], - }, }, ], } diff --git a/packager/third_party/boringssl/boringssl.gypi b/packager/third_party/boringssl/boringssl.gypi index 1aaade461c..65a483e90e 100644 --- a/packager/third_party/boringssl/boringssl.gypi +++ b/packager/third_party/boringssl/boringssl.gypi @@ -1,4 +1,4 @@ -# Copyright (c) 2014 The Chromium Authors. All rights reserved. +# Copyright (c) 2016 The Chromium Authors. All rights reserved. # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. 
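Background for the hunk below: .gyp/.gypi files are Python dict literals, which is why the GN build above could previously consume boringssl.gypi through //build/gypi_to_gn.py, and why a plain Python script can load the file too. A minimal sketch (hypothetical helper, not part of this patch) that parses boringssl.gypi and verifies every listed source exists on disk — a useful sanity check after regenerating the build files by hand:

    # check_gypi_sources.py -- hypothetical sanity check, not in this patch.
    import ast
    import os
    import sys

    def load_gypi(path):
        # .gypi files are Python dict literals; boringssl.gypi only uses
        # full-line comments, so stripping those lets ast.literal_eval
        # parse the rest without executing anything.
        with open(path) as f:
            text = "".join(l for l in f if not l.lstrip().startswith("#"))
        return ast.literal_eval(text.strip())

    def main(gypi_path):
        root = os.path.dirname(os.path.abspath(gypi_path))
        variables = load_gypi(gypi_path)["variables"]
        missing = [(name, src)
                   for name, sources in variables.items()
                   if name.endswith("_sources")
                   for src in sources
                   if not os.path.exists(os.path.join(root, src))]
        for name, src in missing:
            print("missing: %s (listed in %s)" % (src, name))
        return 1 if missing else 0

    if __name__ == "__main__":
        sys.exit(main(sys.argv[1]))
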
@@ -7,33 +7,35 @@ { 'variables': { 'boringssl_ssl_sources': [ + 'src/ssl/custom_extensions.c', 'src/ssl/d1_both.c', - 'src/ssl/d1_clnt.c', 'src/ssl/d1_lib.c', 'src/ssl/d1_meth.c', 'src/ssl/d1_pkt.c', 'src/ssl/d1_srtp.c', - 'src/ssl/d1_srvr.c', + 'src/ssl/dtls_record.c', + 'src/ssl/handshake_client.c', + 'src/ssl/handshake_server.c', 'src/ssl/pqueue/pqueue.c', 'src/ssl/s3_both.c', - 'src/ssl/s3_clnt.c', 'src/ssl/s3_enc.c', 'src/ssl/s3_lib.c', 'src/ssl/s3_meth.c', 'src/ssl/s3_pkt.c', - 'src/ssl/s3_srvr.c', 'src/ssl/ssl_aead_ctx.c', - 'src/ssl/ssl_algs.c', 'src/ssl/ssl_asn1.c', + 'src/ssl/ssl_buffer.c', 'src/ssl/ssl_cert.c', 'src/ssl/ssl_cipher.c', + 'src/ssl/ssl_ecdh.c', + 'src/ssl/ssl_file.c', 'src/ssl/ssl_lib.c', 'src/ssl/ssl_rsa.c', - 'src/ssl/ssl_sess.c', + 'src/ssl/ssl_session.c', 'src/ssl/ssl_stat.c', - 'src/ssl/ssl_txt.c', 'src/ssl/t1_enc.c', 'src/ssl/t1_lib.c', + 'src/ssl/tls_record.c', ], 'boringssl_crypto_sources': [ 'err_data.c', @@ -60,18 +62,14 @@ 'src/crypto/asn1/asn1_lib.c', 'src/crypto/asn1/asn1_par.c', 'src/crypto/asn1/asn_pack.c', - 'src/crypto/asn1/bio_asn1.c', - 'src/crypto/asn1/bio_ndef.c', 'src/crypto/asn1/f_enum.c', 'src/crypto/asn1/f_int.c', 'src/crypto/asn1/f_string.c', 'src/crypto/asn1/t_bitst.c', - 'src/crypto/asn1/t_pkey.c', 'src/crypto/asn1/tasn_dec.c', 'src/crypto/asn1/tasn_enc.c', 'src/crypto/asn1/tasn_fre.c', 'src/crypto/asn1/tasn_new.c', - 'src/crypto/asn1/tasn_prn.c', 'src/crypto/asn1/tasn_typ.c', 'src/crypto/asn1/tasn_utl.c', 'src/crypto/asn1/x_bignum.c', @@ -108,11 +106,11 @@ 'src/crypto/bn/shift.c', 'src/crypto/bn/sqrt.c', 'src/crypto/buf/buf.c', + 'src/crypto/bytestring/asn1_compat.c', 'src/crypto/bytestring/ber.c', 'src/crypto/bytestring/cbb.c', 'src/crypto/bytestring/cbs.c', - 'src/crypto/chacha/chacha_generic.c', - 'src/crypto/chacha/chacha_vec.c', + 'src/crypto/chacha/chacha.c', 'src/crypto/cipher/aead.c', 'src/crypto/cipher/cipher.c', 'src/crypto/cipher/derive_key.c', @@ -127,28 +125,31 @@ 'src/crypto/cipher/tls_cbc.c', 'src/crypto/cmac/cmac.c', 'src/crypto/conf/conf.c', + 'src/crypto/cpu-aarch64-linux.c', + 'src/crypto/cpu-arm-linux.c', 'src/crypto/cpu-arm.c', 'src/crypto/cpu-intel.c', 'src/crypto/crypto.c', + 'src/crypto/curve25519/curve25519.c', + 'src/crypto/curve25519/spake25519.c', + 'src/crypto/curve25519/x25519-x86_64.c', 'src/crypto/des/des.c', 'src/crypto/dh/check.c', 'src/crypto/dh/dh.c', 'src/crypto/dh/dh_asn1.c', - 'src/crypto/dh/dh_impl.c', 'src/crypto/dh/params.c', 'src/crypto/digest/digest.c', 'src/crypto/digest/digests.c', - 'src/crypto/directory_posix.c', - 'src/crypto/directory_win.c', 'src/crypto/dsa/dsa.c', 'src/crypto/dsa/dsa_asn1.c', - 'src/crypto/dsa/dsa_impl.c', 'src/crypto/ec/ec.c', 'src/crypto/ec/ec_asn1.c', 'src/crypto/ec/ec_key.c', 'src/crypto/ec/ec_montgomery.c', 'src/crypto/ec/oct.c', + 'src/crypto/ec/p224-64.c', 'src/crypto/ec/p256-64.c', + 'src/crypto/ec/p256-x86_64.c', 'src/crypto/ec/simple.c', 'src/crypto/ec/util-64.c', 'src/crypto/ec/wnaf.c', @@ -157,7 +158,6 @@ 'src/crypto/ecdsa/ecdsa_asn1.c', 'src/crypto/engine/engine.c', 'src/crypto/err/err.c', - 'src/crypto/evp/algorithm.c', 'src/crypto/evp/digestsign.c', 'src/crypto/evp/evp.c', 'src/crypto/evp/evp_asn1.c', @@ -168,6 +168,7 @@ 'src/crypto/evp/p_rsa.c', 'src/crypto/evp/p_rsa_asn1.c', 'src/crypto/evp/pbkdf.c', + 'src/crypto/evp/print.c', 'src/crypto/evp/sign.c', 'src/crypto/ex_data.c', 'src/crypto/hkdf/hkdf.c', @@ -181,6 +182,12 @@ 'src/crypto/modes/ctr.c', 'src/crypto/modes/gcm.c', 'src/crypto/modes/ofb.c', + 'src/crypto/newhope/error_correction.c', + 
'src/crypto/newhope/newhope.c', + 'src/crypto/newhope/ntt.c', + 'src/crypto/newhope/poly.c', + 'src/crypto/newhope/precomp.c', + 'src/crypto/newhope/reduce.c', 'src/crypto/obj/obj.c', 'src/crypto/obj/obj_xref.c', 'src/crypto/pem/pem_all.c', @@ -198,7 +205,7 @@ 'src/crypto/poly1305/poly1305.c', 'src/crypto/poly1305/poly1305_arm.c', 'src/crypto/poly1305/poly1305_vec.c', - 'src/crypto/rand/hwrand.c', + 'src/crypto/rand/deterministic.c', 'src/crypto/rand/rand.c', 'src/crypto/rand/urandom.c', 'src/crypto/rand/windows.c', @@ -223,11 +230,13 @@ 'src/crypto/x509/a_sign.c', 'src/crypto/x509/a_strex.c', 'src/crypto/x509/a_verify.c', + 'src/crypto/x509/algorithm.c', 'src/crypto/x509/asn1_gen.c', 'src/crypto/x509/by_dir.c', 'src/crypto/x509/by_file.c', 'src/crypto/x509/i2d_pr.c', 'src/crypto/x509/pkcs7.c', + 'src/crypto/x509/rsa_pss.c', 'src/crypto/x509/t_crl.c', 'src/crypto/x509/t_req.c', 'src/crypto/x509/t_x509.c', @@ -303,6 +312,8 @@ ], 'boringssl_linux_aarch64_sources': [ 'linux-aarch64/crypto/aes/aesv8-armx64.S', + 'linux-aarch64/crypto/bn/armv8-mont.S', + 'linux-aarch64/crypto/chacha/chacha-armv8.S', 'linux-aarch64/crypto/modes/ghashv8-armx64.S', 'linux-aarch64/crypto/sha/sha1-armv8.S', 'linux-aarch64/crypto/sha/sha256-armv8.S', @@ -313,13 +324,13 @@ 'linux-arm/crypto/aes/aesv8-armx32.S', 'linux-arm/crypto/aes/bsaes-armv7.S', 'linux-arm/crypto/bn/armv4-mont.S', + 'linux-arm/crypto/chacha/chacha-armv4.S', 'linux-arm/crypto/modes/ghash-armv4.S', 'linux-arm/crypto/modes/ghashv8-armx32.S', 'linux-arm/crypto/sha/sha1-armv4-large.S', 'linux-arm/crypto/sha/sha256-armv4.S', 'linux-arm/crypto/sha/sha512-armv4.S', - 'src/crypto/chacha/chacha_vec_arm.S', - 'src/crypto/cpu-arm-asm.S', + 'src/crypto/curve25519/asm/x25519-asm-arm.S', 'src/crypto/poly1305/poly1305_arm_asm.S', ], 'boringssl_linux_x86_sources': [ @@ -329,6 +340,7 @@ 'linux-x86/crypto/bn/bn-586.S', 'linux-x86/crypto/bn/co-586.S', 'linux-x86/crypto/bn/x86-mont.S', + 'linux-x86/crypto/chacha/chacha-x86.S', 'linux-x86/crypto/md5/md5-586.S', 'linux-x86/crypto/modes/ghash-x86.S', 'linux-x86/crypto/rc4/rc4-586.S', @@ -345,15 +357,17 @@ 'linux-x86_64/crypto/bn/rsaz-x86_64.S', 'linux-x86_64/crypto/bn/x86_64-mont.S', 'linux-x86_64/crypto/bn/x86_64-mont5.S', + 'linux-x86_64/crypto/chacha/chacha-x86_64.S', + 'linux-x86_64/crypto/ec/p256-x86_64-asm.S', 'linux-x86_64/crypto/md5/md5-x86_64.S', 'linux-x86_64/crypto/modes/aesni-gcm-x86_64.S', 'linux-x86_64/crypto/modes/ghash-x86_64.S', 'linux-x86_64/crypto/rand/rdrand-x86_64.S', - 'linux-x86_64/crypto/rc4/rc4-md5-x86_64.S', 'linux-x86_64/crypto/rc4/rc4-x86_64.S', 'linux-x86_64/crypto/sha/sha1-x86_64.S', 'linux-x86_64/crypto/sha/sha256-x86_64.S', 'linux-x86_64/crypto/sha/sha512-x86_64.S', + 'src/crypto/curve25519/asm/x25519-asm-x86_64.S', ], 'boringssl_mac_x86_sources': [ 'mac-x86/crypto/aes/aes-586.S', @@ -362,6 +376,7 @@ 'mac-x86/crypto/bn/bn-586.S', 'mac-x86/crypto/bn/co-586.S', 'mac-x86/crypto/bn/x86-mont.S', + 'mac-x86/crypto/chacha/chacha-x86.S', 'mac-x86/crypto/md5/md5-586.S', 'mac-x86/crypto/modes/ghash-x86.S', 'mac-x86/crypto/rc4/rc4-586.S', @@ -378,15 +393,17 @@ 'mac-x86_64/crypto/bn/rsaz-x86_64.S', 'mac-x86_64/crypto/bn/x86_64-mont.S', 'mac-x86_64/crypto/bn/x86_64-mont5.S', + 'mac-x86_64/crypto/chacha/chacha-x86_64.S', + 'mac-x86_64/crypto/ec/p256-x86_64-asm.S', 'mac-x86_64/crypto/md5/md5-x86_64.S', 'mac-x86_64/crypto/modes/aesni-gcm-x86_64.S', 'mac-x86_64/crypto/modes/ghash-x86_64.S', 'mac-x86_64/crypto/rand/rdrand-x86_64.S', - 'mac-x86_64/crypto/rc4/rc4-md5-x86_64.S', 
'mac-x86_64/crypto/rc4/rc4-x86_64.S', 'mac-x86_64/crypto/sha/sha1-x86_64.S', 'mac-x86_64/crypto/sha/sha256-x86_64.S', 'mac-x86_64/crypto/sha/sha512-x86_64.S', + 'src/crypto/curve25519/asm/x25519-asm-x86_64.S', ], 'boringssl_win_x86_sources': [ 'win-x86/crypto/aes/aes-586.asm', @@ -395,6 +412,7 @@ 'win-x86/crypto/bn/bn-586.asm', 'win-x86/crypto/bn/co-586.asm', 'win-x86/crypto/bn/x86-mont.asm', + 'win-x86/crypto/chacha/chacha-x86.asm', 'win-x86/crypto/md5/md5-586.asm', 'win-x86/crypto/modes/ghash-x86.asm', 'win-x86/crypto/rc4/rc4-586.asm', @@ -411,11 +429,12 @@ 'win-x86_64/crypto/bn/rsaz-x86_64.asm', 'win-x86_64/crypto/bn/x86_64-mont.asm', 'win-x86_64/crypto/bn/x86_64-mont5.asm', + 'win-x86_64/crypto/chacha/chacha-x86_64.asm', + 'win-x86_64/crypto/ec/p256-x86_64-asm.asm', 'win-x86_64/crypto/md5/md5-x86_64.asm', 'win-x86_64/crypto/modes/aesni-gcm-x86_64.asm', 'win-x86_64/crypto/modes/ghash-x86_64.asm', 'win-x86_64/crypto/rand/rdrand-x86_64.asm', - 'win-x86_64/crypto/rc4/rc4-md5-x86_64.asm', 'win-x86_64/crypto/rc4/rc4-x86_64.asm', 'win-x86_64/crypto/sha/sha1-x86_64.asm', 'win-x86_64/crypto/sha/sha256-x86_64.asm', diff --git a/packager/third_party/boringssl/boringssl_nacl.gyp b/packager/third_party/boringssl/boringssl_nacl.gyp index e560c29e9c..b99e00221d 100644 --- a/packager/third_party/boringssl/boringssl_nacl.gyp +++ b/packager/third_party/boringssl/boringssl_nacl.gyp @@ -31,10 +31,6 @@ ], 'include_dirs': [ 'src/include', - # This is for arm_arch.h, which is needed by some asm files. Since the - # asm files are generated and kept in a different directory, they - # cannot use relative paths to find this file. - 'src/crypto', ], 'direct_dependent_settings': { 'include_dirs': [ diff --git a/packager/third_party/boringssl/boringssl_tests.gyp b/packager/third_party/boringssl/boringssl_tests.gyp index a875bfecd4..a9160315b9 100644 --- a/packager/third_party/boringssl/boringssl_tests.gyp +++ b/packager/third_party/boringssl/boringssl_tests.gyp @@ -6,20 +6,24 @@ 'includes': [ 'boringssl_tests.gypi', ], - 'targets': [ - { - 'target_name': 'boringssl_unittests', - 'type': 'executable', - 'sources': [ - 'boringssl_unittest.cc', - ], - 'dependencies': [ - '<@(boringssl_test_targets)', - '../../base/base.gyp:base', - '../../base/base.gyp:run_all_unittests', - '../../base/base.gyp:test_support_base', - '../../testing/gtest.gyp:gtest', + 'conditions': [ + ['OS!="ios"', { + 'targets': [ + { + 'target_name': 'boringssl_unittests', + 'type': 'executable', + 'sources': [ + 'boringssl_unittest.cc', + ], + 'dependencies': [ + '<@(boringssl_test_targets)', + '../../base/base.gyp:base', + '../../base/base.gyp:run_all_unittests', + '../../base/base.gyp:test_support_base', + '../../testing/gtest.gyp:gtest', + ], + }, ], - }, + }], ], } diff --git a/packager/third_party/boringssl/boringssl_tests.gypi b/packager/third_party/boringssl/boringssl_tests.gypi index 675b913249..8542e5c20a 100644 --- a/packager/third_party/boringssl/boringssl_tests.gypi +++ b/packager/third_party/boringssl/boringssl_tests.gypi @@ -1,4 +1,4 @@ -# Copyright (c) 2014 The Chromium Authors. All rights reserved. +# Copyright (c) 2016 The Chromium Authors. All rights reserved. # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. 
@@ -20,6 +20,20 @@ # https://crbug.com/429039 'msvs_disabled_warnings': [ 4267, ], }, + { + 'target_name': 'boringssl_asn1_test', + 'type': 'executable', + 'dependencies': [ + 'boringssl.gyp:boringssl', + ], + 'sources': [ + 'src/crypto/asn1/asn1_test.cc', + '<@(boringssl_test_support_sources)', + ], + # TODO(davidben): Fix size_t truncations in BoringSSL. + # https://crbug.com/429039 + 'msvs_disabled_warnings': [ 4267, ], + }, { 'target_name': 'boringssl_base64_test', 'type': 'executable', @@ -76,6 +90,20 @@ # https://crbug.com/429039 'msvs_disabled_warnings': [ 4267, ], }, + { + 'target_name': 'boringssl_chacha_test', + 'type': 'executable', + 'dependencies': [ + 'boringssl.gyp:boringssl', + ], + 'sources': [ + 'src/crypto/chacha/chacha_test.cc', + '<@(boringssl_test_support_sources)', + ], + # TODO(davidben): Fix size_t truncations in BoringSSL. + # https://crbug.com/429039 + 'msvs_disabled_warnings': [ 4267, ], + }, { 'target_name': 'boringssl_aead_test', 'type': 'executable', @@ -132,6 +160,48 @@ # https://crbug.com/429039 'msvs_disabled_warnings': [ 4267, ], }, + { + 'target_name': 'boringssl_ed25519_test', + 'type': 'executable', + 'dependencies': [ + 'boringssl.gyp:boringssl', + ], + 'sources': [ + 'src/crypto/curve25519/ed25519_test.cc', + '<@(boringssl_test_support_sources)', + ], + # TODO(davidben): Fix size_t truncations in BoringSSL. + # https://crbug.com/429039 + 'msvs_disabled_warnings': [ 4267, ], + }, + { + 'target_name': 'boringssl_spake25519_test', + 'type': 'executable', + 'dependencies': [ + 'boringssl.gyp:boringssl', + ], + 'sources': [ + 'src/crypto/curve25519/spake25519_test.cc', + '<@(boringssl_test_support_sources)', + ], + # TODO(davidben): Fix size_t truncations in BoringSSL. + # https://crbug.com/429039 + 'msvs_disabled_warnings': [ 4267, ], + }, + { + 'target_name': 'boringssl_x25519_test', + 'type': 'executable', + 'dependencies': [ + 'boringssl.gyp:boringssl', + ], + 'sources': [ + 'src/crypto/curve25519/x25519_test.cc', + '<@(boringssl_test_support_sources)', + ], + # TODO(davidben): Fix size_t truncations in BoringSSL. + # https://crbug.com/429039 + 'msvs_disabled_warnings': [ 4267, ], + }, { 'target_name': 'boringssl_dh_test', 'type': 'executable', @@ -328,6 +398,62 @@ # https://crbug.com/429039 'msvs_disabled_warnings': [ 4267, ], }, + { + 'target_name': 'boringssl_newhope_statistical_test', + 'type': 'executable', + 'dependencies': [ + 'boringssl.gyp:boringssl', + ], + 'sources': [ + 'src/crypto/newhope/newhope_statistical_test.cc', + '<@(boringssl_test_support_sources)', + ], + # TODO(davidben): Fix size_t truncations in BoringSSL. + # https://crbug.com/429039 + 'msvs_disabled_warnings': [ 4267, ], + }, + { + 'target_name': 'boringssl_newhope_test', + 'type': 'executable', + 'dependencies': [ + 'boringssl.gyp:boringssl', + ], + 'sources': [ + 'src/crypto/newhope/newhope_test.cc', + '<@(boringssl_test_support_sources)', + ], + # TODO(davidben): Fix size_t truncations in BoringSSL. + # https://crbug.com/429039 + 'msvs_disabled_warnings': [ 4267, ], + }, + { + 'target_name': 'boringssl_newhope_vectors_test', + 'type': 'executable', + 'dependencies': [ + 'boringssl.gyp:boringssl', + ], + 'sources': [ + 'src/crypto/newhope/newhope_vectors_test.cc', + '<@(boringssl_test_support_sources)', + ], + # TODO(davidben): Fix size_t truncations in BoringSSL. 
+ # https://crbug.com/429039 + 'msvs_disabled_warnings': [ 4267, ], + }, + { + 'target_name': 'boringssl_obj_test', + 'type': 'executable', + 'dependencies': [ + 'boringssl.gyp:boringssl', + ], + 'sources': [ + 'src/crypto/obj/obj_test.cc', + '<@(boringssl_test_support_sources)', + ], + # TODO(davidben): Fix size_t truncations in BoringSSL. + # https://crbug.com/429039 + 'msvs_disabled_warnings': [ 4267, ], + }, { 'target_name': 'boringssl_pkcs12_test', 'type': 'executable', @@ -342,6 +468,20 @@ # https://crbug.com/429039 'msvs_disabled_warnings': [ 4267, ], }, + { + 'target_name': 'boringssl_pkcs8_test', + 'type': 'executable', + 'dependencies': [ + 'boringssl.gyp:boringssl', + ], + 'sources': [ + 'src/crypto/pkcs8/pkcs8_test.cc', + '<@(boringssl_test_support_sources)', + ], + # TODO(davidben): Fix size_t truncations in BoringSSL. + # https://crbug.com/429039 + 'msvs_disabled_warnings': [ 4267, ], + }, { 'target_name': 'boringssl_poly1305_test', 'type': 'executable', @@ -412,6 +552,20 @@ # https://crbug.com/429039 'msvs_disabled_warnings': [ 4267, ], }, + { + 'target_name': 'boringssl_x509_test', + 'type': 'executable', + 'dependencies': [ + 'boringssl.gyp:boringssl', + ], + 'sources': [ + 'src/crypto/x509/x509_test.cc', + '<@(boringssl_test_support_sources)', + ], + # TODO(davidben): Fix size_t truncations in BoringSSL. + # https://crbug.com/429039 + 'msvs_disabled_warnings': [ 4267, ], + }, { 'target_name': 'boringssl_tab_test', 'type': 'executable', @@ -472,15 +626,25 @@ 'variables': { 'boringssl_test_support_sources': [ 'src/crypto/test/file_test.cc', + 'src/crypto/test/file_test.h', 'src/crypto/test/malloc.cc', + 'src/crypto/test/scoped_types.h', + 'src/crypto/test/test_util.cc', + 'src/crypto/test/test_util.h', + 'src/ssl/test/async_bio.h', + 'src/ssl/test/packeted_bio.h', + 'src/ssl/test/scoped_types.h', + 'src/ssl/test/test_config.h', ], 'boringssl_test_targets': [ 'boringssl_aead_test', 'boringssl_aes_test', + 'boringssl_asn1_test', 'boringssl_base64_test', 'boringssl_bio_test', 'boringssl_bn_test', 'boringssl_bytestring_test', + 'boringssl_chacha_test', 'boringssl_cipher_test', 'boringssl_cmac_test', 'boringssl_constant_time_test', @@ -489,6 +653,7 @@ 'boringssl_dsa_test', 'boringssl_ec_test', 'boringssl_ecdsa_test', + 'boringssl_ed25519_test', 'boringssl_err_test', 'boringssl_evp_extra_test', 'boringssl_evp_test', @@ -497,17 +662,25 @@ 'boringssl_hkdf_test', 'boringssl_hmac_test', 'boringssl_lhash_test', + 'boringssl_newhope_statistical_test', + 'boringssl_newhope_test', + 'boringssl_newhope_vectors_test', + 'boringssl_obj_test', 'boringssl_pbkdf_test', 'boringssl_pkcs12_test', 'boringssl_pkcs7_test', + 'boringssl_pkcs8_test', 'boringssl_poly1305_test', 'boringssl_pqueue_test', 'boringssl_refcount_test', 'boringssl_rsa_test', + 'boringssl_spake25519_test', 'boringssl_ssl_test', 'boringssl_tab_test', 'boringssl_thread_test', 'boringssl_v3name_test', + 'boringssl_x25519_test', + 'boringssl_x509_test', ], } } diff --git a/packager/third_party/boringssl/boringssl_unittest.cc b/packager/third_party/boringssl/boringssl_unittest.cc index 75b9db80bf..068dafffa6 100644 --- a/packager/third_party/boringssl/boringssl_unittest.cc +++ b/packager/third_party/boringssl/boringssl_unittest.cc @@ -161,6 +161,10 @@ TEST(BoringSSL, ByteString) { TestSimple("bytestring_test"); } +TEST(BoringSSL, ChaCha) { + TestSimple("chacha_test"); +} + TEST(BoringSSL, Cipher) { base::FilePath data_file; ASSERT_TRUE(CryptoCipherTestPath(&data_file)); @@ -200,6 +204,19 @@ TEST(BoringSSL, ECDSA) { 
TestSimple("ecdsa_test"); } +TEST(BoringSSL, ED25519) { + base::FilePath data_file; + ASSERT_TRUE(BoringSSLPath(&data_file)); + data_file = data_file.Append(FILE_PATH_LITERAL("crypto")); + data_file = data_file.Append(FILE_PATH_LITERAL("curve25519")); + data_file = data_file.Append(FILE_PATH_LITERAL("ed25519_tests.txt")); + + std::vector args; + args.push_back(data_file.value()); + + TestProcess("ed25519_test", args); +} + TEST(BoringSSL, ERR) { TestSimple("err_test"); } @@ -250,6 +267,23 @@ TEST(BoringSSL, LH) { TestSimple("lhash_test"); } +TEST(BoringSSL, NewHope) { + TestSimple("newhope_test"); +} + +TEST(BoringSSL, NewHopeVectors) { + base::FilePath data_file; + ASSERT_TRUE(BoringSSLPath(&data_file)); + data_file = data_file.Append(FILE_PATH_LITERAL("crypto")); + data_file = data_file.Append(FILE_PATH_LITERAL("newhope")); + data_file = data_file.Append(FILE_PATH_LITERAL("newhope_test.txt")); + + std::vector args; + args.push_back(data_file.value()); + + TestProcess("newhope_vectors_test", args); +} + TEST(BoringSSL, PBKDF) { TestSimple("pbkdf_test"); } @@ -271,6 +305,10 @@ TEST(BoringSSL, PKCS7) { TestSimple("pkcs7_test"); } +TEST(BoringSSL, PKCS8) { + TestSimple("pkcs8_test"); +} + TEST(BoringSSL, PKCS12) { TestSimple("pkcs12_test"); } @@ -302,3 +340,11 @@ TEST(BoringSSL, Thread) { TEST(BoringSSL, V3NameTest) { TestSimple("v3name_test"); } + +TEST(BoringSSL, X25519) { + TestSimple("x25519_test"); +} + +TEST(BoringSSL, X509) { + TestSimple("x509_test"); +} diff --git a/packager/third_party/boringssl/err_data.c b/packager/third_party/boringssl/err_data.c index 43a4c7e605..32976ad424 100644 --- a/packager/third_party/boringssl/err_data.c +++ b/packager/third_party/boringssl/err_data.c @@ -54,178 +54,166 @@ OPENSSL_COMPILE_ASSERT(ERR_LIB_USER == 32, library_values_changed_32); OPENSSL_COMPILE_ASSERT(ERR_NUM_LIBS == 33, library_values_changed_num); const uint32_t kOpenSSLReasonValues[] = { - 0xc3207ba, - 0xc3287d4, - 0xc3307e3, - 0xc3387f3, - 0xc340802, - 0xc34881b, - 0xc350827, - 0xc358844, - 0xc360856, - 0xc368864, - 0xc370874, - 0xc378881, - 0xc380891, - 0xc38889c, - 0xc3908b2, - 0xc3988c1, - 0xc3a08d5, - 0xc3a87c7, - 0xc3b00b0, - 0x10321478, - 0x10329484, - 0x1033149d, - 0x103394b0, - 0x10340de1, - 0x103494cf, - 0x103514e4, - 0x10359516, - 0x1036152f, - 0x10369544, - 0x10371562, - 0x10379571, - 0x1038158d, - 0x103895a8, - 0x103915b7, - 0x103995d3, - 0x103a15ee, - 0x103a9605, - 0x103b1616, - 0x103b962a, - 0x103c1649, - 0x103c9658, - 0x103d166f, - 0x103d9682, - 0x103e0b6c, - 0x103e96b3, - 0x103f16c6, - 0x103f96e0, - 0x104016f0, - 0x10409704, - 0x1041171a, - 0x10419732, - 0x10421747, - 0x1042975b, - 0x1043176d, - 0x104385d0, - 0x104408c1, - 0x10449782, - 0x10451799, - 0x104597ae, - 0x104617bc, - 0x10469695, - 0x104714f7, - 0x104787c7, - 0x104800b0, - 0x104894c3, - 0x14320b4f, - 0x14328b5d, - 0x14330b6c, - 0x14338b7e, + 0xc320838, + 0xc328852, + 0xc330861, + 0xc338871, + 0xc340880, + 0xc348899, + 0xc3508a5, + 0xc3588c2, + 0xc3608d4, + 0xc3688e2, + 0xc3708f2, + 0xc3788ff, + 0xc38090f, + 0xc38891a, + 0xc390930, + 0xc39893f, + 0xc3a0953, + 0xc3a8845, + 0xc3b00ea, + 0x10320845, + 0x1032939a, + 0x103313a6, + 0x103393bf, + 0x103413d2, + 0x10348e7a, + 0x10350c19, + 0x103593e5, + 0x103613fa, + 0x1036940d, + 0x1037142c, + 0x10379445, + 0x1038145a, + 0x10389478, + 0x10391487, + 0x103994a3, + 0x103a14be, + 0x103a94cd, + 0x103b14e9, + 0x103b9504, + 0x103c151b, + 0x103c80ea, + 0x103d152c, + 0x103d9540, + 0x103e155f, + 0x103e956e, + 0x103f1585, + 0x103f9598, + 0x10400bea, + 0x104095ab, + 0x104115c9, + 
0x104195dc, + 0x104215f6, + 0x10429606, + 0x1043161a, + 0x10439630, + 0x10441648, + 0x1044965d, + 0x10451671, + 0x10459683, + 0x104605fb, + 0x1046893f, + 0x10471698, + 0x104796af, + 0x104816c4, + 0x104896d2, + 0x14320bcd, + 0x14328bdb, + 0x14330bea, + 0x14338bfc, + 0x143400ac, + 0x143480ea, 0x18320083, - 0x18328e47, - 0x18340e75, - 0x18348e89, - 0x18358ec0, - 0x18368eed, - 0x18370f00, - 0x18378f14, - 0x18380f38, - 0x18388f46, - 0x18390f5c, - 0x18398f70, - 0x183a0f80, - 0x183b0f90, - 0x183b8fa5, - 0x183c8fd0, - 0x183d0fe4, - 0x183d8ff4, - 0x183e0b9b, - 0x183e9001, - 0x183f1013, - 0x183f901e, - 0x1840102e, - 0x1840903f, - 0x18411050, - 0x18419062, - 0x1842108b, - 0x184290bd, - 0x184310cc, - 0x18451135, - 0x1845914b, - 0x18461166, - 0x18468ed8, - 0x184709d9, - 0x18478094, - 0x18480fbc, - 0x18489101, - 0x18490e5d, - 0x18498e9e, - 0x184a119c, - 0x184a9119, - 0x184b10e0, - 0x184b8e37, - 0x184c10a4, - 0x184c866b, - 0x184d1181, - 0x203211c3, - 0x243211cf, - 0x24328907, - 0x243311e1, - 0x243391ee, - 0x243411fb, - 0x2434920d, - 0x2435121c, - 0x24359239, - 0x24361246, - 0x24369254, - 0x24371262, - 0x24379270, - 0x24381279, - 0x24389286, - 0x24391299, - 0x28320b8f, - 0x28328b9b, - 0x28330b6c, - 0x28338bae, - 0x2c322b64, - 0x2c32ab72, - 0x2c332b84, - 0x2c33ab96, - 0x2c342baa, - 0x2c34abbc, - 0x2c352bd7, - 0x2c35abe9, - 0x2c362bfc, - 0x2c3682f3, - 0x2c372c09, - 0x2c37ac1b, - 0x2c382c2e, - 0x2c38ac3c, - 0x2c392c4c, - 0x2c39ac5e, - 0x2c3a2c72, - 0x2c3aac83, - 0x2c3b1359, - 0x2c3bac94, - 0x2c3c2ca8, - 0x2c3cacbe, - 0x2c3d2cd7, - 0x2c3dad05, - 0x2c3e2d13, - 0x2c3ead2b, - 0x2c3f2d43, - 0x2c3fad50, - 0x2c402d73, - 0x2c40ad92, - 0x2c4111c3, - 0x2c41ada3, - 0x2c422db6, - 0x2c429135, - 0x2c432dc7, - 0x2c4386a2, - 0x2c442cf4, + 0x18328ed0, + 0x183300ac, + 0x18338ee6, + 0x18340efa, + 0x183480ea, + 0x18350f0f, + 0x18358f27, + 0x18360f3c, + 0x18368f50, + 0x18370f74, + 0x18378f8a, + 0x18380f9e, + 0x18388fae, + 0x18390a57, + 0x18398fbe, + 0x183a0fd3, + 0x183a8fe7, + 0x183b0c25, + 0x183b8ff4, + 0x183c1006, + 0x183c9011, + 0x183d1021, + 0x183d9032, + 0x183e1043, + 0x183e9055, + 0x183f107e, + 0x183f9097, + 0x184010af, + 0x184086d3, + 0x203210d6, + 0x243210e2, + 0x24328985, + 0x243310f4, + 0x24339101, + 0x2434110e, + 0x24349120, + 0x2435112f, + 0x2435914c, + 0x24361159, + 0x24369167, + 0x24371175, + 0x24379183, + 0x2438118c, + 0x24389199, + 0x243911ac, + 0x28320c0d, + 0x28328c25, + 0x28330bea, + 0x28338c38, + 0x28340c19, + 0x283480ac, + 0x283500ea, + 0x2c32274a, + 0x2c32a758, + 0x2c33276a, + 0x2c33a77c, + 0x2c342790, + 0x2c34a7a2, + 0x2c3527bd, + 0x2c35a7cf, + 0x2c3627e2, + 0x2c36832d, + 0x2c3727ef, + 0x2c37a801, + 0x2c382814, + 0x2c38a82b, + 0x2c392839, + 0x2c39a849, + 0x2c3a285b, + 0x2c3aa86f, + 0x2c3b2880, + 0x2c3ba89f, + 0x2c3c28b3, + 0x2c3ca8c9, + 0x2c3d28e2, + 0x2c3da8ff, + 0x2c3e2910, + 0x2c3ea91e, + 0x2c3f2936, + 0x2c3fa94e, + 0x2c40295b, + 0x2c4090d6, + 0x2c41296c, + 0x2c41a97f, + 0x2c4210af, + 0x2c42a990, + 0x2c430720, + 0x2c43a891, 0x30320000, 0x30328015, 0x3033001f, @@ -235,472 +223,449 @@ const uint32_t kOpenSSLReasonValues[] = { 0x3035006b, 0x30358083, 0x30360094, - 0x303680a1, - 0x303700b0, - 0x303780bd, - 0x303800d0, - 0x303880eb, - 0x30390100, - 0x30398114, - 0x303a0128, - 0x303a8139, - 0x303b0152, - 0x303b816f, - 0x303c017d, - 0x303c8191, - 0x303d01a1, - 0x303d81ba, - 0x303e01ca, - 0x303e81dd, - 0x303f01ec, - 0x303f81f8, - 0x3040020d, - 0x3040821d, - 0x30410234, - 0x30418241, - 0x30420254, - 0x30428263, - 0x30430278, - 0x30438299, - 0x304402ac, - 0x304482bf, - 0x304502d8, - 0x304582f3, - 0x30460310, - 
0x30468329, - 0x30470337, - 0x30478348, - 0x30480357, - 0x3048836f, - 0x30490381, - 0x30498395, - 0x304a03b4, - 0x304a83c7, - 0x304b03d2, - 0x304b83e1, - 0x304c03f2, - 0x304c83fe, - 0x304d0414, - 0x304d8422, - 0x304e0438, - 0x304e844a, - 0x304f045c, - 0x304f846f, - 0x30500482, - 0x30508493, - 0x305104a3, - 0x305184bb, - 0x305204d0, - 0x305284e8, - 0x305304fc, - 0x30538514, - 0x3054052d, - 0x30548546, - 0x30550563, - 0x3055856e, - 0x30560586, - 0x30568596, - 0x305705a7, - 0x305785ba, - 0x305805d0, - 0x305885d9, - 0x305905ee, - 0x30598601, - 0x305a0610, - 0x305a8630, - 0x305b063f, - 0x305b864b, - 0x305c066b, - 0x305c8687, - 0x305d0698, - 0x305d86a2, - 0x34320ac9, - 0x34328add, - 0x34330afa, - 0x34338b0d, - 0x34340b1c, - 0x34348b39, + 0x303680ac, + 0x303700b9, + 0x303780c8, + 0x303800ea, + 0x303880f7, + 0x3039010a, + 0x30398125, + 0x303a013a, + 0x303a814e, + 0x303b0162, + 0x303b8173, + 0x303c018c, + 0x303c81a9, + 0x303d01b7, + 0x303d81cb, + 0x303e01db, + 0x303e81f4, + 0x303f0204, + 0x303f8217, + 0x30400226, + 0x30408232, + 0x30410247, + 0x30418257, + 0x3042026e, + 0x3042827b, + 0x3043028e, + 0x3043829d, + 0x304402b2, + 0x304482d3, + 0x304502e6, + 0x304582f9, + 0x30460312, + 0x3046832d, + 0x3047034a, + 0x30478363, + 0x30480371, + 0x30488382, + 0x30490391, + 0x304983a9, + 0x304a03bb, + 0x304a83cf, + 0x304b03ee, + 0x304b8401, + 0x304c040c, + 0x304c841d, + 0x304d0429, + 0x304d843f, + 0x304e044d, + 0x304e8463, + 0x304f0475, + 0x304f8487, + 0x3050049a, + 0x305084ad, + 0x305104be, + 0x305184ce, + 0x305204e6, + 0x305284fb, + 0x30530513, + 0x30538527, + 0x3054053f, + 0x30548558, + 0x30550571, + 0x3055858e, + 0x30560599, + 0x305685b1, + 0x305705c1, + 0x305785d2, + 0x305805e5, + 0x305885fb, + 0x30590604, + 0x30598619, + 0x305a062c, + 0x305a863b, + 0x305b065b, + 0x305b866a, + 0x305c068b, + 0x305c86a7, + 0x305d06b3, + 0x305d86d3, + 0x305e06ef, + 0x305e8700, + 0x305f0716, + 0x305f8720, + 0x34320b47, + 0x34328b5b, + 0x34330b78, + 0x34338b8b, + 0x34340b9a, + 0x34348bb7, 0x3c320083, - 0x3c328bd8, - 0x3c330bf1, - 0x3c338c0c, - 0x3c340c29, - 0x3c348c44, - 0x3c350c5f, - 0x3c358c74, - 0x3c360c8d, - 0x3c368ca5, - 0x3c370cb6, - 0x3c378cc4, - 0x3c380cd1, - 0x3c388ce5, - 0x3c390b9b, - 0x3c398cf9, - 0x3c3a0d0d, - 0x3c3a8881, - 0x3c3b0d1d, - 0x3c3b8d38, - 0x3c3c0d4a, - 0x3c3c8d60, - 0x3c3d0d6a, - 0x3c3d8d7e, - 0x3c3e0d8c, - 0x3c3e8db1, - 0x3c3f0bc4, - 0x3c3f8d9a, - 0x403217d3, - 0x403297e9, - 0x40331817, - 0x40339821, - 0x40341838, - 0x40349856, - 0x40351866, - 0x40359878, - 0x40361885, - 0x40369891, - 0x403718a6, - 0x403798bb, - 0x403818cd, - 0x403898d8, - 0x403918ea, - 0x40398de1, - 0x403a18fa, - 0x403a990d, - 0x403b192e, - 0x403b993f, - 0x403c194f, - 0x403c8064, - 0x403d195b, - 0x403d9977, - 0x403e198d, - 0x403e999c, - 0x403f19af, - 0x403f99c9, - 0x404019d7, - 0x404099ec, - 0x40411a00, - 0x40419a1d, - 0x40421a36, - 0x40429a51, - 0x40431a6a, - 0x40439a7d, - 0x40441a91, - 0x40449aa9, - 0x40451ab9, - 0x40459ac7, - 0x40461ae5, - 0x40468094, - 0x40471afa, - 0x40479b0c, - 0x40481b30, - 0x40489b50, - 0x40491b64, - 0x40499b79, - 0x404a1b92, - 0x404a9bcc, - 0x404b1be6, - 0x404b9c04, - 0x404c1c1f, - 0x404c9c39, - 0x404d1c50, - 0x404d9c78, - 0x404e1c8f, - 0x404e9cab, - 0x404f1cc7, - 0x404f9ce8, - 0x40501d0a, - 0x40509d26, - 0x40511d3a, - 0x40519d47, - 0x40521d5e, - 0x40529d6e, - 0x40531d7e, - 0x40539d92, - 0x40541dad, - 0x40549dbd, - 0x40551dd4, - 0x40559de3, - 0x40561dfe, - 0x40569e16, - 0x40571e32, - 0x40579e4b, - 0x40581e5e, - 0x40589e73, - 0x40591e96, - 0x40599ea4, - 0x405a1eb1, - 0x405a9eca, - 0x405b1ee2, - 0x405b9ef5, - 
0x405c1f0a, - 0x405c9f1c, - 0x405d1f31, - 0x405d9f41, - 0x405e1f5a, - 0x405e9f6e, - 0x405f1f7e, - 0x405f9f96, - 0x40601fa7, - 0x40609fba, - 0x40611fcb, - 0x40619fe9, - 0x40621ffa, - 0x4062a007, - 0x4063201e, - 0x4063a05f, - 0x40642076, - 0x4064a083, - 0x40652091, - 0x4065a0b3, - 0x406620db, - 0x4066a0f0, - 0x40672107, - 0x4067a118, - 0x40682129, - 0x4068a13a, - 0x4069214f, - 0x4069a166, - 0x406a2177, - 0x406aa190, - 0x406b21ab, - 0x406ba1c2, - 0x406c222f, - 0x406ca250, - 0x406d2263, - 0x406da284, - 0x406e229f, - 0x406ea2e8, - 0x406f2309, - 0x406fa32f, - 0x4070234f, - 0x4070a36b, - 0x407124f8, - 0x4071a51b, - 0x40722531, - 0x4072a550, - 0x40732568, - 0x4073a588, - 0x407427b2, - 0x4074a7d7, - 0x407527f2, - 0x4075a811, - 0x40762840, - 0x4076a868, - 0x40772899, - 0x4077a8b8, - 0x407828f2, - 0x4078a909, - 0x4079291c, - 0x4079a939, - 0x407a0782, - 0x407aa94b, - 0x407b295e, - 0x407ba977, - 0x407c298f, - 0x407c90bd, - 0x407d29a3, - 0x407da9bd, - 0x407e29ce, - 0x407ea9e2, - 0x407f29f0, - 0x407faa0b, - 0x40801286, - 0x4080aa30, - 0x40812a52, - 0x4081aa6d, - 0x40822a82, - 0x4082aa9a, - 0x40832ab2, - 0x4083aac9, - 0x40842adf, - 0x4084aaeb, - 0x40852afe, - 0x4085ab13, - 0x40862b25, - 0x4086ab3a, - 0x40872b43, - 0x40879c66, - 0x40880083, - 0x4088a03e, - 0x40890a17, - 0x4089a1da, - 0x408a1bb5, - 0x408aa204, - 0x408b2881, - 0x408ba8dd, - 0x408c22ba, - 0x41f42423, - 0x41f924b5, - 0x41fe23a8, - 0x41fea5d9, - 0x41ff26ca, - 0x4203243c, - 0x4208245e, - 0x4208a49a, - 0x4209238c, - 0x4209a4d4, - 0x420a23e3, - 0x420aa3c3, - 0x420b2403, - 0x420ba47c, - 0x420c26e6, - 0x420ca5a6, - 0x420d25c0, - 0x420da5f7, - 0x42122611, - 0x421726ad, - 0x4217a653, - 0x421c2675, - 0x421f2630, - 0x422126fd, - 0x42262690, - 0x422b2796, - 0x422ba75f, - 0x422c277e, - 0x422ca739, - 0x422d2718, - 0x443206ad, - 0x443286bc, - 0x443306c8, - 0x443386d6, - 0x443406e9, - 0x443486fa, - 0x44350701, - 0x4435870b, - 0x4436071e, - 0x44368734, - 0x44370746, - 0x44378753, - 0x44380762, - 0x4438876a, - 0x44390782, - 0x44398790, - 0x443a07a3, - 0x4c3212b0, - 0x4c3292c0, - 0x4c3312d3, - 0x4c3392f3, - 0x4c340094, - 0x4c3480b0, - 0x4c3512ff, - 0x4c35930d, - 0x4c361329, - 0x4c36933c, - 0x4c37134b, - 0x4c379359, - 0x4c38136e, - 0x4c38937a, - 0x4c39139a, - 0x4c3993c4, - 0x4c3a13dd, - 0x4c3a93f6, - 0x4c3b05d0, - 0x4c3b940f, - 0x4c3c1421, - 0x4c3c9430, - 0x4c3d10bd, - 0x4c3d9449, - 0x4c3e1456, - 0x50322dd9, - 0x5032ade8, - 0x50332df3, - 0x5033ae03, - 0x50342e1c, - 0x5034ae36, - 0x50352e44, - 0x5035ae5a, - 0x50362e6c, - 0x5036ae82, - 0x50372e9b, - 0x5037aeae, - 0x50382ec6, - 0x5038aed7, - 0x50392eec, - 0x5039af00, - 0x503a2f20, - 0x503aaf36, - 0x503b2f4e, - 0x503baf60, - 0x503c2f7c, - 0x503caf93, - 0x503d2fac, - 0x503dafc2, - 0x503e2fcf, - 0x503eafe5, - 0x503f2ff7, - 0x503f8348, - 0x5040300a, - 0x5040b01a, - 0x50413034, - 0x5041b043, - 0x5042305d, - 0x5042b07a, - 0x5043308a, - 0x5043b09a, - 0x504430a9, - 0x50448414, - 0x504530bd, - 0x5045b0db, - 0x504630ee, - 0x5046b104, - 0x50473116, - 0x5047b12b, - 0x50483151, - 0x5048b15f, - 0x50493172, - 0x5049b187, - 0x504a319d, - 0x504ab1ad, - 0x504b31cd, - 0x504bb1e0, - 0x504c3203, - 0x504cb231, - 0x504d3243, - 0x504db260, - 0x504e327b, - 0x504eb297, - 0x504f32a9, - 0x504fb2c0, - 0x505032cf, - 0x50508687, - 0x505132e2, - 0x58320e1f, - 0x68320de1, - 0x68328b9b, - 0x68330bae, - 0x68338def, - 0x68340dff, - 0x683480b0, - 0x6c320dbd, - 0x6c328b7e, - 0x6c330dc8, - 0x7432098d, - 0x783208f2, - 0x78328907, - 0x78330913, + 0x3c328c62, + 0x3c330c7b, + 0x3c338c96, + 0x3c340cb3, + 0x3c348cdd, + 0x3c350cf8, + 0x3c358d0d, + 0x3c360d26, 
+ 0x3c368d3e, + 0x3c370d4f, + 0x3c378d5d, + 0x3c380d6a, + 0x3c388d7e, + 0x3c390c25, + 0x3c398d92, + 0x3c3a0da6, + 0x3c3a88ff, + 0x3c3b0db6, + 0x3c3b8dd1, + 0x3c3c0de3, + 0x3c3c8df9, + 0x3c3d0e03, + 0x3c3d8e17, + 0x3c3e0e25, + 0x3c3e8e4a, + 0x3c3f0c4e, + 0x3c3f8e33, + 0x3c4000ac, + 0x3c4080ea, + 0x3c410cce, + 0x403216e9, + 0x403296ff, + 0x4033172d, + 0x40339737, + 0x4034174e, + 0x4034976c, + 0x4035177c, + 0x4035978e, + 0x4036179b, + 0x403697a7, + 0x403717bc, + 0x403797ce, + 0x403817d9, + 0x403897eb, + 0x40390e7a, + 0x403997fb, + 0x403a180e, + 0x403a982f, + 0x403b1840, + 0x403b9850, + 0x403c0064, + 0x403c8083, + 0x403d185c, + 0x403d9872, + 0x403e1881, + 0x403e9894, + 0x403f18ae, + 0x403f98bc, + 0x404018d1, + 0x404098e5, + 0x40411902, + 0x4041991d, + 0x40421936, + 0x40429949, + 0x4043195d, + 0x40439975, + 0x4044198c, + 0x404480ac, + 0x404519a1, + 0x404599b3, + 0x404619d7, + 0x404699f7, + 0x40471a05, + 0x40479a19, + 0x40481a2e, + 0x40489a47, + 0x40491a5e, + 0x40499a78, + 0x404a1a8f, + 0x404a9aad, + 0x404b1ac5, + 0x404b9adc, + 0x404c1af2, + 0x404c9b04, + 0x404d1b25, + 0x404d9b47, + 0x404e1b5b, + 0x404e9b68, + 0x404f1b7f, + 0x404f9b8f, + 0x40501b9f, + 0x40509bb3, + 0x40511bce, + 0x40519bde, + 0x40521bf5, + 0x40529c07, + 0x40531c1f, + 0x40539c32, + 0x40541c47, + 0x40549c6a, + 0x40551c78, + 0x40559c95, + 0x40561ca2, + 0x40569cbb, + 0x40571cd3, + 0x40579ce6, + 0x40581cfb, + 0x40589d0d, + 0x40591d1d, + 0x40599d36, + 0x405a1d4a, + 0x405a9d5a, + 0x405b1d72, + 0x405b9d83, + 0x405c1d96, + 0x405c9da7, + 0x405d1db4, + 0x405d9dcb, + 0x405e1deb, + 0x405e8a95, + 0x405f1e0c, + 0x405f9e19, + 0x40601e27, + 0x40609e49, + 0x40611e71, + 0x40619e86, + 0x40621e9d, + 0x40629eae, + 0x40631ebf, + 0x40639ed4, + 0x40641eeb, + 0x40649efc, + 0x40651f17, + 0x40659f2e, + 0x40661f46, + 0x40669f70, + 0x40671f9b, + 0x40679fbc, + 0x40681fcf, + 0x40689ff0, + 0x40692022, + 0x4069a050, + 0x406a2071, + 0x406aa091, + 0x406b2219, + 0x406ba23c, + 0x406c2252, + 0x406ca47e, + 0x406d24ad, + 0x406da4d5, + 0x406e24ee, + 0x406ea506, + 0x406f2525, + 0x406fa53a, + 0x4070254d, + 0x4070a56a, + 0x40710800, + 0x4071a57c, + 0x4072258f, + 0x4072a5a8, + 0x407325c0, + 0x4073935c, + 0x407425d4, + 0x4074a5ee, + 0x407525ff, + 0x4075a613, + 0x40762621, + 0x40769199, + 0x40772646, + 0x4077a668, + 0x40782683, + 0x4078a698, + 0x407926af, + 0x4079a6c5, + 0x407a26d1, + 0x407aa6e4, + 0x407b26f9, + 0x407ba70b, + 0x407c2720, + 0x407ca729, + 0x407d200b, + 0x41f42144, + 0x41f921d6, + 0x41fe20c9, + 0x41fea2a5, + 0x41ff2396, + 0x4203215d, + 0x4208217f, + 0x4208a1bb, + 0x420920ad, + 0x4209a1f5, + 0x420a2104, + 0x420aa0e4, + 0x420b2124, + 0x420ba19d, + 0x420c23b2, + 0x420ca272, + 0x420d228c, + 0x420da2c3, + 0x421222dd, + 0x42172379, + 0x4217a31f, + 0x421c2341, + 0x421f22fc, + 0x422123c9, + 0x4226235c, + 0x422b2462, + 0x422ba42b, + 0x422c244a, + 0x422ca405, + 0x422d23e4, + 0x4432072b, + 0x4432873a, + 0x44330746, + 0x44338754, + 0x44340767, + 0x44348778, + 0x4435077f, + 0x44358789, + 0x4436079c, + 0x443687b2, + 0x443707c4, + 0x443787d1, + 0x443807e0, + 0x443887e8, + 0x44390800, + 0x4439880e, + 0x443a0821, + 0x4c3211c3, + 0x4c3291d3, + 0x4c3311e6, + 0x4c339206, + 0x4c3400ac, + 0x4c3480ea, + 0x4c351212, + 0x4c359220, + 0x4c36123c, + 0x4c36924f, + 0x4c37125e, + 0x4c37926c, + 0x4c381281, + 0x4c38928d, + 0x4c3912ad, + 0x4c3992d7, + 0x4c3a12f0, + 0x4c3a9309, + 0x4c3b05fb, + 0x4c3b9322, + 0x4c3c1334, + 0x4c3c9343, + 0x4c3d135c, + 0x4c3d936b, + 0x4c3e1378, + 0x503229a2, + 0x5032a9b1, + 0x503329bc, + 0x5033a9cc, + 0x503429e5, + 0x5034a9ff, + 0x50352a0d, + 0x5035aa23, + 
0x50362a35, + 0x5036aa4b, + 0x50372a64, + 0x5037aa77, + 0x50382a8f, + 0x5038aaa0, + 0x50392ab5, + 0x5039aac9, + 0x503a2ae9, + 0x503aaaff, + 0x503b2b17, + 0x503bab29, + 0x503c2b45, + 0x503cab5c, + 0x503d2b75, + 0x503dab8b, + 0x503e2b98, + 0x503eabae, + 0x503f2bc0, + 0x503f8382, + 0x50402bd3, + 0x5040abe3, + 0x50412bfd, + 0x5041ac0c, + 0x50422c26, + 0x5042ac43, + 0x50432c53, + 0x5043ac63, + 0x50442c72, + 0x5044843f, + 0x50452c86, + 0x5045aca4, + 0x50462cb7, + 0x5046accd, + 0x50472cdf, + 0x5047acf4, + 0x50482d1a, + 0x5048ad28, + 0x50492d3b, + 0x5049ad50, + 0x504a2d66, + 0x504aad76, + 0x504b2d96, + 0x504bada9, + 0x504c2dcc, + 0x504cadfa, + 0x504d2e0c, + 0x504dae29, + 0x504e2e44, + 0x504eae60, + 0x504f2e72, + 0x504fae89, + 0x50502e98, + 0x505086ef, + 0x50512eab, + 0x58320eb8, + 0x68320e7a, + 0x68328c25, + 0x68330c38, + 0x68338e88, + 0x68340e98, + 0x683480ea, + 0x6c320e56, + 0x6c328bfc, + 0x6c330e61, + 0x74320a0b, + 0x78320970, + 0x78328985, + 0x78330991, 0x78338083, - 0x78340922, - 0x78348937, - 0x78350956, - 0x78358978, - 0x7836098d, - 0x783689a3, - 0x783709b3, - 0x783789c6, - 0x783809d9, - 0x783889eb, - 0x783909f8, - 0x78398a17, - 0x783a0a2c, - 0x783a8a3a, - 0x783b0a44, - 0x783b8a58, - 0x783c0a6f, - 0x783c8a84, - 0x783d0a9b, - 0x783d8ab0, - 0x783e0a06, - 0x7c3211b2, + 0x783409a0, + 0x783489b5, + 0x783509d4, + 0x783589f6, + 0x78360a0b, + 0x78368a21, + 0x78370a31, + 0x78378a44, + 0x78380a57, + 0x78388a69, + 0x78390a76, + 0x78398a95, + 0x783a0aaa, + 0x783a8ab8, + 0x783b0ac2, + 0x783b8ad6, + 0x783c0aed, + 0x783c8b02, + 0x783d0b19, + 0x783d8b2e, + 0x783e0a84, + 0x7c3210c5, }; const size_t kOpenSSLReasonValuesLen = sizeof(kOpenSSLReasonValues) / sizeof(kOpenSSLReasonValues[0]); @@ -714,8 +679,10 @@ const char kOpenSSLReasonStringData[] = "BN_LIB\0" "BOOLEAN_IS_WRONG_LENGTH\0" "BUFFER_TOO_SMALL\0" + "CONTEXT_NOT_INITIALISED\0" "DECODE_ERROR\0" "DEPTH_EXCEEDED\0" + "DIGEST_AND_KEY_TYPE_NOT_SUPPORTED\0" "ENCODE_ERROR\0" "ERROR_GETTING_TIME\0" "EXPECTING_AN_ASN1_SEQUENCE\0" @@ -756,7 +723,6 @@ const char kOpenSSLReasonStringData[] = "INVALID_UNIVERSALSTRING_LENGTH\0" "INVALID_UTF8STRING\0" "LIST_ERROR\0" - "MALLOC_FAILURE\0" "MISSING_ASN1_EOS\0" "MISSING_EOC\0" "MISSING_SECOND_NUMBER\0" @@ -788,10 +754,13 @@ const char kOpenSSLReasonStringData[] = "UNEXPECTED_EOC\0" "UNIVERSALSTRING_IS_WRONG_LENGTH\0" "UNKNOWN_FORMAT\0" + "UNKNOWN_MESSAGE_DIGEST_ALGORITHM\0" + "UNKNOWN_SIGNATURE_ALGORITHM\0" "UNKNOWN_TAG\0" "UNSUPPORTED_ANY_DEFINED_BY_TYPE\0" "UNSUPPORTED_PUBLIC_KEY_TYPE\0" "UNSUPPORTED_TYPE\0" + "WRONG_PUBLIC_KEY_TYPE\0" "WRONG_TAG\0" "WRONG_TYPE\0" "BAD_FOPEN_MODE\0" @@ -864,6 +833,7 @@ const char kOpenSSLReasonStringData[] = "MODULUS_TOO_LARGE\0" "NO_PRIVATE_VALUE\0" "BAD_Q_VALUE\0" + "BAD_VERSION\0" "MISSING_PARAMETERS\0" "NEED_NEW_SETUP_VALUES\0" "BIGNUM_OUT_OF_RANGE\0" @@ -871,6 +841,7 @@ const char kOpenSSLReasonStringData[] = "D2I_ECPKPARAMETERS_FAILURE\0" "EC_GROUP_NEW_BY_NAME_FAILURE\0" "GROUP2PKPARAMETERS_FAILURE\0" + "GROUP_MISMATCH\0" "I2D_ECPKPARAMETERS_FAILURE\0" "INCOMPATIBLE_OBJECTS\0" "INVALID_COMPRESSED_POINT\0" @@ -897,27 +868,19 @@ const char kOpenSSLReasonStringData[] = "NOT_IMPLEMENTED\0" "RANDOM_NUMBER_GENERATION_FAILED\0" "OPERATION_NOT_SUPPORTED\0" - "BN_DECODE_ERROR\0" "COMMAND_NOT_SUPPORTED\0" - "CONTEXT_NOT_INITIALISED\0" "DIFFERENT_KEY_TYPES\0" "DIFFERENT_PARAMETERS\0" - "DIGEST_AND_KEY_TYPE_NOT_SUPPORTED\0" "EXPECTING_AN_EC_KEY_KEY\0" "EXPECTING_AN_RSA_KEY\0" - "EXPECTING_A_DH_KEY\0" "EXPECTING_A_DSA_KEY\0" "ILLEGAL_OR_UNSUPPORTED_PADDING_MODE\0" - "INVALID_CURVE\0" 
"INVALID_DIGEST_LENGTH\0" "INVALID_DIGEST_TYPE\0" "INVALID_KEYBITS\0" "INVALID_MGF1_MD\0" "INVALID_PADDING_MODE\0" - "INVALID_PSS_PARAMETERS\0" "INVALID_PSS_SALTLEN\0" - "INVALID_SALT_LENGTH\0" - "INVALID_TRAILER\0" "KEYS_NOT_SET\0" "NO_DEFAULT_DIGEST\0" "NO_KEY_SET\0" @@ -927,17 +890,8 @@ const char kOpenSSLReasonStringData[] = "NO_PARAMETERS_SET\0" "OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE\0" "OPERATON_NOT_INITIALIZED\0" - "PARAMETER_ENCODING_ERROR\0" - "UNKNOWN_DIGEST\0" - "UNKNOWN_MASK_DIGEST\0" - "UNKNOWN_MESSAGE_DIGEST_ALGORITHM\0" "UNKNOWN_PUBLIC_KEY_TYPE\0" - "UNKNOWN_SIGNATURE_ALGORITHM\0" "UNSUPPORTED_ALGORITHM\0" - "UNSUPPORTED_MASK_ALGORITHM\0" - "UNSUPPORTED_MASK_PARAMETER\0" - "UNSUPPORTED_SIGNATURE_TYPE\0" - "WRONG_PUBLIC_KEY_TYPE\0" "OUTPUT_TOO_LARGE\0" "UNKNOWN_NID\0" "BAD_BASE64_DECODE\0" @@ -973,13 +927,13 @@ const char kOpenSSLReasonStringData[] = "UNKNOWN_ALGORITHM\0" "UNKNOWN_CIPHER\0" "UNKNOWN_CIPHER_ALGORITHM\0" + "UNKNOWN_DIGEST\0" "UNKNOWN_HASH\0" "UNSUPPORTED_PRIVATE_KEY_ALGORITHM\0" "BAD_E_VALUE\0" "BAD_FIXED_HEADER_DECRYPT\0" "BAD_PAD_BYTE_COUNT\0" "BAD_RSA_PARAMETERS\0" - "BAD_VERSION\0" "BLOCK_TYPE_IS_NOT_01\0" "BN_NOT_INITIALIZED\0" "CANNOT_RECOVER_MULTI_PRIME_KEY\0" @@ -1024,7 +978,6 @@ const char kOpenSSLReasonStringData[] = "BAD_DIGEST_LENGTH\0" "BAD_ECC_CERT\0" "BAD_ECPOINT\0" - "BAD_HANDSHAKE_LENGTH\0" "BAD_HANDSHAKE_RECORD\0" "BAD_HELLO_REQUEST\0" "BAD_LENGTH\0" @@ -1035,7 +988,6 @@ const char kOpenSSLReasonStringData[] = "BAD_SSL_FILETYPE\0" "BAD_WRITE_RETRY\0" "BIO_NOT_SET\0" - "CANNOT_SERIALIZE_PUBLIC_KEY\0" "CA_DN_LENGTH_MISMATCH\0" "CA_DN_TOO_LONG\0" "CCS_RECEIVED_EARLY\0" @@ -1044,37 +996,30 @@ const char kOpenSSLReasonStringData[] = "CERT_LENGTH_MISMATCH\0" "CHANNEL_ID_NOT_P256\0" "CHANNEL_ID_SIGNATURE_INVALID\0" - "CIPHER_CODE_WRONG_LENGTH\0" "CIPHER_OR_HASH_UNAVAILABLE\0" "CLIENTHELLO_PARSE_FAILED\0" "CLIENTHELLO_TLSEXT\0" "CONNECTION_REJECTED\0" "CONNECTION_TYPE_NOT_SET\0" - "COOKIE_MISMATCH\0" - "D2I_ECDSA_SIG\0" - "DATA_BETWEEN_CCS_AND_FINISHED\0" + "CUSTOM_EXTENSION_ERROR\0" "DATA_LENGTH_TOO_LONG\0" "DECRYPTION_FAILED\0" "DECRYPTION_FAILED_OR_BAD_RECORD_MAC\0" "DH_PUBLIC_VALUE_LENGTH_IS_WRONG\0" + "DH_P_TOO_LONG\0" "DIGEST_CHECK_FAILED\0" "DTLS_MESSAGE_TOO_BIG\0" "ECC_CERT_NOT_FOR_SIGNING\0" - "EMPTY_SRTP_PROTECTION_PROFILE_LIST\0" "EMS_STATE_INCONSISTENT\0" "ENCRYPTED_LENGTH_TOO_LONG\0" + "ERROR_ADDING_EXTENSION\0" "ERROR_IN_RECEIVED_CIPHER_LIST\0" - "EVP_DIGESTSIGNFINAL_FAILED\0" - "EVP_DIGESTSIGNINIT_FAILED\0" + "ERROR_PARSING_EXTENSION\0" "EXCESSIVE_MESSAGE_SIZE\0" "EXTRA_DATA_IN_MESSAGE\0" "FRAGMENT_MISMATCH\0" - "GOT_A_FIN_BEFORE_A_CCS\0" - "GOT_CHANNEL_ID_BEFORE_A_CCS\0" - "GOT_NEXT_PROTO_BEFORE_A_CCS\0" "GOT_NEXT_PROTO_WITHOUT_EXTENSION\0" "HANDSHAKE_FAILURE_ON_CLIENT_HELLO\0" - "HANDSHAKE_RECORD_BEFORE_CCS\0" "HTTPS_PROXY_REQUEST\0" "HTTP_REQUEST\0" "INAPPROPRIATE_FALLBACK\0" @@ -1084,22 +1029,19 @@ const char kOpenSSLReasonStringData[] = "INVALID_TICKET_KEYS_LENGTH\0" "LENGTH_MISMATCH\0" "LIBRARY_HAS_NO_CIPHERS\0" - "MISSING_DH_KEY\0" - "MISSING_ECDSA_SIGNING_CERT\0" + "MISSING_EXTENSION\0" "MISSING_RSA_CERTIFICATE\0" - "MISSING_RSA_ENCRYPTING_CERT\0" - "MISSING_RSA_SIGNING_CERT\0" "MISSING_TMP_DH_KEY\0" "MISSING_TMP_ECDH_KEY\0" "MIXED_SPECIAL_OPERATOR_WITH_GROUPS\0" "MTU_TOO_SMALL\0" + "NEGOTIATED_BOTH_NPN_AND_ALPN\0" "NESTED_GROUP\0" "NO_CERTIFICATES_RETURNED\0" "NO_CERTIFICATE_ASSIGNED\0" "NO_CERTIFICATE_SET\0" "NO_CIPHERS_AVAILABLE\0" "NO_CIPHERS_PASSED\0" - "NO_CIPHERS_SPECIFIED\0" "NO_CIPHER_MATCH\0" 
"NO_COMPRESSION_SPECIFIED\0" "NO_METHOD_SPECIFIED\0" @@ -1108,13 +1050,10 @@ const char kOpenSSLReasonStringData[] = "NO_RENEGOTIATION\0" "NO_REQUIRED_DIGEST\0" "NO_SHARED_CIPHER\0" - "NO_SHARED_SIGATURE_ALGORITHMS\0" - "NO_SRTP_PROFILES\0" "NULL_SSL_CTX\0" "NULL_SSL_METHOD_PASSED\0" "OLD_SESSION_CIPHER_NOT_RETURNED\0" "OLD_SESSION_VERSION_NOT_RETURNED\0" - "PACKET_LENGTH_TOO_LONG\0" "PARSE_TLSEXT\0" "PATH_TOO_LONG\0" "PEER_DID_NOT_RETURN_A_CERTIFICATE\0" @@ -1123,11 +1062,9 @@ const char kOpenSSLReasonStringData[] = "PSK_IDENTITY_NOT_FOUND\0" "PSK_NO_CLIENT_CB\0" "PSK_NO_SERVER_CB\0" - "READ_BIO_NOT_SET\0" "READ_TIMEOUT_EXPIRED\0" "RECORD_LENGTH_MISMATCH\0" "RECORD_TOO_LARGE\0" - "RENEGOTIATE_EXT_TOO_LONG\0" "RENEGOTIATION_ENCODING_ERR\0" "RENEGOTIATION_MISMATCH\0" "REQUIRED_CIPHER_MISSING\0" @@ -1137,13 +1074,11 @@ const char kOpenSSLReasonStringData[] = "SERVERHELLO_TLSEXT\0" "SESSION_ID_CONTEXT_UNINITIALIZED\0" "SESSION_MAY_NOT_BE_CREATED\0" - "SIGNATURE_ALGORITHMS_ERROR\0" + "SHUTDOWN_WHILE_IN_INIT\0" "SIGNATURE_ALGORITHMS_EXTENSION_SENT_BY_SERVER\0" "SRTP_COULD_NOT_ALLOCATE_PROFILES\0" - "SRTP_PROTECTION_PROFILE_LIST_TOO_LONG\0" "SRTP_UNKNOWN_PROTECTION_PROFILE\0" "SSL3_EXT_INVALID_SERVERNAME\0" - "SSL3_EXT_INVALID_SERVERNAME_TYPE\0" "SSLV3_ALERT_BAD_CERTIFICATE\0" "SSLV3_ALERT_BAD_RECORD_MAC\0" "SSLV3_ALERT_CERTIFICATE_EXPIRED\0" @@ -1158,10 +1093,7 @@ const char kOpenSSLReasonStringData[] = "SSLV3_ALERT_UNSUPPORTED_CERTIFICATE\0" "SSL_CTX_HAS_NO_DEFAULT_SSL_VERSION\0" "SSL_HANDSHAKE_FAILURE\0" - "SSL_SESSION_ID_CALLBACK_FAILED\0" - "SSL_SESSION_ID_CONFLICT\0" "SSL_SESSION_ID_CONTEXT_TOO_LONG\0" - "SSL_SESSION_ID_HAS_BAD_LENGTH\0" "TLSV1_ALERT_ACCESS_DENIED\0" "TLSV1_ALERT_DECODE_ERROR\0" "TLSV1_ALERT_DECRYPTION_FAILED\0" @@ -1180,17 +1112,12 @@ const char kOpenSSLReasonStringData[] = "TLSV1_CERTIFICATE_UNOBTAINABLE\0" "TLSV1_UNRECOGNIZED_NAME\0" "TLSV1_UNSUPPORTED_EXTENSION\0" - "TLS_CLIENT_CERT_REQ_WITH_ANON_CIPHER\0" - "TLS_ILLEGAL_EXPORTER_LABEL\0" - "TLS_INVALID_ECPOINTFORMAT_LIST\0" "TLS_PEER_DID_NOT_RESPOND_WITH_CERTIFICATE_LIST\0" "TLS_RSA_ENCRYPTED_VALUE_LENGTH_IS_WRONG\0" "TOO_MANY_EMPTY_FRAGMENTS\0" "TOO_MANY_WARNING_ALERTS\0" "UNABLE_TO_FIND_ECDH_PARAMETERS\0" - "UNABLE_TO_FIND_PUBLIC_KEY_PARAMETERS\0" "UNEXPECTED_EXTENSION\0" - "UNEXPECTED_GROUP_CLOSE\0" "UNEXPECTED_MESSAGE\0" "UNEXPECTED_OPERATOR_IN_GROUP\0" "UNEXPECTED_RECORD\0" @@ -1202,13 +1129,10 @@ const char kOpenSSLReasonStringData[] = "UNKNOWN_PROTOCOL\0" "UNKNOWN_SSL_VERSION\0" "UNKNOWN_STATE\0" - "UNPROCESSED_HANDSHAKE_DATA\0" "UNSAFE_LEGACY_RENEGOTIATION_DISABLED\0" "UNSUPPORTED_COMPRESSION_ALGORITHM\0" "UNSUPPORTED_ELLIPTIC_CURVE\0" "UNSUPPORTED_PROTOCOL\0" - "UNSUPPORTED_SSL_VERSION\0" - "USE_SRTP_NOT_NEGOTIATED\0" "WRONG_CERTIFICATE_TYPE\0" "WRONG_CIPHER_RETURNED\0" "WRONG_CURVE\0" @@ -1229,12 +1153,14 @@ const char kOpenSSLReasonStringData[] = "IDP_MISMATCH\0" "INVALID_DIRECTORY\0" "INVALID_FIELD_NAME\0" + "INVALID_PSS_PARAMETERS\0" "INVALID_TRUST\0" "ISSUER_MISMATCH\0" "KEY_TYPE_MISMATCH\0" "KEY_VALUES_MISMATCH\0" "LOADING_CERT_DIR\0" "LOADING_DEFAULTS\0" + "NAME_TOO_LONG\0" "NEWER_CRL_NOT_NEWER\0" "NOT_PKCS7_SIGNED_DATA\0" "NO_CERTIFICATES_INCLUDED\0" @@ -1244,8 +1170,6 @@ const char kOpenSSLReasonStringData[] = "PUBLIC_KEY_DECODE_ERROR\0" "PUBLIC_KEY_ENCODE_ERROR\0" "SHOULD_RETRY\0" - "UNABLE_TO_FIND_PARAMETERS_IN_CHAIN\0" - "UNABLE_TO_GET_CERTS_PUBLIC_KEY\0" "UNKNOWN_KEY_TYPE\0" "UNKNOWN_PURPOSE_ID\0" "UNKNOWN_TRUST_ID\0" diff --git 
a/packager/third_party/boringssl/linux-aarch64/crypto/aes/aesv8-armx64.S b/packager/third_party/boringssl/linux-aarch64/crypto/aes/aesv8-armx64.S
index c414476cdf..3e8cb16e01 100644
--- a/packager/third_party/boringssl/linux-aarch64/crypto/aes/aesv8-armx64.S
+++ b/packager/third_party/boringssl/linux-aarch64/crypto/aes/aesv8-armx64.S
@@ -1,5 +1,5 @@
 #if defined(__aarch64__)
-#include "arm_arch.h"
+#include <openssl/arm_arch.h>
 
 #if __ARM_MAX_ARCH__>=7
 .text
@@ -13,6 +13,7 @@
 .long	0x1b,0x1b,0x1b,0x1b
 
 .globl	aes_v8_set_encrypt_key
+.hidden	aes_v8_set_encrypt_key
 .type	aes_v8_set_encrypt_key,%function
 .align	5
 aes_v8_set_encrypt_key:
@@ -180,6 +181,7 @@ aes_v8_set_encrypt_key:
 .size	aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key
 
 .globl	aes_v8_set_decrypt_key
+.hidden	aes_v8_set_decrypt_key
 .type	aes_v8_set_decrypt_key,%function
 .align	5
 aes_v8_set_decrypt_key:
@@ -219,6 +221,7 @@ aes_v8_set_decrypt_key:
 	ret
 .size	aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
 .globl	aes_v8_encrypt
+.hidden	aes_v8_encrypt
 .type	aes_v8_encrypt,%function
 .align	5
 aes_v8_encrypt:
@@ -248,6 +251,7 @@ aes_v8_encrypt:
 	ret
 .size	aes_v8_encrypt,.-aes_v8_encrypt
 .globl	aes_v8_decrypt
+.hidden	aes_v8_decrypt
 .type	aes_v8_decrypt,%function
 .align	5
 aes_v8_decrypt:
@@ -277,6 +281,7 @@ aes_v8_decrypt:
 	ret
 .size	aes_v8_decrypt,.-aes_v8_decrypt
 .globl	aes_v8_cbc_encrypt
+.hidden	aes_v8_cbc_encrypt
 .type	aes_v8_cbc_encrypt,%function
 .align	5
 aes_v8_cbc_encrypt:
@@ -567,6 +572,7 @@ aes_v8_cbc_encrypt:
 	ret
 .size	aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
 .globl	aes_v8_ctr32_encrypt_blocks
+.hidden	aes_v8_ctr32_encrypt_blocks
 .type	aes_v8_ctr32_encrypt_blocks,%function
 .align	5
 aes_v8_ctr32_encrypt_blocks:
@@ -748,4 +754,4 @@ aes_v8_ctr32_encrypt_blocks:
 	ret
 .size	aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
 #endif
-#endif
\ No newline at end of file
+#endif
diff --git a/packager/third_party/boringssl/linux-aarch64/crypto/bn/armv8-mont.S b/packager/third_party/boringssl/linux-aarch64/crypto/bn/armv8-mont.S
new file mode 100644
index 0000000000..74702db6aa
--- /dev/null
+++ b/packager/third_party/boringssl/linux-aarch64/crypto/bn/armv8-mont.S
@@ -0,0 +1,1407 @@
+#if defined(__aarch64__)
+.text
+
+.globl	bn_mul_mont
+.hidden	bn_mul_mont
+.type	bn_mul_mont,%function
+.align	5
+bn_mul_mont:
+	tst	x5,#7
+	b.eq	__bn_sqr8x_mont
+	tst	x5,#3
+	b.eq	__bn_mul4x_mont
+.Lmul_mont:
+	stp	x29,x30,[sp,#-64]!
+	add	x29,sp,#0
+	stp	x19,x20,[sp,#16]
+	stp	x21,x22,[sp,#32]
+	stp	x23,x24,[sp,#48]
+
+	ldr	x9,[x2],#8		// bp[0]
+	sub	x22,sp,x5,lsl#3
+	ldp	x7,x8,[x1],#16		// ap[0..1]
+	lsl	x5,x5,#3
+	ldr	x4,[x4]			// *n0
+	and	x22,x22,#-16		// ABI says so
+	ldp	x13,x14,[x3],#16	// np[0..1]
+
+	mul	x6,x7,x9		// ap[0]*bp[0]
+	sub	x21,x5,#16		// j=num-2
+	umulh	x7,x7,x9
+	mul	x10,x8,x9		// ap[1]*bp[0]
+	umulh	x11,x8,x9
+
+	mul	x15,x6,x4		// "tp[0]"*n0
+	mov	sp,x22			// alloca
+
+	// (*)	mul	x12,x13,x15	// np[0]*m1
+	umulh	x13,x13,x15
+	mul	x16,x14,x15		// np[1]*m1
+	// (*)	adds	x12,x12,x6	// discarded
+	// (*)	As for removal of first multiplication and addition
+	//	instructions. The outcome of first addition is
+	//	guaranteed to be zero, which leaves two computationally
+	//	significant outcomes: it either carries or not. Then
+	//	question is when does it carry? Is there alternative
+	//	way to deduce it? If you follow operations, you can
+	//	observe that condition for carry is quite simple:
+	//	x6 being non-zero. So that carry can be calculated
+	//	by adding -1 to x6. That's what next instruction does.
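+	//	Clarifying note (not from the upstream source): with
+	//	n0 = -np[0]^-1 mod 2^64 and m1 = tp[0]*n0, we have
+	//	lo(np[0]*m1) == -x6 mod 2^64, so x6 + lo(np[0]*m1) is
+	//	either 0 (x6 == 0, no carry) or 2^64 (x6 != 0, carry);
+	//	"subs xzr,x6,#1" sets the carry flag exactly when x6 != 0.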
+ subs xzr,x6,#1 // (*) + umulh x17,x14,x15 + adc x13,x13,xzr + cbz x21,.L1st_skip + +.L1st: + ldr x8,[x1],#8 + adds x6,x10,x7 + sub x21,x21,#8 // j-- + adc x7,x11,xzr + + ldr x14,[x3],#8 + adds x12,x16,x13 + mul x10,x8,x9 // ap[j]*bp[0] + adc x13,x17,xzr + umulh x11,x8,x9 + + adds x12,x12,x6 + mul x16,x14,x15 // np[j]*m1 + adc x13,x13,xzr + umulh x17,x14,x15 + str x12,[x22],#8 // tp[j-1] + cbnz x21,.L1st + +.L1st_skip: + adds x6,x10,x7 + sub x1,x1,x5 // rewind x1 + adc x7,x11,xzr + + adds x12,x16,x13 + sub x3,x3,x5 // rewind x3 + adc x13,x17,xzr + + adds x12,x12,x6 + sub x20,x5,#8 // i=num-1 + adcs x13,x13,x7 + + adc x19,xzr,xzr // upmost overflow bit + stp x12,x13,[x22] + +.Louter: + ldr x9,[x2],#8 // bp[i] + ldp x7,x8,[x1],#16 + ldr x23,[sp] // tp[0] + add x22,sp,#8 + + mul x6,x7,x9 // ap[0]*bp[i] + sub x21,x5,#16 // j=num-2 + umulh x7,x7,x9 + ldp x13,x14,[x3],#16 + mul x10,x8,x9 // ap[1]*bp[i] + adds x6,x6,x23 + umulh x11,x8,x9 + adc x7,x7,xzr + + mul x15,x6,x4 + sub x20,x20,#8 // i-- + + // (*) mul x12,x13,x15 // np[0]*m1 + umulh x13,x13,x15 + mul x16,x14,x15 // np[1]*m1 + // (*) adds x12,x12,x6 + subs xzr,x6,#1 // (*) + umulh x17,x14,x15 + cbz x21,.Linner_skip + +.Linner: + ldr x8,[x1],#8 + adc x13,x13,xzr + ldr x23,[x22],#8 // tp[j] + adds x6,x10,x7 + sub x21,x21,#8 // j-- + adc x7,x11,xzr + + adds x12,x16,x13 + ldr x14,[x3],#8 + adc x13,x17,xzr + + mul x10,x8,x9 // ap[j]*bp[i] + adds x6,x6,x23 + umulh x11,x8,x9 + adc x7,x7,xzr + + mul x16,x14,x15 // np[j]*m1 + adds x12,x12,x6 + umulh x17,x14,x15 + str x12,[x22,#-16] // tp[j-1] + cbnz x21,.Linner + +.Linner_skip: + ldr x23,[x22],#8 // tp[j] + adc x13,x13,xzr + adds x6,x10,x7 + sub x1,x1,x5 // rewind x1 + adc x7,x11,xzr + + adds x12,x16,x13 + sub x3,x3,x5 // rewind x3 + adcs x13,x17,x19 + adc x19,xzr,xzr + + adds x6,x6,x23 + adc x7,x7,xzr + + adds x12,x12,x6 + adcs x13,x13,x7 + adc x19,x19,xzr // upmost overflow bit + stp x12,x13,[x22,#-16] + + cbnz x20,.Louter + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + ldr x23,[sp] // tp[0] + add x22,sp,#8 + ldr x14,[x3],#8 // np[0] + subs x21,x5,#8 // j=num-1 and clear borrow + mov x1,x0 +.Lsub: + sbcs x8,x23,x14 // tp[j]-np[j] + ldr x23,[x22],#8 + sub x21,x21,#8 // j-- + ldr x14,[x3],#8 + str x8,[x1],#8 // rp[j]=tp[j]-np[j] + cbnz x21,.Lsub + + sbcs x8,x23,x14 + sbcs x19,x19,xzr // did it borrow? + str x8,[x1],#8 // rp[num-1] + + ldr x23,[sp] // tp[0] + add x22,sp,#8 + ldr x8,[x0],#8 // rp[0] + sub x5,x5,#8 // num-- + nop +.Lcond_copy: + sub x5,x5,#8 // num-- + csel x14,x23,x8,lo // did it borrow? + ldr x23,[x22],#8 + ldr x8,[x0],#8 + str xzr,[x22,#-16] // wipe tp + str x14,[x0,#-16] + cbnz x5,.Lcond_copy + + csel x14,x23,x8,lo + str xzr,[x22,#-8] // wipe tp + str x14,[x0,#-8] + + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + ret +.size bn_mul_mont,.-bn_mul_mont +.type __bn_sqr8x_mont,%function +.align 5 +__bn_sqr8x_mont: + cmp x1,x2 + b.ne __bn_mul4x_mont +.Lsqr8x_mont: + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x0,x3,[sp,#96] // offload rp and np + + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + ldp x10,x11,[x1,#8*4] + ldp x12,x13,[x1,#8*6] + + sub x2,sp,x5,lsl#4 + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + mov sp,x2 // alloca + sub x27,x5,#8*8 + b .Lsqr8x_zero_start + +.Lsqr8x_zero: + sub x27,x27,#8*8 + stp xzr,xzr,[x2,#8*0] + stp xzr,xzr,[x2,#8*2] + stp xzr,xzr,[x2,#8*4] + stp xzr,xzr,[x2,#8*6] +.Lsqr8x_zero_start: + stp xzr,xzr,[x2,#8*8] + stp xzr,xzr,[x2,#8*10] + stp xzr,xzr,[x2,#8*12] + stp xzr,xzr,[x2,#8*14] + add x2,x2,#8*16 + cbnz x27,.Lsqr8x_zero + + add x3,x1,x5 + add x1,x1,#8*8 + mov x19,xzr + mov x20,xzr + mov x21,xzr + mov x22,xzr + mov x23,xzr + mov x24,xzr + mov x25,xzr + mov x26,xzr + mov x2,sp + str x4,[x29,#112] // offload n0 + + // Multiply everything but a[i]*a[i] +.align 4 +.Lsqr8x_outer_loop: + // a[1]a[0] (i) + // a[2]a[0] + // a[3]a[0] + // a[4]a[0] + // a[5]a[0] + // a[6]a[0] + // a[7]a[0] + // a[2]a[1] (ii) + // a[3]a[1] + // a[4]a[1] + // a[5]a[1] + // a[6]a[1] + // a[7]a[1] + // a[3]a[2] (iii) + // a[4]a[2] + // a[5]a[2] + // a[6]a[2] + // a[7]a[2] + // a[4]a[3] (iv) + // a[5]a[3] + // a[6]a[3] + // a[7]a[3] + // a[5]a[4] (v) + // a[6]a[4] + // a[7]a[4] + // a[6]a[5] (vi) + // a[7]a[5] + // a[7]a[6] (vii) + + mul x14,x7,x6 // lo(a[1..7]*a[0]) (i) + mul x15,x8,x6 + mul x16,x9,x6 + mul x17,x10,x6 + adds x20,x20,x14 // t[1]+lo(a[1]*a[0]) + mul x14,x11,x6 + adcs x21,x21,x15 + mul x15,x12,x6 + adcs x22,x22,x16 + mul x16,x13,x6 + adcs x23,x23,x17 + umulh x17,x7,x6 // hi(a[1..7]*a[0]) + adcs x24,x24,x14 + umulh x14,x8,x6 + adcs x25,x25,x15 + umulh x15,x9,x6 + adcs x26,x26,x16 + umulh x16,x10,x6 + stp x19,x20,[x2],#8*2 // t[0..1] + adc x19,xzr,xzr // t[8] + adds x21,x21,x17 // t[2]+lo(a[1]*a[0]) + umulh x17,x11,x6 + adcs x22,x22,x14 + umulh x14,x12,x6 + adcs x23,x23,x15 + umulh x15,x13,x6 + adcs x24,x24,x16 + mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii) + adcs x25,x25,x17 + mul x17,x9,x7 + adcs x26,x26,x14 + mul x14,x10,x7 + adc x19,x19,x15 + + mul x15,x11,x7 + adds x22,x22,x16 + mul x16,x12,x7 + adcs x23,x23,x17 + mul x17,x13,x7 + adcs x24,x24,x14 + umulh x14,x8,x7 // hi(a[2..7]*a[1]) + adcs x25,x25,x15 + umulh x15,x9,x7 + adcs x26,x26,x16 + umulh x16,x10,x7 + adcs x19,x19,x17 + umulh x17,x11,x7 + stp x21,x22,[x2],#8*2 // t[2..3] + adc x20,xzr,xzr // t[9] + adds x23,x23,x14 + umulh x14,x12,x7 + adcs x24,x24,x15 + umulh x15,x13,x7 + adcs x25,x25,x16 + mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii) + adcs x26,x26,x17 + mul x17,x10,x8 + adcs x19,x19,x14 + mul x14,x11,x8 + adc x20,x20,x15 + + mul x15,x12,x8 + adds x24,x24,x16 + mul x16,x13,x8 + adcs x25,x25,x17 + umulh x17,x9,x8 // hi(a[3..7]*a[2]) + adcs x26,x26,x14 + umulh x14,x10,x8 + adcs x19,x19,x15 + umulh x15,x11,x8 + adcs x20,x20,x16 + umulh x16,x12,x8 + stp x23,x24,[x2],#8*2 // t[4..5] + adc x21,xzr,xzr // t[10] + adds x25,x25,x17 + umulh x17,x13,x8 + adcs x26,x26,x14 + mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv) + adcs x19,x19,x15 + mul x15,x11,x9 + adcs x20,x20,x16 + mul x16,x12,x9 + adc x21,x21,x17 + + mul x17,x13,x9 + adds x26,x26,x14 + umulh x14,x10,x9 // hi(a[4..7]*a[3]) + adcs x19,x19,x15 + umulh x15,x11,x9 + adcs x20,x20,x16 + umulh x16,x12,x9 + adcs x21,x21,x17 + umulh x17,x13,x9 + stp x25,x26,[x2],#8*2 // t[6..7] + adc x22,xzr,xzr // t[11] + adds x19,x19,x14 + mul x14,x11,x10 // lo(a[5..7]*a[4]) (v) + adcs x20,x20,x15 + mul x15,x12,x10 + adcs x21,x21,x16 + mul x16,x13,x10 + adc x22,x22,x17 + + umulh 
x17,x11,x10 // hi(a[5..7]*a[4]) + adds x20,x20,x14 + umulh x14,x12,x10 + adcs x21,x21,x15 + umulh x15,x13,x10 + adcs x22,x22,x16 + mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi) + adc x23,xzr,xzr // t[12] + adds x21,x21,x17 + mul x17,x13,x11 + adcs x22,x22,x14 + umulh x14,x12,x11 // hi(a[6..7]*a[5]) + adc x23,x23,x15 + + umulh x15,x13,x11 + adds x22,x22,x16 + mul x16,x13,x12 // lo(a[7]*a[6]) (vii) + adcs x23,x23,x17 + umulh x17,x13,x12 // hi(a[7]*a[6]) + adc x24,xzr,xzr // t[13] + adds x23,x23,x14 + sub x27,x3,x1 // done yet? + adc x24,x24,x15 + + adds x24,x24,x16 + sub x14,x3,x5 // rewinded ap + adc x25,xzr,xzr // t[14] + add x25,x25,x17 + + cbz x27,.Lsqr8x_outer_break + + mov x4,x6 + ldp x6,x7,[x2,#8*0] + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + adds x19,x19,x6 + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x0,x1 + adcs x26,xzr,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved below + mov x27,#-8*8 + + // a[8]a[0] + // a[9]a[0] + // a[a]a[0] + // a[b]a[0] + // a[c]a[0] + // a[d]a[0] + // a[e]a[0] + // a[f]a[0] + // a[8]a[1] + // a[f]a[1]........................ + // a[8]a[2] + // a[f]a[2]........................ + // a[8]a[3] + // a[f]a[3]........................ + // a[8]a[4] + // a[f]a[4]........................ + // a[8]a[5] + // a[f]a[5]........................ + // a[8]a[6] + // a[f]a[6]........................ + // a[8]a[7] + // a[f]a[7]........................ +.Lsqr8x_mul: + mul x14,x6,x4 + adc x28,xzr,xzr // carry bit, modulo-scheduled + mul x15,x7,x4 + add x27,x27,#8 + mul x16,x8,x4 + mul x17,x9,x4 + adds x19,x19,x14 + mul x14,x10,x4 + adcs x20,x20,x15 + mul x15,x11,x4 + adcs x21,x21,x16 + mul x16,x12,x4 + adcs x22,x22,x17 + mul x17,x13,x4 + adcs x23,x23,x14 + umulh x14,x6,x4 + adcs x24,x24,x15 + umulh x15,x7,x4 + adcs x25,x25,x16 + umulh x16,x8,x4 + adcs x26,x26,x17 + umulh x17,x9,x4 + adc x28,x28,xzr + str x19,[x2],#8 + adds x19,x20,x14 + umulh x14,x10,x4 + adcs x20,x21,x15 + umulh x15,x11,x4 + adcs x21,x22,x16 + umulh x16,x12,x4 + adcs x22,x23,x17 + umulh x17,x13,x4 + ldr x4,[x0,x27] + adcs x23,x24,x14 + adcs x24,x25,x15 + adcs x25,x26,x16 + adcs x26,x28,x17 + //adc x28,xzr,xzr // moved above + cbnz x27,.Lsqr8x_mul + // note that carry flag is guaranteed + // to be zero at this point + cmp x1,x3 // done yet? + b.eq .Lsqr8x_break + + ldp x6,x7,[x2,#8*0] + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + adds x19,x19,x6 + ldr x4,[x0,#-8*8] + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x27,#-8*8 + adcs x26,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved above + b .Lsqr8x_mul + +.align 4 +.Lsqr8x_break: + ldp x6,x7,[x0,#8*0] + add x1,x0,#8*8 + ldp x8,x9,[x0,#8*2] + sub x14,x3,x1 // is it last iteration? 
+ ldp x10,x11,[x0,#8*4] + sub x15,x2,x14 + ldp x12,x13,[x0,#8*6] + cbz x14,.Lsqr8x_outer_loop + + stp x19,x20,[x2,#8*0] + ldp x19,x20,[x15,#8*0] + stp x21,x22,[x2,#8*2] + ldp x21,x22,[x15,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[x15,#8*4] + stp x25,x26,[x2,#8*6] + mov x2,x15 + ldp x25,x26,[x15,#8*6] + b .Lsqr8x_outer_loop + +.align 4 +.Lsqr8x_outer_break: + // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] + ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0] + ldp x15,x16,[sp,#8*1] + ldp x11,x13,[x14,#8*2] + add x1,x14,#8*4 + ldp x17,x14,[sp,#8*3] + + stp x19,x20,[x2,#8*0] + mul x19,x7,x7 + stp x21,x22,[x2,#8*2] + umulh x7,x7,x7 + stp x23,x24,[x2,#8*4] + mul x8,x9,x9 + stp x25,x26,[x2,#8*6] + mov x2,sp + umulh x9,x9,x9 + adds x20,x7,x15,lsl#1 + extr x15,x16,x15,#63 + sub x27,x5,#8*4 + +.Lsqr4x_shift_n_add: + adcs x21,x8,x15 + extr x16,x17,x16,#63 + sub x27,x27,#8*4 + adcs x22,x9,x16 + ldp x15,x16,[x2,#8*5] + mul x10,x11,x11 + ldp x7,x9,[x1],#8*2 + umulh x11,x11,x11 + mul x12,x13,x13 + umulh x13,x13,x13 + extr x17,x14,x17,#63 + stp x19,x20,[x2,#8*0] + adcs x23,x10,x17 + extr x14,x15,x14,#63 + stp x21,x22,[x2,#8*2] + adcs x24,x11,x14 + ldp x17,x14,[x2,#8*7] + extr x15,x16,x15,#63 + adcs x25,x12,x15 + extr x16,x17,x16,#63 + adcs x26,x13,x16 + ldp x15,x16,[x2,#8*9] + mul x6,x7,x7 + ldp x11,x13,[x1],#8*2 + umulh x7,x7,x7 + mul x8,x9,x9 + umulh x9,x9,x9 + stp x23,x24,[x2,#8*4] + extr x17,x14,x17,#63 + stp x25,x26,[x2,#8*6] + add x2,x2,#8*8 + adcs x19,x6,x17 + extr x14,x15,x14,#63 + adcs x20,x7,x14 + ldp x17,x14,[x2,#8*3] + extr x15,x16,x15,#63 + cbnz x27,.Lsqr4x_shift_n_add + ldp x1,x4,[x29,#104] // pull np and n0 + + adcs x21,x8,x15 + extr x16,x17,x16,#63 + adcs x22,x9,x16 + ldp x15,x16,[x2,#8*5] + mul x10,x11,x11 + umulh x11,x11,x11 + stp x19,x20,[x2,#8*0] + mul x12,x13,x13 + umulh x13,x13,x13 + stp x21,x22,[x2,#8*2] + extr x17,x14,x17,#63 + adcs x23,x10,x17 + extr x14,x15,x14,#63 + ldp x19,x20,[sp,#8*0] + adcs x24,x11,x14 + extr x15,x16,x15,#63 + ldp x6,x7,[x1,#8*0] + adcs x25,x12,x15 + extr x16,xzr,x16,#63 + ldp x8,x9,[x1,#8*2] + adc x26,x13,x16 + ldp x10,x11,[x1,#8*4] + + // Reduce by 512 bits per iteration + mul x28,x4,x19 // t[0]*n0 + ldp x12,x13,[x1,#8*6] + add x3,x1,x5 + ldp x21,x22,[sp,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[sp,#8*4] + stp x25,x26,[x2,#8*6] + ldp x25,x26,[sp,#8*6] + add x1,x1,#8*8 + mov x30,xzr // initial top-most carry + mov x2,sp + mov x27,#8 + +.Lsqr8x_reduction: + // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0) + mul x15,x7,x28 + sub x27,x27,#1 + mul x16,x8,x28 + str x28,[x2],#8 // put aside t[0]*n0 for tail processing + mul x17,x9,x28 + // (*) adds xzr,x19,x14 + subs xzr,x19,#1 // (*) + mul x14,x10,x28 + adcs x19,x20,x15 + mul x15,x11,x28 + adcs x20,x21,x16 + mul x16,x12,x28 + adcs x21,x22,x17 + mul x17,x13,x28 + adcs x22,x23,x14 + umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0) + adcs x23,x24,x15 + umulh x15,x7,x28 + adcs x24,x25,x16 + umulh x16,x8,x28 + adcs x25,x26,x17 + umulh x17,x9,x28 + adc x26,xzr,xzr + adds x19,x19,x14 + umulh x14,x10,x28 + adcs x20,x20,x15 + umulh x15,x11,x28 + adcs x21,x21,x16 + umulh x16,x12,x28 + adcs x22,x22,x17 + umulh x17,x13,x28 + mul x28,x4,x19 // next t[0]*n0 + adcs x23,x23,x14 + adcs x24,x24,x15 + adcs x25,x25,x16 + adc x26,x26,x17 + cbnz x27,.Lsqr8x_reduction + + ldp x14,x15,[x2,#8*0] + ldp x16,x17,[x2,#8*2] + mov x0,x2 + sub x27,x3,x1 // done yet? 
+ adds x19,x19,x14 + adcs x20,x20,x15 + ldp x14,x15,[x2,#8*4] + adcs x21,x21,x16 + adcs x22,x22,x17 + ldp x16,x17,[x2,#8*6] + adcs x23,x23,x14 + adcs x24,x24,x15 + adcs x25,x25,x16 + adcs x26,x26,x17 + //adc x28,xzr,xzr // moved below + cbz x27,.Lsqr8x8_post_condition + + ldr x4,[x2,#-8*8] + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + ldp x10,x11,[x1,#8*4] + mov x27,#-8*8 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + +.Lsqr8x_tail: + mul x14,x6,x4 + adc x28,xzr,xzr // carry bit, modulo-scheduled + mul x15,x7,x4 + add x27,x27,#8 + mul x16,x8,x4 + mul x17,x9,x4 + adds x19,x19,x14 + mul x14,x10,x4 + adcs x20,x20,x15 + mul x15,x11,x4 + adcs x21,x21,x16 + mul x16,x12,x4 + adcs x22,x22,x17 + mul x17,x13,x4 + adcs x23,x23,x14 + umulh x14,x6,x4 + adcs x24,x24,x15 + umulh x15,x7,x4 + adcs x25,x25,x16 + umulh x16,x8,x4 + adcs x26,x26,x17 + umulh x17,x9,x4 + adc x28,x28,xzr + str x19,[x2],#8 + adds x19,x20,x14 + umulh x14,x10,x4 + adcs x20,x21,x15 + umulh x15,x11,x4 + adcs x21,x22,x16 + umulh x16,x12,x4 + adcs x22,x23,x17 + umulh x17,x13,x4 + ldr x4,[x0,x27] + adcs x23,x24,x14 + adcs x24,x25,x15 + adcs x25,x26,x16 + adcs x26,x28,x17 + //adc x28,xzr,xzr // moved above + cbnz x27,.Lsqr8x_tail + // note that carry flag is guaranteed + // to be zero at this point + ldp x6,x7,[x2,#8*0] + sub x27,x3,x1 // done yet? + sub x16,x3,x5 // rewinded np + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + cbz x27,.Lsqr8x_tail_break + + ldr x4,[x0,#-8*8] + adds x19,x19,x6 + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x27,#-8*8 + adcs x26,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved above + b .Lsqr8x_tail + +.align 4 +.Lsqr8x_tail_break: + ldr x4,[x29,#112] // pull n0 + add x27,x2,#8*8 // end of current t[num] window + + subs xzr,x30,#1 // "move" top-most carry to carry bit + adcs x14,x19,x6 + adcs x15,x20,x7 + ldp x19,x20,[x0,#8*0] + adcs x21,x21,x8 + ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0] + adcs x22,x22,x9 + ldp x8,x9,[x16,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x16,#8*4] + adcs x25,x25,x12 + adcs x26,x26,x13 + ldp x12,x13,[x16,#8*6] + add x1,x16,#8*8 + adc x30,xzr,xzr // top-most carry + mul x28,x4,x19 + stp x14,x15,[x2,#8*0] + stp x21,x22,[x2,#8*2] + ldp x21,x22,[x0,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[x0,#8*4] + cmp x27,x29 // did we hit the bottom? + stp x25,x26,[x2,#8*6] + mov x2,x0 // slide the window + ldp x25,x26,[x0,#8*6] + mov x27,#8 + b.ne .Lsqr8x_reduction + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. 
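+	// Clarifying note (not from the upstream source): the borrow from
+	// this subtraction is left in the flags, and .Lsqr4x_cond_copy below
+	// uses csel on the "lo" condition to pick the reduced or unreduced
+	// value, so the reduction happens without a data-dependent branch.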
+ ldr x0,[x29,#96] // pull rp + add x2,x2,#8*8 + subs x14,x19,x6 + sbcs x15,x20,x7 + sub x27,x5,#8*8 + mov x3,x0 // x0 copy + +.Lsqr8x_sub: + sbcs x16,x21,x8 + ldp x6,x7,[x1,#8*0] + sbcs x17,x22,x9 + stp x14,x15,[x0,#8*0] + sbcs x14,x23,x10 + ldp x8,x9,[x1,#8*2] + sbcs x15,x24,x11 + stp x16,x17,[x0,#8*2] + sbcs x16,x25,x12 + ldp x10,x11,[x1,#8*4] + sbcs x17,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + ldp x19,x20,[x2,#8*0] + sub x27,x27,#8*8 + ldp x21,x22,[x2,#8*2] + ldp x23,x24,[x2,#8*4] + ldp x25,x26,[x2,#8*6] + add x2,x2,#8*8 + stp x14,x15,[x0,#8*4] + sbcs x14,x19,x6 + stp x16,x17,[x0,#8*6] + add x0,x0,#8*8 + sbcs x15,x20,x7 + cbnz x27,.Lsqr8x_sub + + sbcs x16,x21,x8 + mov x2,sp + add x1,sp,x5 + ldp x6,x7,[x3,#8*0] + sbcs x17,x22,x9 + stp x14,x15,[x0,#8*0] + sbcs x14,x23,x10 + ldp x8,x9,[x3,#8*2] + sbcs x15,x24,x11 + stp x16,x17,[x0,#8*2] + sbcs x16,x25,x12 + ldp x19,x20,[x1,#8*0] + sbcs x17,x26,x13 + ldp x21,x22,[x1,#8*2] + sbcs xzr,x30,xzr // did it borrow? + ldr x30,[x29,#8] // pull return address + stp x14,x15,[x0,#8*4] + stp x16,x17,[x0,#8*6] + + sub x27,x5,#8*4 +.Lsqr4x_cond_copy: + sub x27,x27,#8*4 + csel x14,x19,x6,lo + stp xzr,xzr,[x2,#8*0] + csel x15,x20,x7,lo + ldp x6,x7,[x3,#8*4] + ldp x19,x20,[x1,#8*4] + csel x16,x21,x8,lo + stp xzr,xzr,[x2,#8*2] + add x2,x2,#8*4 + csel x17,x22,x9,lo + ldp x8,x9,[x3,#8*6] + ldp x21,x22,[x1,#8*6] + add x1,x1,#8*4 + stp x14,x15,[x3,#8*0] + stp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + stp xzr,xzr,[x1,#8*0] + stp xzr,xzr,[x1,#8*2] + cbnz x27,.Lsqr4x_cond_copy + + csel x14,x19,x6,lo + stp xzr,xzr,[x2,#8*0] + csel x15,x20,x7,lo + stp xzr,xzr,[x2,#8*2] + csel x16,x21,x8,lo + csel x17,x22,x9,lo + stp x14,x15,[x3,#8*0] + stp x16,x17,[x3,#8*2] + + b .Lsqr8x_done + +.align 4 +.Lsqr8x8_post_condition: + adc x28,xzr,xzr + ldr x30,[x29,#8] // pull return address + // x19-7,x28 hold result, x6-7 hold modulus + subs x6,x19,x6 + ldr x1,[x29,#96] // pull rp + sbcs x7,x20,x7 + stp xzr,xzr,[sp,#8*0] + sbcs x8,x21,x8 + stp xzr,xzr,[sp,#8*2] + sbcs x9,x22,x9 + stp xzr,xzr,[sp,#8*4] + sbcs x10,x23,x10 + stp xzr,xzr,[sp,#8*6] + sbcs x11,x24,x11 + stp xzr,xzr,[sp,#8*8] + sbcs x12,x25,x12 + stp xzr,xzr,[sp,#8*10] + sbcs x13,x26,x13 + stp xzr,xzr,[sp,#8*12] + sbcs x28,x28,xzr // did it borrow? + stp xzr,xzr,[sp,#8*14] + + // x6-7 hold result-modulus + csel x6,x19,x6,lo + csel x7,x20,x7,lo + csel x8,x21,x8,lo + csel x9,x22,x9,lo + stp x6,x7,[x1,#8*0] + csel x10,x23,x10,lo + csel x11,x24,x11,lo + stp x8,x9,[x1,#8*2] + csel x12,x25,x12,lo + csel x13,x26,x13,lo + stp x10,x11,[x1,#8*4] + stp x12,x13,[x1,#8*6] + +.Lsqr8x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + ret +.size __bn_sqr8x_mont,.-__bn_sqr8x_mont +.type __bn_mul4x_mont,%function +.align 5 +__bn_mul4x_mont: + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + sub x26,sp,x5,lsl#3 + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + sub sp,x26,#8*4 // alloca + + add x10,x2,x5 + add x27,x1,x5 + stp x0,x10,[x29,#96] // offload rp and &b[num] + + ldr x24,[x2,#8*0] // b[0] + ldp x6,x7,[x1,#8*0] // a[0..3] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + mov x19,xzr + mov x20,xzr + mov x21,xzr + mov x22,xzr + ldp x14,x15,[x3,#8*0] // n[0..3] + ldp x16,x17,[x3,#8*2] + adds x3,x3,#8*4 // clear carry bit + mov x0,xzr + mov x28,#0 + mov x26,sp + +.Loop_mul4x_1st_reduction: + mul x10,x6,x24 // lo(a[0..3]*b[0]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[0..3]*b[0]) + adcs x20,x20,x11 + mul x25,x19,x4 // t[0]*n0 + adcs x21,x21,x12 + umulh x11,x7,x24 + adcs x22,x22,x13 + umulh x12,x8,x24 + adc x23,xzr,xzr + umulh x13,x9,x24 + ldr x24,[x2,x28] // next b[i] (or b[0]) + adds x20,x20,x10 + // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0) + str x25,[x26],#8 // put aside t[0]*n0 for tail processing + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + // (*) adds xzr,x19,x10 + subs xzr,x19,#1 // (*) + umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0) + adcs x19,x20,x11 + umulh x11,x15,x25 + adcs x20,x21,x12 + umulh x12,x16,x25 + adcs x21,x22,x13 + umulh x13,x17,x25 + adcs x22,x23,x0 + adc x0,xzr,xzr + adds x19,x19,x10 + sub x10,x27,x1 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + cbnz x28,.Loop_mul4x_1st_reduction + + cbz x10,.Lmul4x4_post_condition + + ldp x6,x7,[x1,#8*0] // a[4..7] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + ldr x25,[sp] // a[0]*n0 + ldp x14,x15,[x3,#8*0] // n[4..7] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + +.Loop_mul4x_1st_tail: + mul x10,x6,x24 // lo(a[4..7]*b[i]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[4..7]*b[i]) + adcs x20,x20,x11 + umulh x11,x7,x24 + adcs x21,x21,x12 + umulh x12,x8,x24 + adcs x22,x22,x13 + umulh x13,x9,x24 + adc x23,xzr,xzr + ldr x24,[x2,x28] // next b[i] (or b[0]) + adds x20,x20,x10 + mul x10,x14,x25 // lo(n[4..7]*a[0]*n0) + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + adds x19,x19,x10 + umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0) + adcs x20,x20,x11 + umulh x11,x15,x25 + adcs x21,x21,x12 + umulh x12,x16,x25 + adcs x22,x22,x13 + adcs x23,x23,x0 + umulh x13,x17,x25 + adc x0,xzr,xzr + ldr x25,[sp,x28] // next t[0]*n0 + str x19,[x26],#8 // result!!! + adds x19,x20,x10 + sub x10,x27,x1 // done yet? + adcs x20,x21,x11 + adcs x21,x22,x12 + adcs x22,x23,x13 + //adc x0,x0,xzr + cbnz x28,.Loop_mul4x_1st_tail + + sub x11,x27,x5 // rewinded x1 + cbz x10,.Lmul4x_proceed + + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + ldp x14,x15,[x3,#8*0] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + b .Loop_mul4x_1st_tail + +.align 5 +.Lmul4x_proceed: + ldr x24,[x2,#8*4]! // *++b + adc x30,x0,xzr + ldp x6,x7,[x11,#8*0] // a[0..3] + sub x3,x3,x5 // rewind np + ldp x8,x9,[x11,#8*2] + add x1,x11,#8*4 + + stp x19,x20,[x26,#8*0] // result!!! + ldp x19,x20,[sp,#8*4] // t[0..3] + stp x21,x22,[x26,#8*2] // result!!! 
+ ldp x21,x22,[sp,#8*6] + + ldp x14,x15,[x3,#8*0] // n[0..3] + mov x26,sp + ldp x16,x17,[x3,#8*2] + adds x3,x3,#8*4 // clear carry bit + mov x0,xzr + +.align 4 +.Loop_mul4x_reduction: + mul x10,x6,x24 // lo(a[0..3]*b[4]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[0..3]*b[4]) + adcs x20,x20,x11 + mul x25,x19,x4 // t[0]*n0 + adcs x21,x21,x12 + umulh x11,x7,x24 + adcs x22,x22,x13 + umulh x12,x8,x24 + adc x23,xzr,xzr + umulh x13,x9,x24 + ldr x24,[x2,x28] // next b[i] + adds x20,x20,x10 + // (*) mul x10,x14,x25 + str x25,[x26],#8 // put aside t[0]*n0 for tail processing + adcs x21,x21,x11 + mul x11,x15,x25 // lo(n[0..3]*t[0]*n0 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + // (*) adds xzr,x19,x10 + subs xzr,x19,#1 // (*) + umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0 + adcs x19,x20,x11 + umulh x11,x15,x25 + adcs x20,x21,x12 + umulh x12,x16,x25 + adcs x21,x22,x13 + umulh x13,x17,x25 + adcs x22,x23,x0 + adc x0,xzr,xzr + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + cbnz x28,.Loop_mul4x_reduction + + adc x0,x0,xzr + ldp x10,x11,[x26,#8*4] // t[4..7] + ldp x12,x13,[x26,#8*6] + ldp x6,x7,[x1,#8*0] // a[4..7] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + + ldr x25,[sp] // t[0]*n0 + ldp x14,x15,[x3,#8*0] // n[4..7] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + +.align 4 +.Loop_mul4x_tail: + mul x10,x6,x24 // lo(a[4..7]*b[4]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[4..7]*b[4]) + adcs x20,x20,x11 + umulh x11,x7,x24 + adcs x21,x21,x12 + umulh x12,x8,x24 + adcs x22,x22,x13 + umulh x13,x9,x24 + adc x23,xzr,xzr + ldr x24,[x2,x28] // next b[i] + adds x20,x20,x10 + mul x10,x14,x25 // lo(n[4..7]*t[0]*n0) + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + adds x19,x19,x10 + umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0) + adcs x20,x20,x11 + umulh x11,x15,x25 + adcs x21,x21,x12 + umulh x12,x16,x25 + adcs x22,x22,x13 + umulh x13,x17,x25 + adcs x23,x23,x0 + ldr x25,[sp,x28] // next a[0]*n0 + adc x0,xzr,xzr + str x19,[x26],#8 // result!!! + adds x19,x20,x10 + sub x10,x27,x1 // done yet? + adcs x20,x21,x11 + adcs x21,x22,x12 + adcs x22,x23,x13 + //adc x0,x0,xzr + cbnz x28,.Loop_mul4x_tail + + sub x11,x3,x5 // rewinded np? + adc x0,x0,xzr + cbz x10,.Loop_mul4x_break + + ldp x10,x11,[x26,#8*4] + ldp x12,x13,[x26,#8*6] + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + ldp x14,x15,[x3,#8*0] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + b .Loop_mul4x_tail + +.align 4 +.Loop_mul4x_break: + ldp x12,x13,[x29,#96] // pull rp and &b[num] + adds x19,x19,x30 + add x2,x2,#8*4 // bp++ + adcs x20,x20,xzr + sub x1,x1,x5 // rewind ap + adcs x21,x21,xzr + stp x19,x20,[x26,#8*0] // result!!! + adcs x22,x22,xzr + ldp x19,x20,[sp,#8*4] // t[0..3] + adc x30,x0,xzr + stp x21,x22,[x26,#8*2] // result!!! + cmp x2,x13 // done yet? 
+ ldp x21,x22,[sp,#8*6] + ldp x14,x15,[x11,#8*0] // n[0..3] + ldp x16,x17,[x11,#8*2] + add x3,x11,#8*4 + b.eq .Lmul4x_post + + ldr x24,[x2] + ldp x6,x7,[x1,#8*0] // a[0..3] + ldp x8,x9,[x1,#8*2] + adds x1,x1,#8*4 // clear carry bit + mov x0,xzr + mov x26,sp + b .Loop_mul4x_reduction + +.align 4 +.Lmul4x_post: + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + mov x0,x12 + mov x27,x12 // x0 copy + subs x10,x19,x14 + add x26,sp,#8*8 + sbcs x11,x20,x15 + sub x28,x5,#8*4 + +.Lmul4x_sub: + sbcs x12,x21,x16 + ldp x14,x15,[x3,#8*0] + sub x28,x28,#8*4 + ldp x19,x20,[x26,#8*0] + sbcs x13,x22,x17 + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + ldp x21,x22,[x26,#8*2] + add x26,x26,#8*4 + stp x10,x11,[x0,#8*0] + sbcs x10,x19,x14 + stp x12,x13,[x0,#8*2] + add x0,x0,#8*4 + sbcs x11,x20,x15 + cbnz x28,.Lmul4x_sub + + sbcs x12,x21,x16 + mov x26,sp + add x1,sp,#8*4 + ldp x6,x7,[x27,#8*0] + sbcs x13,x22,x17 + stp x10,x11,[x0,#8*0] + ldp x8,x9,[x27,#8*2] + stp x12,x13,[x0,#8*2] + ldp x19,x20,[x1,#8*0] + ldp x21,x22,[x1,#8*2] + sbcs xzr,x30,xzr // did it borrow? + ldr x30,[x29,#8] // pull return address + + sub x28,x5,#8*4 +.Lmul4x_cond_copy: + sub x28,x28,#8*4 + csel x10,x19,x6,lo + stp xzr,xzr,[x26,#8*0] + csel x11,x20,x7,lo + ldp x6,x7,[x27,#8*4] + ldp x19,x20,[x1,#8*4] + csel x12,x21,x8,lo + stp xzr,xzr,[x26,#8*2] + add x26,x26,#8*4 + csel x13,x22,x9,lo + ldp x8,x9,[x27,#8*6] + ldp x21,x22,[x1,#8*6] + add x1,x1,#8*4 + stp x10,x11,[x27,#8*0] + stp x12,x13,[x27,#8*2] + add x27,x27,#8*4 + cbnz x28,.Lmul4x_cond_copy + + csel x10,x19,x6,lo + stp xzr,xzr,[x26,#8*0] + csel x11,x20,x7,lo + stp xzr,xzr,[x26,#8*2] + csel x12,x21,x8,lo + stp xzr,xzr,[x26,#8*3] + csel x13,x22,x9,lo + stp xzr,xzr,[x26,#8*4] + stp x10,x11,[x27,#8*0] + stp x12,x13,[x27,#8*2] + + b .Lmul4x_done + +.align 4 +.Lmul4x4_post_condition: + adc x0,x0,xzr + ldr x1,[x29,#96] // pull rp + // x19-3,x0 hold result, x14-7 hold modulus + subs x6,x19,x14 + ldr x30,[x29,#8] // pull return address + sbcs x7,x20,x15 + stp xzr,xzr,[sp,#8*0] + sbcs x8,x21,x16 + stp xzr,xzr,[sp,#8*2] + sbcs x9,x22,x17 + stp xzr,xzr,[sp,#8*4] + sbcs xzr,x0,xzr // did it borrow? + stp xzr,xzr,[sp,#8*6] + + // x6-3 hold result-modulus + csel x6,x19,x6,lo + csel x7,x20,x7,lo + csel x8,x21,x8,lo + csel x9,x22,x9,lo + stp x6,x7,[x1,#8*0] + stp x8,x9,[x1,#8*2] + +.Lmul4x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + mov x0,#1 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + ret +.size __bn_mul4x_mont,.-__bn_mul4x_mont +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 4 +#endif diff --git a/packager/third_party/boringssl/linux-aarch64/crypto/chacha/chacha-armv8.S b/packager/third_party/boringssl/linux-aarch64/crypto/chacha/chacha-armv8.S new file mode 100644 index 0000000000..6ff6bffb66 --- /dev/null +++ b/packager/third_party/boringssl/linux-aarch64/crypto/chacha/chacha-armv8.S @@ -0,0 +1,1971 @@ +#if defined(__aarch64__) +#include + +.text + + + +.align 5 +.Lsigma: +.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral +.Lone: +.long 1,0,0,0 +.LOPENSSL_armcap_P: +#ifdef __ILP32__ +.long OPENSSL_armcap_P-. 
+#else +.quad OPENSSL_armcap_P-. +#endif +.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 + +.globl ChaCha20_ctr32 +.hidden ChaCha20_ctr32 +.type ChaCha20_ctr32,%function +.align 5 +ChaCha20_ctr32: + cbz x2,.Labort + adr x5,.LOPENSSL_armcap_P + cmp x2,#192 + b.lo .Lshort +#ifdef __ILP32__ + ldrsw x6,[x5] +#else + ldr x6,[x5] +#endif + ldr w17,[x6,x5] + tst w17,#ARMV7_NEON + b.ne ChaCha20_neon + +.Lshort: + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adr x5,.Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#64 + + ldp x22,x23,[x5] // load sigma + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ldp x28,x30,[x4] // load counter +#ifdef __ARMEB__ + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + +.Loop_outer: + mov w5,w22 // unpack key block + lsr x6,x22,#32 + mov w7,w23 + lsr x8,x23,#32 + mov w9,w24 + lsr x10,x24,#32 + mov w11,w25 + lsr x12,x25,#32 + mov w13,w26 + lsr x14,x26,#32 + mov w15,w27 + lsr x16,x27,#32 + mov w17,w28 + lsr x19,x28,#32 + mov w20,w30 + lsr x21,x30,#32 + + mov x4,#10 + subs x2,x2,#64 +.Loop: + sub x4,x4,#1 + add w5,w5,w9 + add w6,w6,w10 + add w7,w7,w11 + add w8,w8,w12 + eor w17,w17,w5 + eor w19,w19,w6 + eor w20,w20,w7 + eor w21,w21,w8 + ror w17,w17,#16 + ror w19,w19,#16 + ror w20,w20,#16 + ror w21,w21,#16 + add w13,w13,w17 + add w14,w14,w19 + add w15,w15,w20 + add w16,w16,w21 + eor w9,w9,w13 + eor w10,w10,w14 + eor w11,w11,w15 + eor w12,w12,w16 + ror w9,w9,#20 + ror w10,w10,#20 + ror w11,w11,#20 + ror w12,w12,#20 + add w5,w5,w9 + add w6,w6,w10 + add w7,w7,w11 + add w8,w8,w12 + eor w17,w17,w5 + eor w19,w19,w6 + eor w20,w20,w7 + eor w21,w21,w8 + ror w17,w17,#24 + ror w19,w19,#24 + ror w20,w20,#24 + ror w21,w21,#24 + add w13,w13,w17 + add w14,w14,w19 + add w15,w15,w20 + add w16,w16,w21 + eor w9,w9,w13 + eor w10,w10,w14 + eor w11,w11,w15 + eor w12,w12,w16 + ror w9,w9,#25 + ror w10,w10,#25 + ror w11,w11,#25 + ror w12,w12,#25 + add w5,w5,w10 + add w6,w6,w11 + add w7,w7,w12 + add w8,w8,w9 + eor w21,w21,w5 + eor w17,w17,w6 + eor w19,w19,w7 + eor w20,w20,w8 + ror w21,w21,#16 + ror w17,w17,#16 + ror w19,w19,#16 + ror w20,w20,#16 + add w15,w15,w21 + add w16,w16,w17 + add w13,w13,w19 + add w14,w14,w20 + eor w10,w10,w15 + eor w11,w11,w16 + eor w12,w12,w13 + eor w9,w9,w14 + ror w10,w10,#20 + ror w11,w11,#20 + ror w12,w12,#20 + ror w9,w9,#20 + add w5,w5,w10 + add w6,w6,w11 + add w7,w7,w12 + add w8,w8,w9 + eor w21,w21,w5 + eor w17,w17,w6 + eor w19,w19,w7 + eor w20,w20,w8 + ror w21,w21,#24 + ror w17,w17,#24 + ror w19,w19,#24 + ror w20,w20,#24 + add w15,w15,w21 + add w16,w16,w17 + add w13,w13,w19 + add w14,w14,w20 + eor w10,w10,w15 + eor w11,w11,w16 + eor w12,w12,w13 + eor w9,w9,w14 + ror w10,w10,#25 + ror w11,w11,#25 + ror w12,w12,#25 + ror w9,w9,#25 + cbnz x4,.Loop + + add w5,w5,w22 // accumulate key block + add x6,x6,x22,lsr#32 + add w7,w7,w23 + add x8,x8,x23,lsr#32 + add w9,w9,w24 + add x10,x10,x24,lsr#32 + add w11,w11,w25 + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add x21,x21,x30,lsr#32 + + b.lo .Ltail + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp 
x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __ARMEB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#1 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + + b.hi .Loop_outer + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 +.Labort: + ret + +.align 4 +.Ltail: + add x2,x2,#64 +.Less_than_64: + sub x0,x0,#1 + add x1,x1,x2 + add x0,x0,x2 + add x4,sp,x2 + neg x2,x2 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 +#ifdef __ARMEB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + stp x5,x7,[sp,#0] + stp x9,x11,[sp,#16] + stp x13,x15,[sp,#32] + stp x17,x20,[sp,#48] + +.Loop_tail: + ldrb w10,[x1,x2] + ldrb w11,[x4,x2] + add x2,x2,#1 + eor w10,w10,w11 + strb w10,[x0,x2] + cbnz x2,.Loop_tail + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + ret +.size ChaCha20_ctr32,.-ChaCha20_ctr32 + +.type ChaCha20_neon,%function +.align 5 +ChaCha20_neon: + stp x29,x30,[sp,#-96]! 
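
[The scalar .Loop above is four ChaCha20 quarter rounds unrolled side by side; its ror amounts of 16/20/24/25 are rotate-lefts by 16/12/8/7. For reference, the quarter round in plain C (illustrative only, not part of the patch):]

#include <stdint.h>

#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

/* One ChaCha20 quarter round over the 16-word state (RFC 7539). */
static void quarter_round(uint32_t x[16], int a, int b, int c, int d) {
  x[a] += x[b]; x[d] = ROTL32(x[d] ^ x[a], 16);
  x[c] += x[d]; x[b] = ROTL32(x[b] ^ x[c], 12);
  x[a] += x[b]; x[d] = ROTL32(x[d] ^ x[a], 8);
  x[c] += x[d]; x[b] = ROTL32(x[b] ^ x[c], 7);
}
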
+ add x29,sp,#0 + + adr x5,.Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + cmp x2,#512 + b.hs .L512_or_more_neon + + sub sp,sp,#64 + + ldp x22,x23,[x5] // load sigma + ld1 {v24.4s},[x5],#16 + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ld1 {v25.4s,v26.4s},[x3] + ldp x28,x30,[x4] // load counter + ld1 {v27.4s},[x4] + ld1 {v31.4s},[x5] +#ifdef __ARMEB__ + rev64 v24.4s,v24.4s + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + add v27.4s,v27.4s,v31.4s // += 1 + add v28.4s,v27.4s,v31.4s + add v29.4s,v28.4s,v31.4s + shl v31.4s,v31.4s,#2 // 1 -> 4 + +.Loop_outer_neon: + mov w5,w22 // unpack key block + lsr x6,x22,#32 + mov v0.16b,v24.16b + mov w7,w23 + lsr x8,x23,#32 + mov v4.16b,v24.16b + mov w9,w24 + lsr x10,x24,#32 + mov v16.16b,v24.16b + mov w11,w25 + mov v1.16b,v25.16b + lsr x12,x25,#32 + mov v5.16b,v25.16b + mov w13,w26 + mov v17.16b,v25.16b + lsr x14,x26,#32 + mov v3.16b,v27.16b + mov w15,w27 + mov v7.16b,v28.16b + lsr x16,x27,#32 + mov v19.16b,v29.16b + mov w17,w28 + mov v2.16b,v26.16b + lsr x19,x28,#32 + mov v6.16b,v26.16b + mov w20,w30 + mov v18.16b,v26.16b + lsr x21,x30,#32 + + mov x4,#10 + subs x2,x2,#256 +.Loop_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v16.4s,v16.4s,v17.4s + add w7,w7,w11 + eor v3.16b,v3.16b,v0.16b + add w8,w8,w12 + eor v7.16b,v7.16b,v4.16b + eor w17,w17,w5 + eor v19.16b,v19.16b,v16.16b + eor w19,w19,w6 + rev32 v3.8h,v3.8h + eor w20,w20,w7 + rev32 v7.8h,v7.8h + eor w21,w21,w8 + rev32 v19.8h,v19.8h + ror w17,w17,#16 + add v2.4s,v2.4s,v3.4s + ror w19,w19,#16 + add v6.4s,v6.4s,v7.4s + ror w20,w20,#16 + add v18.4s,v18.4s,v19.4s + ror w21,w21,#16 + eor v20.16b,v1.16b,v2.16b + add w13,w13,w17 + eor v21.16b,v5.16b,v6.16b + add w14,w14,w19 + eor v22.16b,v17.16b,v18.16b + add w15,w15,w20 + ushr v1.4s,v20.4s,#20 + add w16,w16,w21 + ushr v5.4s,v21.4s,#20 + eor w9,w9,w13 + ushr v17.4s,v22.4s,#20 + eor w10,w10,w14 + sli v1.4s,v20.4s,#12 + eor w11,w11,w15 + sli v5.4s,v21.4s,#12 + eor w12,w12,w16 + sli v17.4s,v22.4s,#12 + ror w9,w9,#20 + add v0.4s,v0.4s,v1.4s + ror w10,w10,#20 + add v4.4s,v4.4s,v5.4s + ror w11,w11,#20 + add v16.4s,v16.4s,v17.4s + ror w12,w12,#20 + eor v20.16b,v3.16b,v0.16b + add w5,w5,w9 + eor v21.16b,v7.16b,v4.16b + add w6,w6,w10 + eor v22.16b,v19.16b,v16.16b + add w7,w7,w11 + ushr v3.4s,v20.4s,#24 + add w8,w8,w12 + ushr v7.4s,v21.4s,#24 + eor w17,w17,w5 + ushr v19.4s,v22.4s,#24 + eor w19,w19,w6 + sli v3.4s,v20.4s,#8 + eor w20,w20,w7 + sli v7.4s,v21.4s,#8 + eor w21,w21,w8 + sli v19.4s,v22.4s,#8 + ror w17,w17,#24 + add v2.4s,v2.4s,v3.4s + ror w19,w19,#24 + add v6.4s,v6.4s,v7.4s + ror w20,w20,#24 + add v18.4s,v18.4s,v19.4s + ror w21,w21,#24 + eor v20.16b,v1.16b,v2.16b + add w13,w13,w17 + eor v21.16b,v5.16b,v6.16b + add w14,w14,w19 + eor v22.16b,v17.16b,v18.16b + add w15,w15,w20 + ushr v1.4s,v20.4s,#25 + add w16,w16,w21 + ushr v5.4s,v21.4s,#25 + eor w9,w9,w13 + ushr v17.4s,v22.4s,#25 + eor w10,w10,w14 + sli v1.4s,v20.4s,#7 + eor w11,w11,w15 + sli v5.4s,v21.4s,#7 + eor w12,w12,w16 + sli v17.4s,v22.4s,#7 + ror w9,w9,#25 + ext v2.16b,v2.16b,v2.16b,#8 + ror w10,w10,#25 + ext v6.16b,v6.16b,v6.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + add 
v0.4s,v0.4s,v1.4s + add w5,w5,w10 + add v4.4s,v4.4s,v5.4s + add w6,w6,w11 + add v16.4s,v16.4s,v17.4s + add w7,w7,w12 + eor v3.16b,v3.16b,v0.16b + add w8,w8,w9 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w5 + eor v19.16b,v19.16b,v16.16b + eor w17,w17,w6 + rev32 v3.8h,v3.8h + eor w19,w19,w7 + rev32 v7.8h,v7.8h + eor w20,w20,w8 + rev32 v19.8h,v19.8h + ror w21,w21,#16 + add v2.4s,v2.4s,v3.4s + ror w17,w17,#16 + add v6.4s,v6.4s,v7.4s + ror w19,w19,#16 + add v18.4s,v18.4s,v19.4s + ror w20,w20,#16 + eor v20.16b,v1.16b,v2.16b + add w15,w15,w21 + eor v21.16b,v5.16b,v6.16b + add w16,w16,w17 + eor v22.16b,v17.16b,v18.16b + add w13,w13,w19 + ushr v1.4s,v20.4s,#20 + add w14,w14,w20 + ushr v5.4s,v21.4s,#20 + eor w10,w10,w15 + ushr v17.4s,v22.4s,#20 + eor w11,w11,w16 + sli v1.4s,v20.4s,#12 + eor w12,w12,w13 + sli v5.4s,v21.4s,#12 + eor w9,w9,w14 + sli v17.4s,v22.4s,#12 + ror w10,w10,#20 + add v0.4s,v0.4s,v1.4s + ror w11,w11,#20 + add v4.4s,v4.4s,v5.4s + ror w12,w12,#20 + add v16.4s,v16.4s,v17.4s + ror w9,w9,#20 + eor v20.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v21.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v22.16b,v19.16b,v16.16b + add w7,w7,w12 + ushr v3.4s,v20.4s,#24 + add w8,w8,w9 + ushr v7.4s,v21.4s,#24 + eor w21,w21,w5 + ushr v19.4s,v22.4s,#24 + eor w17,w17,w6 + sli v3.4s,v20.4s,#8 + eor w19,w19,w7 + sli v7.4s,v21.4s,#8 + eor w20,w20,w8 + sli v19.4s,v22.4s,#8 + ror w21,w21,#24 + add v2.4s,v2.4s,v3.4s + ror w17,w17,#24 + add v6.4s,v6.4s,v7.4s + ror w19,w19,#24 + add v18.4s,v18.4s,v19.4s + ror w20,w20,#24 + eor v20.16b,v1.16b,v2.16b + add w15,w15,w21 + eor v21.16b,v5.16b,v6.16b + add w16,w16,w17 + eor v22.16b,v17.16b,v18.16b + add w13,w13,w19 + ushr v1.4s,v20.4s,#25 + add w14,w14,w20 + ushr v5.4s,v21.4s,#25 + eor w10,w10,w15 + ushr v17.4s,v22.4s,#25 + eor w11,w11,w16 + sli v1.4s,v20.4s,#7 + eor w12,w12,w13 + sli v5.4s,v21.4s,#7 + eor w9,w9,w14 + sli v17.4s,v22.4s,#7 + ror w10,w10,#25 + ext v2.16b,v2.16b,v2.16b,#8 + ror w11,w11,#25 + ext v6.16b,v6.16b,v6.16b,#8 + ror w12,w12,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + cbnz x4,.Loop_neon + + add w5,w5,w22 // accumulate key block + add v0.4s,v0.4s,v24.4s + add x6,x6,x22,lsr#32 + add v4.4s,v4.4s,v24.4s + add w7,w7,w23 + add v16.4s,v16.4s,v24.4s + add x8,x8,x23,lsr#32 + add v2.4s,v2.4s,v26.4s + add w9,w9,w24 + add v6.4s,v6.4s,v26.4s + add x10,x10,x24,lsr#32 + add v18.4s,v18.4s,v26.4s + add w11,w11,w25 + add v3.4s,v3.4s,v27.4s + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add v7.4s,v7.4s,v28.4s + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add v19.4s,v19.4s,v29.4s + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add v1.4s,v1.4s,v25.4s + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add v5.4s,v5.4s,v25.4s + add x21,x21,x30,lsr#32 + add v17.4s,v17.4s,v25.4s + + b.lo .Ltail_neon + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __ARMEB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor v0.16b,v0.16b,v20.16b 
+ eor x15,x15,x16 + eor v1.16b,v1.16b,v21.16b + eor x17,x17,x19 + eor v2.16b,v2.16b,v22.16b + eor x20,x20,x21 + eor v3.16b,v3.16b,v23.16b + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#4 // increment counter + stp x9,x11,[x0,#16] + add v27.4s,v27.4s,v31.4s // += 4 + stp x13,x15,[x0,#32] + add v28.4s,v28.4s,v31.4s + stp x17,x20,[x0,#48] + add v29.4s,v29.4s,v31.4s + add x0,x0,#64 + + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + + eor v4.16b,v4.16b,v20.16b + eor v5.16b,v5.16b,v21.16b + eor v6.16b,v6.16b,v22.16b + eor v7.16b,v7.16b,v23.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + + eor v16.16b,v16.16b,v0.16b + eor v17.16b,v17.16b,v1.16b + eor v18.16b,v18.16b,v2.16b + eor v19.16b,v19.16b,v3.16b + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + + b.hi .Loop_outer_neon + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + ret + +.Ltail_neon: + add x2,x2,#256 + cmp x2,#64 + b.lo .Less_than_64 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __ARMEB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#4 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + b.eq .Ldone_neon + sub x2,x2,#64 + cmp x2,#64 + b.lo .Less_than_128 + + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor v0.16b,v0.16b,v20.16b + eor v1.16b,v1.16b,v21.16b + eor v2.16b,v2.16b,v22.16b + eor v3.16b,v3.16b,v23.16b + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + b.eq .Ldone_neon + sub x2,x2,#64 + cmp x2,#64 + b.lo .Less_than_192 + + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor v4.16b,v4.16b,v20.16b + eor v5.16b,v5.16b,v21.16b + eor v6.16b,v6.16b,v22.16b + eor v7.16b,v7.16b,v23.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + b.eq .Ldone_neon + sub x2,x2,#64 + + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] + b .Last_neon + +.Less_than_128: + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] + b .Last_neon +.Less_than_192: + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] + b .Last_neon + +.align 4 +.Last_neon: + sub x0,x0,#1 + add x1,x1,x2 + add x0,x0,x2 + add x4,sp,x2 + neg x2,x2 + +.Loop_tail_neon: + ldrb w10,[x1,x2] + ldrb w11,[x4,x2] + add x2,x2,#1 + eor w10,w10,w11 + strb w10,[x0,x2] + cbnz x2,.Loop_tail_neon + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + +.Ldone_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + ret +.size ChaCha20_neon,.-ChaCha20_neon +.type ChaCha20_512_neon,%function +.align 5 +ChaCha20_512_neon: + stp x29,x30,[sp,#-96]! 
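
[Whichever path runs — the scalar loop, the 256-byte NEON loop, or the 512-byte bulk loop that begins here — each 64-byte block is produced the same way: the state sigma || key || counter,nonce goes through ten double rounds, the initial state is added back ("accumulate key block"), and the keystream is XORed with the input. A compact C model using quarter_round from the sketch above (illustrative only, not part of the patch; a little-endian host is assumed):]

#include <stdint.h>
#include <string.h>

/* One 64-byte ChaCha20 keystream block from the 16-word state. */
static void chacha20_block(uint8_t out[64], const uint32_t state[16]) {
  uint32_t x[16];
  memcpy(x, state, sizeof(x));
  for (int i = 0; i < 10; i++) {      /* 10 double rounds = 20 rounds */
    quarter_round(x, 0, 4,  8, 12);   /* column round */
    quarter_round(x, 1, 5,  9, 13);
    quarter_round(x, 2, 6, 10, 14);
    quarter_round(x, 3, 7, 11, 15);
    quarter_round(x, 0, 5, 10, 15);   /* diagonal round */
    quarter_round(x, 1, 6, 11, 12);
    quarter_round(x, 2, 7,  8, 13);
    quarter_round(x, 3, 4,  9, 14);
  }
  for (int i = 0; i < 16; i++)
    x[i] += state[i];                 /* "accumulate key block" */
  memcpy(out, x, 64);                 /* little-endian host assumed */
}
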
+ add x29,sp,#0 + + adr x5,.Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + +.L512_or_more_neon: + sub sp,sp,#128+64 + + ldp x22,x23,[x5] // load sigma + ld1 {v24.4s},[x5],#16 + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ld1 {v25.4s,v26.4s},[x3] + ldp x28,x30,[x4] // load counter + ld1 {v27.4s},[x4] + ld1 {v31.4s},[x5] +#ifdef __ARMEB__ + rev64 v24.4s,v24.4s + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + add v27.4s,v27.4s,v31.4s // += 1 + stp q24,q25,[sp,#0] // off-load key block, invariant part + add v27.4s,v27.4s,v31.4s // not typo + str q26,[sp,#32] + add v28.4s,v27.4s,v31.4s + add v29.4s,v28.4s,v31.4s + add v30.4s,v29.4s,v31.4s + shl v31.4s,v31.4s,#2 // 1 -> 4 + + stp d8,d9,[sp,#128+0] // meet ABI requirements + stp d10,d11,[sp,#128+16] + stp d12,d13,[sp,#128+32] + stp d14,d15,[sp,#128+48] + + sub x2,x2,#512 // not typo + +.Loop_outer_512_neon: + mov v0.16b,v24.16b + mov v4.16b,v24.16b + mov v8.16b,v24.16b + mov v12.16b,v24.16b + mov v16.16b,v24.16b + mov v20.16b,v24.16b + mov v1.16b,v25.16b + mov w5,w22 // unpack key block + mov v5.16b,v25.16b + lsr x6,x22,#32 + mov v9.16b,v25.16b + mov w7,w23 + mov v13.16b,v25.16b + lsr x8,x23,#32 + mov v17.16b,v25.16b + mov w9,w24 + mov v21.16b,v25.16b + lsr x10,x24,#32 + mov v3.16b,v27.16b + mov w11,w25 + mov v7.16b,v28.16b + lsr x12,x25,#32 + mov v11.16b,v29.16b + mov w13,w26 + mov v15.16b,v30.16b + lsr x14,x26,#32 + mov v2.16b,v26.16b + mov w15,w27 + mov v6.16b,v26.16b + lsr x16,x27,#32 + add v19.4s,v3.4s,v31.4s // +4 + mov w17,w28 + add v23.4s,v7.4s,v31.4s // +4 + lsr x19,x28,#32 + mov v10.16b,v26.16b + mov w20,w30 + mov v14.16b,v26.16b + lsr x21,x30,#32 + mov v18.16b,v26.16b + stp q27,q28,[sp,#48] // off-load key block, variable part + mov v22.16b,v26.16b + str q29,[sp,#80] + + mov x4,#5 + subs x2,x2,#512 +.Loop_upper_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add 
w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v11.16b,v11.16b,v11.16b,#12 + ext v15.16b,v15.16b,v15.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v23.16b,v23.16b,v23.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v9.16b,v9.16b,v9.16b,#4 + ext v13.16b,v13.16b,v13.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + ext v21.16b,v21.16b,v21.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add 
w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror 
w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v11.16b,v11.16b,v11.16b,#4 + ext v15.16b,v15.16b,v15.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v23.16b,v23.16b,v23.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v9.16b,v9.16b,v9.16b,#12 + ext v13.16b,v13.16b,v13.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + ext v21.16b,v21.16b,v21.16b,#12 + cbnz x4,.Loop_upper_neon + + add w5,w5,w22 // accumulate key block + add x6,x6,x22,lsr#32 + add w7,w7,w23 + add x8,x8,x23,lsr#32 + add w9,w9,w24 + add x10,x10,x24,lsr#32 + add w11,w11,w25 + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add x21,x21,x30,lsr#32 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __ARMEB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#1 // increment counter + mov w5,w22 // unpack key block + lsr x6,x22,#32 + stp x9,x11,[x0,#16] + mov w7,w23 + lsr x8,x23,#32 + stp x13,x15,[x0,#32] + mov w9,w24 + lsr x10,x24,#32 + stp x17,x20,[x0,#48] + add x0,x0,#64 + mov w11,w25 + lsr x12,x25,#32 + mov w13,w26 + lsr x14,x26,#32 + mov w15,w27 + lsr x16,x27,#32 + mov w17,w28 + lsr x19,x28,#32 + mov w20,w30 + lsr x21,x30,#32 + + mov x4,#5 +.Loop_lower_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add 
w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v11.16b,v11.16b,v11.16b,#12 + ext v15.16b,v15.16b,v15.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v23.16b,v23.16b,v23.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v9.16b,v9.16b,v9.16b,#4 + ext v13.16b,v13.16b,v13.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + ext v21.16b,v21.16b,v21.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor 
w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v11.16b,v11.16b,v11.16b,#4 + ext 
v15.16b,v15.16b,v15.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v23.16b,v23.16b,v23.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v9.16b,v9.16b,v9.16b,#12 + ext v13.16b,v13.16b,v13.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + ext v21.16b,v21.16b,v21.16b,#12 + cbnz x4,.Loop_lower_neon + + add w5,w5,w22 // accumulate key block + ldp q24,q25,[sp,#0] + add x6,x6,x22,lsr#32 + ldp q26,q27,[sp,#32] + add w7,w7,w23 + ldp q28,q29,[sp,#64] + add x8,x8,x23,lsr#32 + add v0.4s,v0.4s,v24.4s + add w9,w9,w24 + add v4.4s,v4.4s,v24.4s + add x10,x10,x24,lsr#32 + add v8.4s,v8.4s,v24.4s + add w11,w11,w25 + add v12.4s,v12.4s,v24.4s + add x12,x12,x25,lsr#32 + add v16.4s,v16.4s,v24.4s + add w13,w13,w26 + add v20.4s,v20.4s,v24.4s + add x14,x14,x26,lsr#32 + add v2.4s,v2.4s,v26.4s + add w15,w15,w27 + add v6.4s,v6.4s,v26.4s + add x16,x16,x27,lsr#32 + add v10.4s,v10.4s,v26.4s + add w17,w17,w28 + add v14.4s,v14.4s,v26.4s + add x19,x19,x28,lsr#32 + add v18.4s,v18.4s,v26.4s + add w20,w20,w30 + add v22.4s,v22.4s,v26.4s + add x21,x21,x30,lsr#32 + add v19.4s,v19.4s,v31.4s // +4 + add x5,x5,x6,lsl#32 // pack + add v23.4s,v23.4s,v31.4s // +4 + add x7,x7,x8,lsl#32 + add v3.4s,v3.4s,v27.4s + ldp x6,x8,[x1,#0] // load input + add v7.4s,v7.4s,v28.4s + add x9,x9,x10,lsl#32 + add v11.4s,v11.4s,v29.4s + add x11,x11,x12,lsl#32 + add v15.4s,v15.4s,v30.4s + ldp x10,x12,[x1,#16] + add v19.4s,v19.4s,v27.4s + add x13,x13,x14,lsl#32 + add v23.4s,v23.4s,v28.4s + add x15,x15,x16,lsl#32 + add v1.4s,v1.4s,v25.4s + ldp x14,x16,[x1,#32] + add v5.4s,v5.4s,v25.4s + add x17,x17,x19,lsl#32 + add v9.4s,v9.4s,v25.4s + add x20,x20,x21,lsl#32 + add v13.4s,v13.4s,v25.4s + ldp x19,x21,[x1,#48] + add v17.4s,v17.4s,v25.4s + add x1,x1,#64 + add v21.4s,v21.4s,v25.4s + +#ifdef __ARMEB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor v0.16b,v0.16b,v24.16b + eor x15,x15,x16 + eor v1.16b,v1.16b,v25.16b + eor x17,x17,x19 + eor v2.16b,v2.16b,v26.16b + eor x20,x20,x21 + eor v3.16b,v3.16b,v27.16b + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#7 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + eor v4.16b,v4.16b,v24.16b + eor v5.16b,v5.16b,v25.16b + eor v6.16b,v6.16b,v26.16b + eor v7.16b,v7.16b,v27.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + eor v8.16b,v8.16b,v0.16b + ldp q24,q25,[sp,#0] + eor v9.16b,v9.16b,v1.16b + ldp q26,q27,[sp,#32] + eor v10.16b,v10.16b,v2.16b + eor v11.16b,v11.16b,v3.16b + st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 + + ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 + eor v12.16b,v12.16b,v4.16b + eor v13.16b,v13.16b,v5.16b + eor v14.16b,v14.16b,v6.16b + eor v15.16b,v15.16b,v7.16b + st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 + + ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 + eor v16.16b,v16.16b,v8.16b + eor v17.16b,v17.16b,v9.16b + eor v18.16b,v18.16b,v10.16b + eor v19.16b,v19.16b,v11.16b + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + + shl v0.4s,v31.4s,#1 // 4 -> 8 + eor v20.16b,v20.16b,v12.16b + eor v21.16b,v21.16b,v13.16b + eor v22.16b,v22.16b,v14.16b + eor v23.16b,v23.16b,v15.16b + st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 + + add 
v27.4s,v27.4s,v0.4s // += 8 + add v28.4s,v28.4s,v0.4s + add v29.4s,v29.4s,v0.4s + add v30.4s,v30.4s,v0.4s + + b.hs .Loop_outer_512_neon + + adds x2,x2,#512 + ushr v0.4s,v31.4s,#2 // 4 -> 1 + + ldp d8,d9,[sp,#128+0] // meet ABI requirements + ldp d10,d11,[sp,#128+16] + ldp d12,d13,[sp,#128+32] + ldp d14,d15,[sp,#128+48] + + stp q24,q31,[sp,#0] // wipe off-load area + stp q24,q31,[sp,#32] + stp q24,q31,[sp,#64] + + b.eq .Ldone_512_neon + + cmp x2,#192 + sub v27.4s,v27.4s,v0.4s // -= 1 + sub v28.4s,v28.4s,v0.4s + sub v29.4s,v29.4s,v0.4s + add sp,sp,#128 + b.hs .Loop_outer_neon + + eor v25.16b,v25.16b,v25.16b + eor v26.16b,v26.16b,v26.16b + eor v27.16b,v27.16b,v27.16b + eor v28.16b,v28.16b,v28.16b + eor v29.16b,v29.16b,v29.16b + eor v30.16b,v30.16b,v30.16b + b .Loop_outer + +.Ldone_512_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#128+64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + ret +.size ChaCha20_512_neon,.-ChaCha20_512_neon +#endif diff --git a/packager/third_party/boringssl/linux-aarch64/crypto/modes/ghashv8-armx64.S b/packager/third_party/boringssl/linux-aarch64/crypto/modes/ghashv8-armx64.S index a0a9b6807a..f39f3ba870 100644 --- a/packager/third_party/boringssl/linux-aarch64/crypto/modes/ghashv8-armx64.S +++ b/packager/third_party/boringssl/linux-aarch64/crypto/modes/ghashv8-armx64.S @@ -1,11 +1,12 @@ #if defined(__aarch64__) -#include "arm_arch.h" +#include .text #if !defined(__clang__) .arch armv8-a+crypto #endif .globl gcm_init_v8 +.hidden gcm_init_v8 .type gcm_init_v8,%function .align 4 gcm_init_v8: @@ -56,6 +57,7 @@ gcm_init_v8: ret .size gcm_init_v8,.-gcm_init_v8 .globl gcm_gmult_v8 +.hidden gcm_gmult_v8 .type gcm_gmult_v8,%function .align 4 gcm_gmult_v8: @@ -68,10 +70,10 @@ gcm_gmult_v8: #endif ext v3.16b,v17.16b,v17.16b,#8 - pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo + pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing - pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi - pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) + pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi + pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing eor v18.16b,v0.16b,v2.16b @@ -97,6 +99,7 @@ gcm_gmult_v8: ret .size gcm_gmult_v8,.-gcm_gmult_v8 .globl gcm_ghash_v8 +.hidden gcm_ghash_v8 .type gcm_ghash_v8,%function .align 4 gcm_ghash_v8: @@ -135,7 +138,7 @@ gcm_ghash_v8: #endif ext v7.16b,v17.16b,v17.16b,#8 eor v3.16b,v3.16b,v0.16b //I[i]^=Xi - pmull v4.1q,v20.1d,v7.1d //H·Ii+1 + pmull v4.1q,v20.1d,v7.1d //H·Ii+1 eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing pmull2 v6.1q,v20.2d,v7.2d b .Loop_mod2x_v8 @@ -144,14 +147,14 @@ gcm_ghash_v8: .Loop_mod2x_v8: ext v18.16b,v3.16b,v3.16b,#8 subs x3,x3,#32 //is there more data? - pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo + pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo csel x12,xzr,x12,lo //is it time to zero x12? 
pmull v5.1q,v21.1d,v17.1d eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing - pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi + pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi eor v0.16b,v0.16b,v4.16b //accumulate - pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) + pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2] eor v2.16b,v2.16b,v6.16b @@ -176,7 +179,7 @@ gcm_ghash_v8: ext v7.16b,v17.16b,v17.16b,#8 ext v3.16b,v16.16b,v16.16b,#8 eor v0.16b,v1.16b,v18.16b - pmull v4.1q,v20.1d,v7.1d //H·Ii+1 + pmull v4.1q,v20.1d,v7.1d //H·Ii+1 eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction @@ -197,10 +200,10 @@ gcm_ghash_v8: eor v3.16b,v3.16b,v0.16b //inp^=Xi eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi - pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo + pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing - pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi - pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) + pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi + pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing eor v18.16b,v0.16b,v2.16b @@ -229,4 +232,4 @@ gcm_ghash_v8: .byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 -#endif \ No newline at end of file +#endif diff --git a/packager/third_party/boringssl/linux-aarch64/crypto/sha/sha1-armv8.S b/packager/third_party/boringssl/linux-aarch64/crypto/sha/sha1-armv8.S index 487a497da2..cfb4aa021f 100644 --- a/packager/third_party/boringssl/linux-aarch64/crypto/sha/sha1-armv8.S +++ b/packager/third_party/boringssl/linux-aarch64/crypto/sha/sha1-armv8.S @@ -1,10 +1,11 @@ #if defined(__aarch64__) -#include "arm_arch.h" +#include .text .globl sha1_block_data_order +.hidden sha1_block_data_order .type sha1_block_data_order,%function .align 6 sha1_block_data_order: @@ -1212,4 +1213,4 @@ sha1_block_armv8: .align 2 .align 2 .comm OPENSSL_armcap_P,4,4 -#endif \ No newline at end of file +#endif diff --git a/packager/third_party/boringssl/linux-aarch64/crypto/sha/sha256-armv8.S b/packager/third_party/boringssl/linux-aarch64/crypto/sha/sha256-armv8.S index 4834553dc1..bfc552cbf1 100644 --- a/packager/third_party/boringssl/linux-aarch64/crypto/sha/sha256-armv8.S +++ b/packager/third_party/boringssl/linux-aarch64/crypto/sha/sha256-armv8.S @@ -1,10 +1,11 @@ #if defined(__aarch64__) -#include "arm_arch.h" +#include .text .globl sha256_block_data_order +.hidden sha256_block_data_order .type sha256_block_data_order,%function .align 6 sha256_block_data_order: @@ -1142,4 +1143,4 @@ sha256_block_armv8: ret .size sha256_block_armv8,.-sha256_block_armv8 .comm OPENSSL_armcap_P,4,4 -#endif \ No newline at end of file +#endif diff --git a/packager/third_party/boringssl/linux-aarch64/crypto/sha/sha512-armv8.S b/packager/third_party/boringssl/linux-aarch64/crypto/sha/sha512-armv8.S index 654c473f27..4645722923 100644 --- a/packager/third_party/boringssl/linux-aarch64/crypto/sha/sha512-armv8.S +++ b/packager/third_party/boringssl/linux-aarch64/crypto/sha/sha512-armv8.S @@ -1,10 +1,11 @@ #if defined(__aarch64__) -#include "arm_arch.h" +#include .text .globl sha512_block_data_order +.hidden sha512_block_data_order .type sha512_block_data_order,%function .align 6 sha512_block_data_order: @@ -1022,4 +1023,4 @@ sha512_block_data_order: .align 2 .align 2 .comm 
OPENSSL_armcap_P,4,4 -#endif \ No newline at end of file +#endif diff --git a/packager/third_party/boringssl/linux-arm/crypto/aes/aes-armv4.S b/packager/third_party/boringssl/linux-arm/crypto/aes/aes-armv4.S index cb94841e02..bc11e3f139 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/aes/aes-armv4.S +++ b/packager/third_party/boringssl/linux-arm/crypto/aes/aes-armv4.S @@ -34,7 +34,7 @@ #if defined(__arm__) #ifndef __KERNEL__ -# include "arm_arch.h" +# include #else # define __ARM_ARCH__ __LINUX_ARM_ARCH__ #endif @@ -1197,4 +1197,4 @@ _armv4_AES_decrypt: .align 2 #endif -#endif \ No newline at end of file +#endif diff --git a/packager/third_party/boringssl/linux-arm/crypto/aes/aesv8-armx32.S b/packager/third_party/boringssl/linux-arm/crypto/aes/aesv8-armx32.S index 6f0ee7d81d..95a2ea4dc9 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/aes/aesv8-armx32.S +++ b/packager/third_party/boringssl/linux-arm/crypto/aes/aesv8-armx32.S @@ -1,5 +1,5 @@ #if defined(__arm__) -#include "arm_arch.h" +#include #if __ARM_MAX_ARCH__>=7 .text @@ -13,6 +13,7 @@ .long 0x1b,0x1b,0x1b,0x1b .globl aes_v8_set_encrypt_key +.hidden aes_v8_set_encrypt_key .type aes_v8_set_encrypt_key,%function .align 5 aes_v8_set_encrypt_key: @@ -183,6 +184,7 @@ aes_v8_set_encrypt_key: .size aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key .globl aes_v8_set_decrypt_key +.hidden aes_v8_set_decrypt_key .type aes_v8_set_decrypt_key,%function .align 5 aes_v8_set_decrypt_key: @@ -220,6 +222,7 @@ aes_v8_set_decrypt_key: ldmia sp!,{r4,pc} .size aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key .globl aes_v8_encrypt +.hidden aes_v8_encrypt .type aes_v8_encrypt,%function .align 5 aes_v8_encrypt: @@ -249,6 +252,7 @@ aes_v8_encrypt: bx lr .size aes_v8_encrypt,.-aes_v8_encrypt .globl aes_v8_decrypt +.hidden aes_v8_decrypt .type aes_v8_decrypt,%function .align 5 aes_v8_decrypt: @@ -278,6 +282,7 @@ aes_v8_decrypt: bx lr .size aes_v8_decrypt,.-aes_v8_decrypt .globl aes_v8_cbc_encrypt +.hidden aes_v8_cbc_encrypt .type aes_v8_cbc_encrypt,%function .align 5 aes_v8_cbc_encrypt: @@ -570,6 +575,7 @@ aes_v8_cbc_encrypt: ldmia sp!,{r4,r5,r6,r7,r8,pc} .size aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt .globl aes_v8_ctr32_encrypt_blocks +.hidden aes_v8_ctr32_encrypt_blocks .type aes_v8_ctr32_encrypt_blocks,%function .align 5 aes_v8_ctr32_encrypt_blocks: @@ -753,4 +759,4 @@ aes_v8_ctr32_encrypt_blocks: ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,pc} .size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks #endif -#endif \ No newline at end of file +#endif diff --git a/packager/third_party/boringssl/linux-arm/crypto/aes/bsaes-armv7.S b/packager/third_party/boringssl/linux-arm/crypto/aes/bsaes-armv7.S index dd84f3589d..abb414d549 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/aes/bsaes-armv7.S +++ b/packager/third_party/boringssl/linux-arm/crypto/aes/bsaes-armv7.S @@ -47,9 +47,8 @@ @ @ -#if defined(__arm__) #ifndef __KERNEL__ -# include "arm_arch.h" +# include # define VFP_ABI_PUSH vstmdb sp!,{d8-d15} # define VFP_ABI_POP vldmia sp!,{d8-d15} @@ -2576,4 +2575,3 @@ bsaes_xts_decrypt: .size bsaes_xts_decrypt,.-bsaes_xts_decrypt #endif #endif -#endif \ No newline at end of file diff --git a/packager/third_party/boringssl/linux-arm/crypto/bn/armv4-mont.S b/packager/third_party/boringssl/linux-arm/crypto/bn/armv4-mont.S index 68dfb2c172..e59599f832 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/bn/armv4-mont.S +++ b/packager/third_party/boringssl/linux-arm/crypto/bn/armv4-mont.S @@ -1,5 +1,5 @@ #if defined(__arm__) -#include 
"arm_arch.h" +#include .text .code 32 @@ -28,7 +28,7 @@ bn_mul_mont: #ifdef __APPLE__ ldr r0,[r0] #endif - tst r0,#1 @ NEON available? + tst r0,#ARMV7_NEON @ NEON available? ldmia sp, {r0,r2} beq .Lialu add sp,sp,#8 @@ -586,4 +586,4 @@ bn_mul8x_mont_neon: .comm OPENSSL_armcap_P,4,4 .hidden OPENSSL_armcap_P #endif -#endif \ No newline at end of file +#endif diff --git a/packager/third_party/boringssl/linux-arm/crypto/chacha/chacha-armv4.S b/packager/third_party/boringssl/linux-arm/crypto/chacha/chacha-armv4.S new file mode 100644 index 0000000000..19a4d2c4ff --- /dev/null +++ b/packager/third_party/boringssl/linux-arm/crypto/chacha/chacha-armv4.S @@ -0,0 +1,1471 @@ +#if defined(__arm__) +#include + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +#if defined(__thumb2__) || defined(__clang__) +#define ldrhsb ldrbhs +#endif + +.align 5 +.Lsigma: +.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral +.Lone: +.long 1,0,0,0 +#if __ARM_MAX_ARCH__>=7 +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-.LChaCha20_ctr32 +#else +.word -1 +#endif + +.globl ChaCha20_ctr32 +.hidden ChaCha20_ctr32 +.type ChaCha20_ctr32,%function +.align 5 +ChaCha20_ctr32: +.LChaCha20_ctr32: + ldr r12,[sp,#0] @ pull pointer to counter and nonce + stmdb sp!,{r0,r1,r2,r4-r11,lr} +#if __ARM_ARCH__<7 && !defined(__thumb2__) + sub r14,pc,#16 @ ChaCha20_ctr32 +#else + adr r14,.LChaCha20_ctr32 +#endif + cmp r2,#0 @ len==0? +#ifdef __thumb2__ + itt eq +#endif + addeq sp,sp,#4*3 + beq .Lno_data +#if __ARM_MAX_ARCH__>=7 + cmp r2,#192 @ test len + bls .Lshort + ldr r4,[r14,#-32] + ldr r4,[r14,r4] +# ifdef __APPLE__ + ldr r4,[r4] +# endif + tst r4,#ARMV7_NEON + bne .LChaCha20_neon +.Lshort: +#endif + ldmia r12,{r4,r5,r6,r7} @ load counter and nonce + sub sp,sp,#4*(16) @ off-load area + sub r14,r14,#64 @ .Lsigma + stmdb sp!,{r4,r5,r6,r7} @ copy counter and nonce + ldmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} @ load key + ldmia r14,{r0,r1,r2,r3} @ load sigma + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} @ copy key + stmdb sp!,{r0,r1,r2,r3} @ copy sigma + str r10,[sp,#4*(16+10)] @ off-load "rx" + str r11,[sp,#4*(16+11)] @ off-load "rx" + b .Loop_outer_enter + +.align 4 +.Loop_outer: + ldmia sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ load key material + str r11,[sp,#4*(32+2)] @ save len + str r12, [sp,#4*(32+1)] @ save inp + str r14, [sp,#4*(32+0)] @ save out +.Loop_outer_enter: + ldr r11, [sp,#4*(15)] + ldr r12,[sp,#4*(12)] @ modulo-scheduled load + ldr r10, [sp,#4*(13)] + ldr r14,[sp,#4*(14)] + str r11, [sp,#4*(16+15)] + mov r11,#10 + b .Loop + +.align 4 +.Loop: + subs r11,r11,#1 + add r0,r0,r4 + mov r12,r12,ror#16 + add r1,r1,r5 + mov r10,r10,ror#16 + eor r12,r12,r0,ror#16 + eor r10,r10,r1,ror#16 + add r8,r8,r12 + mov r4,r4,ror#20 + add r9,r9,r10 + mov r5,r5,ror#20 + eor r4,r4,r8,ror#20 + eor r5,r5,r9,ror#20 + add r0,r0,r4 + mov r12,r12,ror#24 + add r1,r1,r5 + mov r10,r10,ror#24 + eor r12,r12,r0,ror#24 + eor r10,r10,r1,ror#24 + add r8,r8,r12 + mov r4,r4,ror#25 + add r9,r9,r10 + mov r5,r5,ror#25 + str r10,[sp,#4*(16+13)] + ldr r10,[sp,#4*(16+15)] + eor r4,r4,r8,ror#25 + eor r5,r5,r9,ror#25 + str r8,[sp,#4*(16+8)] + ldr r8,[sp,#4*(16+10)] + add r2,r2,r6 + mov r14,r14,ror#16 + str r9,[sp,#4*(16+9)] + ldr r9,[sp,#4*(16+11)] + add r3,r3,r7 + mov r10,r10,ror#16 + eor r14,r14,r2,ror#16 + eor r10,r10,r3,ror#16 + add r8,r8,r14 + mov r6,r6,ror#20 + add r9,r9,r10 + mov r7,r7,ror#20 + eor r6,r6,r8,ror#20 + eor r7,r7,r9,ror#20 + add r2,r2,r6 + mov r14,r14,ror#24 + add r3,r3,r7 + mov r10,r10,ror#24 + eor r14,r14,r2,ror#24 + eor 
r10,r10,r3,ror#24 + add r8,r8,r14 + mov r6,r6,ror#25 + add r9,r9,r10 + mov r7,r7,ror#25 + eor r6,r6,r8,ror#25 + eor r7,r7,r9,ror#25 + add r0,r0,r5 + mov r10,r10,ror#16 + add r1,r1,r6 + mov r12,r12,ror#16 + eor r10,r10,r0,ror#16 + eor r12,r12,r1,ror#16 + add r8,r8,r10 + mov r5,r5,ror#20 + add r9,r9,r12 + mov r6,r6,ror#20 + eor r5,r5,r8,ror#20 + eor r6,r6,r9,ror#20 + add r0,r0,r5 + mov r10,r10,ror#24 + add r1,r1,r6 + mov r12,r12,ror#24 + eor r10,r10,r0,ror#24 + eor r12,r12,r1,ror#24 + add r8,r8,r10 + mov r5,r5,ror#25 + str r10,[sp,#4*(16+15)] + ldr r10,[sp,#4*(16+13)] + add r9,r9,r12 + mov r6,r6,ror#25 + eor r5,r5,r8,ror#25 + eor r6,r6,r9,ror#25 + str r8,[sp,#4*(16+10)] + ldr r8,[sp,#4*(16+8)] + add r2,r2,r7 + mov r10,r10,ror#16 + str r9,[sp,#4*(16+11)] + ldr r9,[sp,#4*(16+9)] + add r3,r3,r4 + mov r14,r14,ror#16 + eor r10,r10,r2,ror#16 + eor r14,r14,r3,ror#16 + add r8,r8,r10 + mov r7,r7,ror#20 + add r9,r9,r14 + mov r4,r4,ror#20 + eor r7,r7,r8,ror#20 + eor r4,r4,r9,ror#20 + add r2,r2,r7 + mov r10,r10,ror#24 + add r3,r3,r4 + mov r14,r14,ror#24 + eor r10,r10,r2,ror#24 + eor r14,r14,r3,ror#24 + add r8,r8,r10 + mov r7,r7,ror#25 + add r9,r9,r14 + mov r4,r4,ror#25 + eor r7,r7,r8,ror#25 + eor r4,r4,r9,ror#25 + bne .Loop + + ldr r11,[sp,#4*(32+2)] @ load len + + str r8, [sp,#4*(16+8)] @ modulo-scheduled store + str r9, [sp,#4*(16+9)] + str r12,[sp,#4*(16+12)] + str r10, [sp,#4*(16+13)] + str r14,[sp,#4*(16+14)] + + @ at this point we have first half of 512-bit result in + @ rx and second half at sp+4*(16+8) + + cmp r11,#64 @ done yet? +#ifdef __thumb2__ + itete lo +#endif + addlo r12,sp,#4*(0) @ shortcut or ... + ldrhs r12,[sp,#4*(32+1)] @ ... load inp + addlo r14,sp,#4*(0) @ shortcut or ... + ldrhs r14,[sp,#4*(32+0)] @ ... load out + + ldr r8,[sp,#4*(0)] @ load key material + ldr r9,[sp,#4*(1)] + +#if __ARM_ARCH__>=6 || !defined(__ARMEB__) +# if __ARM_ARCH__<7 + orr r10,r12,r14 + tst r10,#3 @ are input and output aligned? 
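The `tst r10,#3` check just above is why this scalar path forks: on pre-ARMv7 cores unaligned word accesses are unsafe, so the integer code XORs the keystream a word at a time only when both the input and output pointers are 4-byte aligned, and otherwise takes the byte-by-byte `.Lunaligned` path below. A minimal C sketch of the same dispatch (illustrative only; `xor_stream` and the flat keystream buffer are my assumptions, not functions from this patch):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* XOR len bytes of keystream into out, taking the word-wise path
     * only when in and out are both 4-byte aligned -- the C analogue
     * of `orr r10,r12,r14; tst r10,#3`. */
    static void xor_stream(uint8_t *out, const uint8_t *in,
                           const uint8_t *ks, size_t len) {
      size_t i = 0;
      if ((((uintptr_t)out | (uintptr_t)in) & 3) == 0) {
        for (; i + 4 <= len; i += 4) {
          uint32_t w, k;
          memcpy(&w, in + i, 4);   /* aligned: one word load */
          memcpy(&k, ks + i, 4);
          w ^= k;
          memcpy(out + i, &w, 4);
        }
      }
      for (; i < len; i++)         /* .Lunaligned equivalent */
        out[i] = in[i] ^ ks[i];
    }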
+ ldr r10,[sp,#4*(2)] + bne .Lunaligned + cmp r11,#64 @ restore flags +# else + ldr r10,[sp,#4*(2)] +# endif + ldr r11,[sp,#4*(3)] + + add r0,r0,r8 @ accumulate key material + add r1,r1,r9 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r8,[r12],#16 @ load input + ldrhs r9,[r12,#-12] + + add r2,r2,r10 + add r3,r3,r11 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r10,[r12,#-8] + ldrhs r11,[r12,#-4] +# if __ARM_ARCH__>=6 && defined(__ARMEB__) + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs r0,r0,r8 @ xor with input + eorhs r1,r1,r9 + add r8,sp,#4*(4) + str r0,[r14],#16 @ store output +# ifdef __thumb2__ + itt hs +# endif + eorhs r2,r2,r10 + eorhs r3,r3,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + str r1,[r14,#-12] + str r2,[r14,#-8] + str r3,[r14,#-4] + + add r4,r4,r8 @ accumulate key material + add r5,r5,r9 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r8,[r12],#16 @ load input + ldrhs r9,[r12,#-12] + add r6,r6,r10 + add r7,r7,r11 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r10,[r12,#-8] + ldrhs r11,[r12,#-4] +# if __ARM_ARCH__>=6 && defined(__ARMEB__) + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs r4,r4,r8 + eorhs r5,r5,r9 + add r8,sp,#4*(8) + str r4,[r14],#16 @ store output +# ifdef __thumb2__ + itt hs +# endif + eorhs r6,r6,r10 + eorhs r7,r7,r11 + str r5,[r14,#-12] + ldmia r8,{r8,r9,r10,r11} @ load key material + str r6,[r14,#-8] + add r0,sp,#4*(16+8) + str r7,[r14,#-4] + + ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half + + add r0,r0,r8 @ accumulate key material + add r1,r1,r9 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r8,[r12],#16 @ load input + ldrhs r9,[r12,#-12] +# ifdef __thumb2__ + itt hi +# endif + strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it + strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it + add r2,r2,r10 + add r3,r3,r11 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r10,[r12,#-8] + ldrhs r11,[r12,#-4] +# if __ARM_ARCH__>=6 && defined(__ARMEB__) + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs r0,r0,r8 + eorhs r1,r1,r9 + add r8,sp,#4*(12) + str r0,[r14],#16 @ store output +# ifdef __thumb2__ + itt hs +# endif + eorhs r2,r2,r10 + eorhs r3,r3,r11 + str r1,[r14,#-12] + ldmia r8,{r8,r9,r10,r11} @ load key material + str r2,[r14,#-8] + str r3,[r14,#-4] + + add r4,r4,r8 @ accumulate key material + add r5,r5,r9 +# ifdef __thumb2__ + itt hi +# endif + addhi r8,r8,#1 @ next counter value + strhi r8,[sp,#4*(12)] @ save next counter value +# ifdef __thumb2__ + itt hs +# endif + ldrhs r8,[r12],#16 @ load input + ldrhs r9,[r12,#-12] + add r6,r6,r10 + add r7,r7,r11 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r10,[r12,#-8] + ldrhs r11,[r12,#-4] +# if __ARM_ARCH__>=6 && defined(__ARMEB__) + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs r4,r4,r8 + eorhs r5,r5,r9 +# ifdef __thumb2__ + it ne +# endif + ldrne r8,[sp,#4*(32+2)] @ re-load len +# ifdef __thumb2__ + itt hs +# endif + eorhs r6,r6,r10 + eorhs r7,r7,r11 + str r4,[r14],#16 @ store output + str r5,[r14,#-12] +# ifdef __thumb2__ + it hs +# endif + subhs r11,r8,#64 @ len-=64 + str r6,[r14,#-8] + str r7,[r14,#-4] + bhi .Loop_outer + + beq .Ldone +# if __ARM_ARCH__<7 + b .Ltail + +.align 4 +.Lunaligned:@ unaligned endian-neutral path + cmp r11,#64 @ restore flags +# endif +#endif +#if __ARM_ARCH__<7 + ldr r11,[sp,#4*(3)] + add r0,r0,r8 @ accumulate key material + add r1,r1,r9 + add r2,r2,r10 +# ifdef 
__thumb2__ + itete lo +# endif + eorlo r8,r8,r8 @ zero or ... + ldrhsb r8,[r12],#16 @ ... load input + eorlo r9,r9,r9 + ldrhsb r9,[r12,#-12] + + add r3,r3,r11 +# ifdef __thumb2__ + itete lo +# endif + eorlo r10,r10,r10 + ldrhsb r10,[r12,#-8] + eorlo r11,r11,r11 + ldrhsb r11,[r12,#-4] + + eor r0,r8,r0 @ xor with input (or zero) + eor r1,r9,r1 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-15] @ load more input + ldrhsb r9,[r12,#-11] + eor r2,r10,r2 + strb r0,[r14],#16 @ store output + eor r3,r11,r3 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-7] + ldrhsb r11,[r12,#-3] + strb r1,[r14,#-12] + eor r0,r8,r0,lsr#8 + strb r2,[r14,#-8] + eor r1,r9,r1,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-14] @ load more input + ldrhsb r9,[r12,#-10] + strb r3,[r14,#-4] + eor r2,r10,r2,lsr#8 + strb r0,[r14,#-15] + eor r3,r11,r3,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-6] + ldrhsb r11,[r12,#-2] + strb r1,[r14,#-11] + eor r0,r8,r0,lsr#8 + strb r2,[r14,#-7] + eor r1,r9,r1,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-13] @ load more input + ldrhsb r9,[r12,#-9] + strb r3,[r14,#-3] + eor r2,r10,r2,lsr#8 + strb r0,[r14,#-14] + eor r3,r11,r3,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-5] + ldrhsb r11,[r12,#-1] + strb r1,[r14,#-10] + strb r2,[r14,#-6] + eor r0,r8,r0,lsr#8 + strb r3,[r14,#-2] + eor r1,r9,r1,lsr#8 + strb r0,[r14,#-13] + eor r2,r10,r2,lsr#8 + strb r1,[r14,#-9] + eor r3,r11,r3,lsr#8 + strb r2,[r14,#-5] + strb r3,[r14,#-1] + add r8,sp,#4*(4+0) + ldmia r8,{r8,r9,r10,r11} @ load key material + add r0,sp,#4*(16+8) + add r4,r4,r8 @ accumulate key material + add r5,r5,r9 + add r6,r6,r10 +# ifdef __thumb2__ + itete lo +# endif + eorlo r8,r8,r8 @ zero or ... + ldrhsb r8,[r12],#16 @ ... 
load input + eorlo r9,r9,r9 + ldrhsb r9,[r12,#-12] + + add r7,r7,r11 +# ifdef __thumb2__ + itete lo +# endif + eorlo r10,r10,r10 + ldrhsb r10,[r12,#-8] + eorlo r11,r11,r11 + ldrhsb r11,[r12,#-4] + + eor r4,r8,r4 @ xor with input (or zero) + eor r5,r9,r5 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-15] @ load more input + ldrhsb r9,[r12,#-11] + eor r6,r10,r6 + strb r4,[r14],#16 @ store output + eor r7,r11,r7 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-7] + ldrhsb r11,[r12,#-3] + strb r5,[r14,#-12] + eor r4,r8,r4,lsr#8 + strb r6,[r14,#-8] + eor r5,r9,r5,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-14] @ load more input + ldrhsb r9,[r12,#-10] + strb r7,[r14,#-4] + eor r6,r10,r6,lsr#8 + strb r4,[r14,#-15] + eor r7,r11,r7,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-6] + ldrhsb r11,[r12,#-2] + strb r5,[r14,#-11] + eor r4,r8,r4,lsr#8 + strb r6,[r14,#-7] + eor r5,r9,r5,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-13] @ load more input + ldrhsb r9,[r12,#-9] + strb r7,[r14,#-3] + eor r6,r10,r6,lsr#8 + strb r4,[r14,#-14] + eor r7,r11,r7,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-5] + ldrhsb r11,[r12,#-1] + strb r5,[r14,#-10] + strb r6,[r14,#-6] + eor r4,r8,r4,lsr#8 + strb r7,[r14,#-2] + eor r5,r9,r5,lsr#8 + strb r4,[r14,#-13] + eor r6,r10,r6,lsr#8 + strb r5,[r14,#-9] + eor r7,r11,r7,lsr#8 + strb r6,[r14,#-5] + strb r7,[r14,#-1] + add r8,sp,#4*(4+4) + ldmia r8,{r8,r9,r10,r11} @ load key material + ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half +# ifdef __thumb2__ + itt hi +# endif + strhi r10,[sp,#4*(16+10)] @ copy "rx" + strhi r11,[sp,#4*(16+11)] @ copy "rx" + add r0,r0,r8 @ accumulate key material + add r1,r1,r9 + add r2,r2,r10 +# ifdef __thumb2__ + itete lo +# endif + eorlo r8,r8,r8 @ zero or ... + ldrhsb r8,[r12],#16 @ ... 
load input + eorlo r9,r9,r9 + ldrhsb r9,[r12,#-12] + + add r3,r3,r11 +# ifdef __thumb2__ + itete lo +# endif + eorlo r10,r10,r10 + ldrhsb r10,[r12,#-8] + eorlo r11,r11,r11 + ldrhsb r11,[r12,#-4] + + eor r0,r8,r0 @ xor with input (or zero) + eor r1,r9,r1 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-15] @ load more input + ldrhsb r9,[r12,#-11] + eor r2,r10,r2 + strb r0,[r14],#16 @ store output + eor r3,r11,r3 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-7] + ldrhsb r11,[r12,#-3] + strb r1,[r14,#-12] + eor r0,r8,r0,lsr#8 + strb r2,[r14,#-8] + eor r1,r9,r1,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-14] @ load more input + ldrhsb r9,[r12,#-10] + strb r3,[r14,#-4] + eor r2,r10,r2,lsr#8 + strb r0,[r14,#-15] + eor r3,r11,r3,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-6] + ldrhsb r11,[r12,#-2] + strb r1,[r14,#-11] + eor r0,r8,r0,lsr#8 + strb r2,[r14,#-7] + eor r1,r9,r1,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-13] @ load more input + ldrhsb r9,[r12,#-9] + strb r3,[r14,#-3] + eor r2,r10,r2,lsr#8 + strb r0,[r14,#-14] + eor r3,r11,r3,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-5] + ldrhsb r11,[r12,#-1] + strb r1,[r14,#-10] + strb r2,[r14,#-6] + eor r0,r8,r0,lsr#8 + strb r3,[r14,#-2] + eor r1,r9,r1,lsr#8 + strb r0,[r14,#-13] + eor r2,r10,r2,lsr#8 + strb r1,[r14,#-9] + eor r3,r11,r3,lsr#8 + strb r2,[r14,#-5] + strb r3,[r14,#-1] + add r8,sp,#4*(4+8) + ldmia r8,{r8,r9,r10,r11} @ load key material + add r4,r4,r8 @ accumulate key material +# ifdef __thumb2__ + itt hi +# endif + addhi r8,r8,#1 @ next counter value + strhi r8,[sp,#4*(12)] @ save next counter value + add r5,r5,r9 + add r6,r6,r10 +# ifdef __thumb2__ + itete lo +# endif + eorlo r8,r8,r8 @ zero or ... + ldrhsb r8,[r12],#16 @ ... 
load input + eorlo r9,r9,r9 + ldrhsb r9,[r12,#-12] + + add r7,r7,r11 +# ifdef __thumb2__ + itete lo +# endif + eorlo r10,r10,r10 + ldrhsb r10,[r12,#-8] + eorlo r11,r11,r11 + ldrhsb r11,[r12,#-4] + + eor r4,r8,r4 @ xor with input (or zero) + eor r5,r9,r5 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-15] @ load more input + ldrhsb r9,[r12,#-11] + eor r6,r10,r6 + strb r4,[r14],#16 @ store output + eor r7,r11,r7 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-7] + ldrhsb r11,[r12,#-3] + strb r5,[r14,#-12] + eor r4,r8,r4,lsr#8 + strb r6,[r14,#-8] + eor r5,r9,r5,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-14] @ load more input + ldrhsb r9,[r12,#-10] + strb r7,[r14,#-4] + eor r6,r10,r6,lsr#8 + strb r4,[r14,#-15] + eor r7,r11,r7,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-6] + ldrhsb r11,[r12,#-2] + strb r5,[r14,#-11] + eor r4,r8,r4,lsr#8 + strb r6,[r14,#-7] + eor r5,r9,r5,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-13] @ load more input + ldrhsb r9,[r12,#-9] + strb r7,[r14,#-3] + eor r6,r10,r6,lsr#8 + strb r4,[r14,#-14] + eor r7,r11,r7,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-5] + ldrhsb r11,[r12,#-1] + strb r5,[r14,#-10] + strb r6,[r14,#-6] + eor r4,r8,r4,lsr#8 + strb r7,[r14,#-2] + eor r5,r9,r5,lsr#8 + strb r4,[r14,#-13] + eor r6,r10,r6,lsr#8 + strb r5,[r14,#-9] + eor r7,r11,r7,lsr#8 + strb r6,[r14,#-5] + strb r7,[r14,#-1] +# ifdef __thumb2__ + it ne +# endif + ldrne r8,[sp,#4*(32+2)] @ re-load len +# ifdef __thumb2__ + it hs +# endif + subhs r11,r8,#64 @ len-=64 + bhi .Loop_outer + + beq .Ldone +#endif + +.Ltail: + ldr r12,[sp,#4*(32+1)] @ load inp + add r9,sp,#4*(0) + ldr r14,[sp,#4*(32+0)] @ load out + +.Loop_tail: + ldrb r10,[r9],#1 @ read buffer on stack + ldrb r11,[r12],#1 @ read input + subs r8,r8,#1 + eor r11,r11,r10 + strb r11,[r14],#1 @ store output + bne .Loop_tail + +.Ldone: + add sp,sp,#4*(32+3) +.Lno_data: + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} +.size ChaCha20_ctr32,.-ChaCha20_ctr32 +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.type ChaCha20_neon,%function +.align 5 +ChaCha20_neon: + ldr r12,[sp,#0] @ pull pointer to counter and nonce + stmdb sp!,{r0,r1,r2,r4-r11,lr} +.LChaCha20_neon: + adr r14,.Lsigma + vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI spec says so + stmdb sp!,{r0,r1,r2,r3} + + vld1.32 {q1,q2},[r3] @ load key + ldmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} @ load key + + sub sp,sp,#4*(16+16) + vld1.32 {q3},[r12] @ load counter and nonce + add r12,sp,#4*8 + ldmia r14,{r0,r1,r2,r3} @ load sigma + vld1.32 {q0},[r14]! 
@ load sigma + vld1.32 {q12},[r14] @ one + vst1.32 {q2,q3},[r12] @ copy 1/2key|counter|nonce + vst1.32 {q0,q1},[sp] @ copy sigma|1/2key + + str r10,[sp,#4*(16+10)] @ off-load "rx" + str r11,[sp,#4*(16+11)] @ off-load "rx" + vshl.i32 d26,d24,#1 @ two + vstr d24,[sp,#4*(16+0)] + vshl.i32 d28,d24,#2 @ four + vstr d26,[sp,#4*(16+2)] + vmov q4,q0 + vstr d28,[sp,#4*(16+4)] + vmov q8,q0 + vmov q5,q1 + vmov q9,q1 + b .Loop_neon_enter + +.align 4 +.Loop_neon_outer: + ldmia sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ load key material + cmp r11,#64*2 @ if len<=64*2 + bls .Lbreak_neon @ switch to integer-only + vmov q4,q0 + str r11,[sp,#4*(32+2)] @ save len + vmov q8,q0 + str r12, [sp,#4*(32+1)] @ save inp + vmov q5,q1 + str r14, [sp,#4*(32+0)] @ save out + vmov q9,q1 +.Loop_neon_enter: + ldr r11, [sp,#4*(15)] + vadd.i32 q7,q3,q12 @ counter+1 + ldr r12,[sp,#4*(12)] @ modulo-scheduled load + vmov q6,q2 + ldr r10, [sp,#4*(13)] + vmov q10,q2 + ldr r14,[sp,#4*(14)] + vadd.i32 q11,q7,q12 @ counter+2 + str r11, [sp,#4*(16+15)] + mov r11,#10 + add r12,r12,#3 @ counter+3 + b .Loop_neon + +.align 4 +.Loop_neon: + subs r11,r11,#1 + vadd.i32 q0,q0,q1 + add r0,r0,r4 + vadd.i32 q4,q4,q5 + mov r12,r12,ror#16 + vadd.i32 q8,q8,q9 + add r1,r1,r5 + veor q3,q3,q0 + mov r10,r10,ror#16 + veor q7,q7,q4 + eor r12,r12,r0,ror#16 + veor q11,q11,q8 + eor r10,r10,r1,ror#16 + vrev32.16 q3,q3 + add r8,r8,r12 + vrev32.16 q7,q7 + mov r4,r4,ror#20 + vrev32.16 q11,q11 + add r9,r9,r10 + vadd.i32 q2,q2,q3 + mov r5,r5,ror#20 + vadd.i32 q6,q6,q7 + eor r4,r4,r8,ror#20 + vadd.i32 q10,q10,q11 + eor r5,r5,r9,ror#20 + veor q12,q1,q2 + add r0,r0,r4 + veor q13,q5,q6 + mov r12,r12,ror#24 + veor q14,q9,q10 + add r1,r1,r5 + vshr.u32 q1,q12,#20 + mov r10,r10,ror#24 + vshr.u32 q5,q13,#20 + eor r12,r12,r0,ror#24 + vshr.u32 q9,q14,#20 + eor r10,r10,r1,ror#24 + vsli.32 q1,q12,#12 + add r8,r8,r12 + vsli.32 q5,q13,#12 + mov r4,r4,ror#25 + vsli.32 q9,q14,#12 + add r9,r9,r10 + vadd.i32 q0,q0,q1 + mov r5,r5,ror#25 + vadd.i32 q4,q4,q5 + str r10,[sp,#4*(16+13)] + vadd.i32 q8,q8,q9 + ldr r10,[sp,#4*(16+15)] + veor q12,q3,q0 + eor r4,r4,r8,ror#25 + veor q13,q7,q4 + eor r5,r5,r9,ror#25 + veor q14,q11,q8 + str r8,[sp,#4*(16+8)] + vshr.u32 q3,q12,#24 + ldr r8,[sp,#4*(16+10)] + vshr.u32 q7,q13,#24 + add r2,r2,r6 + vshr.u32 q11,q14,#24 + mov r14,r14,ror#16 + vsli.32 q3,q12,#8 + str r9,[sp,#4*(16+9)] + vsli.32 q7,q13,#8 + ldr r9,[sp,#4*(16+11)] + vsli.32 q11,q14,#8 + add r3,r3,r7 + vadd.i32 q2,q2,q3 + mov r10,r10,ror#16 + vadd.i32 q6,q6,q7 + eor r14,r14,r2,ror#16 + vadd.i32 q10,q10,q11 + eor r10,r10,r3,ror#16 + veor q12,q1,q2 + add r8,r8,r14 + veor q13,q5,q6 + mov r6,r6,ror#20 + veor q14,q9,q10 + add r9,r9,r10 + vshr.u32 q1,q12,#25 + mov r7,r7,ror#20 + vshr.u32 q5,q13,#25 + eor r6,r6,r8,ror#20 + vshr.u32 q9,q14,#25 + eor r7,r7,r9,ror#20 + vsli.32 q1,q12,#7 + add r2,r2,r6 + vsli.32 q5,q13,#7 + mov r14,r14,ror#24 + vsli.32 q9,q14,#7 + add r3,r3,r7 + vext.8 q2,q2,q2,#8 + mov r10,r10,ror#24 + vext.8 q6,q6,q6,#8 + eor r14,r14,r2,ror#24 + vext.8 q10,q10,q10,#8 + eor r10,r10,r3,ror#24 + vext.8 q1,q1,q1,#4 + add r8,r8,r14 + vext.8 q5,q5,q5,#4 + mov r6,r6,ror#25 + vext.8 q9,q9,q9,#4 + add r9,r9,r10 + vext.8 q3,q3,q3,#12 + mov r7,r7,ror#25 + vext.8 q7,q7,q7,#12 + eor r6,r6,r8,ror#25 + vext.8 q11,q11,q11,#12 + eor r7,r7,r9,ror#25 + vadd.i32 q0,q0,q1 + add r0,r0,r5 + vadd.i32 q4,q4,q5 + mov r10,r10,ror#16 + vadd.i32 q8,q8,q9 + add r1,r1,r6 + veor q3,q3,q0 + mov r12,r12,ror#16 + veor q7,q7,q4 + eor r10,r10,r0,ror#16 + veor q11,q11,q8 + eor r12,r12,r1,ror#16 + vrev32.16 q3,q3 + add 
r8,r8,r10 + vrev32.16 q7,q7 + mov r5,r5,ror#20 + vrev32.16 q11,q11 + add r9,r9,r12 + vadd.i32 q2,q2,q3 + mov r6,r6,ror#20 + vadd.i32 q6,q6,q7 + eor r5,r5,r8,ror#20 + vadd.i32 q10,q10,q11 + eor r6,r6,r9,ror#20 + veor q12,q1,q2 + add r0,r0,r5 + veor q13,q5,q6 + mov r10,r10,ror#24 + veor q14,q9,q10 + add r1,r1,r6 + vshr.u32 q1,q12,#20 + mov r12,r12,ror#24 + vshr.u32 q5,q13,#20 + eor r10,r10,r0,ror#24 + vshr.u32 q9,q14,#20 + eor r12,r12,r1,ror#24 + vsli.32 q1,q12,#12 + add r8,r8,r10 + vsli.32 q5,q13,#12 + mov r5,r5,ror#25 + vsli.32 q9,q14,#12 + str r10,[sp,#4*(16+15)] + vadd.i32 q0,q0,q1 + ldr r10,[sp,#4*(16+13)] + vadd.i32 q4,q4,q5 + add r9,r9,r12 + vadd.i32 q8,q8,q9 + mov r6,r6,ror#25 + veor q12,q3,q0 + eor r5,r5,r8,ror#25 + veor q13,q7,q4 + eor r6,r6,r9,ror#25 + veor q14,q11,q8 + str r8,[sp,#4*(16+10)] + vshr.u32 q3,q12,#24 + ldr r8,[sp,#4*(16+8)] + vshr.u32 q7,q13,#24 + add r2,r2,r7 + vshr.u32 q11,q14,#24 + mov r10,r10,ror#16 + vsli.32 q3,q12,#8 + str r9,[sp,#4*(16+11)] + vsli.32 q7,q13,#8 + ldr r9,[sp,#4*(16+9)] + vsli.32 q11,q14,#8 + add r3,r3,r4 + vadd.i32 q2,q2,q3 + mov r14,r14,ror#16 + vadd.i32 q6,q6,q7 + eor r10,r10,r2,ror#16 + vadd.i32 q10,q10,q11 + eor r14,r14,r3,ror#16 + veor q12,q1,q2 + add r8,r8,r10 + veor q13,q5,q6 + mov r7,r7,ror#20 + veor q14,q9,q10 + add r9,r9,r14 + vshr.u32 q1,q12,#25 + mov r4,r4,ror#20 + vshr.u32 q5,q13,#25 + eor r7,r7,r8,ror#20 + vshr.u32 q9,q14,#25 + eor r4,r4,r9,ror#20 + vsli.32 q1,q12,#7 + add r2,r2,r7 + vsli.32 q5,q13,#7 + mov r10,r10,ror#24 + vsli.32 q9,q14,#7 + add r3,r3,r4 + vext.8 q2,q2,q2,#8 + mov r14,r14,ror#24 + vext.8 q6,q6,q6,#8 + eor r10,r10,r2,ror#24 + vext.8 q10,q10,q10,#8 + eor r14,r14,r3,ror#24 + vext.8 q1,q1,q1,#12 + add r8,r8,r10 + vext.8 q5,q5,q5,#12 + mov r7,r7,ror#25 + vext.8 q9,q9,q9,#12 + add r9,r9,r14 + vext.8 q3,q3,q3,#4 + mov r4,r4,ror#25 + vext.8 q7,q7,q7,#4 + eor r7,r7,r8,ror#25 + vext.8 q11,q11,q11,#4 + eor r4,r4,r9,ror#25 + bne .Loop_neon + + add r11,sp,#32 + vld1.32 {q12,q13},[sp] @ load key material + vld1.32 {q14,q15},[r11] + + ldr r11,[sp,#4*(32+2)] @ load len + + str r8, [sp,#4*(16+8)] @ modulo-scheduled store + str r9, [sp,#4*(16+9)] + str r12,[sp,#4*(16+12)] + str r10, [sp,#4*(16+13)] + str r14,[sp,#4*(16+14)] + + @ at this point we have first half of 512-bit result in + @ rx and second half at sp+4*(16+8) + + ldr r12,[sp,#4*(32+1)] @ load inp + ldr r14,[sp,#4*(32+0)] @ load out + + vadd.i32 q0,q0,q12 @ accumulate key material + vadd.i32 q4,q4,q12 + vadd.i32 q8,q8,q12 + vldr d24,[sp,#4*(16+0)] @ one + + vadd.i32 q1,q1,q13 + vadd.i32 q5,q5,q13 + vadd.i32 q9,q9,q13 + vldr d26,[sp,#4*(16+2)] @ two + + vadd.i32 q2,q2,q14 + vadd.i32 q6,q6,q14 + vadd.i32 q10,q10,q14 + vadd.i32 d14,d14,d24 @ counter+1 + vadd.i32 d22,d22,d26 @ counter+2 + + vadd.i32 q3,q3,q15 + vadd.i32 q7,q7,q15 + vadd.i32 q11,q11,q15 + + cmp r11,#64*4 + blo .Ltail_neon + + vld1.8 {q12,q13},[r12]! @ load input + mov r11,sp + vld1.8 {q14,q15},[r12]! + veor q0,q0,q12 @ xor with input + veor q1,q1,q13 + vld1.8 {q12,q13},[r12]! + veor q2,q2,q14 + veor q3,q3,q15 + vld1.8 {q14,q15},[r12]! + + veor q4,q4,q12 + vst1.8 {q0,q1},[r14]! @ store output + veor q5,q5,q13 + vld1.8 {q12,q13},[r12]! + veor q6,q6,q14 + vst1.8 {q2,q3},[r14]! + veor q7,q7,q15 + vld1.8 {q14,q15},[r12]! + + veor q8,q8,q12 + vld1.32 {q0,q1},[r11]! @ load for next iteration + veor d25,d25,d25 + vldr d24,[sp,#4*(16+4)] @ four + veor q9,q9,q13 + vld1.32 {q2,q3},[r11] + veor q10,q10,q14 + vst1.8 {q4,q5},[r14]! + veor q11,q11,q15 + vst1.8 {q6,q7},[r14]! 
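The interleaved sequence that just ended at `bne .Loop_neon` is four copies of the ChaCha quarter-round running side by side: the `ror`-based scalar lines keep a fourth block in general-purpose registers while the `vadd`/`veor`/`vrev32.16`/`vshr`+`vsli` groups advance three blocks in q-registers (rotate-by-16 is a free `vrev32.16`; the 12/8/7 rotates are synthesized from a shift pair), for 4 blocks = 256 bytes per outer iteration. The primitive being unrolled, in C (a sketch per RFC 7539, not code from this patch):

    #include <stdint.h>

    static uint32_t rotl32(uint32_t x, int n) {
      return (x << n) | (x >> (32 - n));
    }

    /* One ChaCha quarter-round; the asm interleaves four of these. */
    static void quarter_round(uint32_t st[16], int a, int b, int c, int d) {
      st[a] += st[b]; st[d] ^= st[a]; st[d] = rotl32(st[d], 16);
      st[c] += st[d]; st[b] ^= st[c]; st[b] = rotl32(st[b], 12);
      st[a] += st[b]; st[d] ^= st[a]; st[d] = rotl32(st[d], 8);
      st[c] += st[d]; st[b] ^= st[c]; st[b] = rotl32(st[b], 7);
    }

    /* The `mov r11,#10` loop: ten double rounds = 20 rounds total. */
    static void chacha20_rounds(uint32_t st[16]) {
      for (int i = 0; i < 10; i++) {
        quarter_round(st, 0, 4,  8, 12);   /* column rounds */
        quarter_round(st, 1, 5,  9, 13);
        quarter_round(st, 2, 6, 10, 14);
        quarter_round(st, 3, 7, 11, 15);
        quarter_round(st, 0, 5, 10, 15);   /* diagonal rounds */
        quarter_round(st, 1, 6, 11, 12);
        quarter_round(st, 2, 7,  8, 13);
        quarter_round(st, 3, 4,  9, 14);
      }
    }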
+ + vadd.i32 d6,d6,d24 @ next counter value + vldr d24,[sp,#4*(16+0)] @ one + + ldmia sp,{r8,r9,r10,r11} @ load key material + add r0,r0,r8 @ accumulate key material + ldr r8,[r12],#16 @ load input + vst1.8 {q8,q9},[r14]! + add r1,r1,r9 + ldr r9,[r12,#-12] + vst1.8 {q10,q11},[r14]! + add r2,r2,r10 + ldr r10,[r12,#-8] + add r3,r3,r11 + ldr r11,[r12,#-4] +# ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +# endif + eor r0,r0,r8 @ xor with input + add r8,sp,#4*(4) + eor r1,r1,r9 + str r0,[r14],#16 @ store output + eor r2,r2,r10 + str r1,[r14,#-12] + eor r3,r3,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + str r2,[r14,#-8] + str r3,[r14,#-4] + + add r4,r4,r8 @ accumulate key material + ldr r8,[r12],#16 @ load input + add r5,r5,r9 + ldr r9,[r12,#-12] + add r6,r6,r10 + ldr r10,[r12,#-8] + add r7,r7,r11 + ldr r11,[r12,#-4] +# ifdef __ARMEB__ + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +# endif + eor r4,r4,r8 + add r8,sp,#4*(8) + eor r5,r5,r9 + str r4,[r14],#16 @ store output + eor r6,r6,r10 + str r5,[r14,#-12] + eor r7,r7,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + str r6,[r14,#-8] + add r0,sp,#4*(16+8) + str r7,[r14,#-4] + + ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half + + add r0,r0,r8 @ accumulate key material + ldr r8,[r12],#16 @ load input + add r1,r1,r9 + ldr r9,[r12,#-12] +# ifdef __thumb2__ + it hi +# endif + strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it + add r2,r2,r10 + ldr r10,[r12,#-8] +# ifdef __thumb2__ + it hi +# endif + strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it + add r3,r3,r11 + ldr r11,[r12,#-4] +# ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +# endif + eor r0,r0,r8 + add r8,sp,#4*(12) + eor r1,r1,r9 + str r0,[r14],#16 @ store output + eor r2,r2,r10 + str r1,[r14,#-12] + eor r3,r3,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + str r2,[r14,#-8] + str r3,[r14,#-4] + + add r4,r4,r8 @ accumulate key material + add r8,r8,#4 @ next counter value + add r5,r5,r9 + str r8,[sp,#4*(12)] @ save next counter value + ldr r8,[r12],#16 @ load input + add r6,r6,r10 + add r4,r4,#3 @ counter+3 + ldr r9,[r12,#-12] + add r7,r7,r11 + ldr r10,[r12,#-8] + ldr r11,[r12,#-4] +# ifdef __ARMEB__ + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +# endif + eor r4,r4,r8 +# ifdef __thumb2__ + it hi +# endif + ldrhi r8,[sp,#4*(32+2)] @ re-load len + eor r5,r5,r9 + eor r6,r6,r10 + str r4,[r14],#16 @ store output + eor r7,r7,r11 + str r5,[r14,#-12] + sub r11,r8,#64*4 @ len-=64*4 + str r6,[r14,#-8] + str r7,[r14,#-4] + bhi .Loop_neon_outer + + b .Ldone_neon + +.align 4 +.Lbreak_neon: + @ harmonize NEON and integer-only stack frames: load data + @ from NEON frame, but save to integer-only one; distance + @ between the two is 4*(32+4+16-32)=4*(20). + + str r11, [sp,#4*(20+32+2)] @ save len + add r11,sp,#4*(32+4) + str r12, [sp,#4*(20+32+1)] @ save inp + str r14, [sp,#4*(20+32+0)] @ save out + + ldr r12,[sp,#4*(16+10)] + ldr r14,[sp,#4*(16+11)] + vldmia r11,{d8,d9,d10,d11,d12,d13,d14,d15} @ fulfill ABI requirement + str r12,[sp,#4*(20+16+10)] @ copy "rx" + str r14,[sp,#4*(20+16+11)] @ copy "rx" + + ldr r11, [sp,#4*(15)] + ldr r12,[sp,#4*(12)] @ modulo-scheduled load + ldr r10, [sp,#4*(13)] + ldr r14,[sp,#4*(14)] + str r11, [sp,#4*(20+16+15)] + add r11,sp,#4*(20) + vst1.32 {q0,q1},[r11]! 
@ copy key + add sp,sp,#4*(20) @ switch frame + vst1.32 {q2,q3},[r11] + mov r11,#10 + b .Loop @ go integer-only + +.align 4 +.Ltail_neon: + cmp r11,#64*3 + bhs .L192_or_more_neon + cmp r11,#64*2 + bhs .L128_or_more_neon + cmp r11,#64*1 + bhs .L64_or_more_neon + + add r8,sp,#4*(8) + vst1.8 {q0,q1},[sp] + add r10,sp,#4*(0) + vst1.8 {q2,q3},[r8] + b .Loop_tail_neon + +.align 4 +.L64_or_more_neon: + vld1.8 {q12,q13},[r12]! + vld1.8 {q14,q15},[r12]! + veor q0,q0,q12 + veor q1,q1,q13 + veor q2,q2,q14 + veor q3,q3,q15 + vst1.8 {q0,q1},[r14]! + vst1.8 {q2,q3},[r14]! + + beq .Ldone_neon + + add r8,sp,#4*(8) + vst1.8 {q4,q5},[sp] + add r10,sp,#4*(0) + vst1.8 {q6,q7},[r8] + sub r11,r11,#64*1 @ len-=64*1 + b .Loop_tail_neon + +.align 4 +.L128_or_more_neon: + vld1.8 {q12,q13},[r12]! + vld1.8 {q14,q15},[r12]! + veor q0,q0,q12 + veor q1,q1,q13 + vld1.8 {q12,q13},[r12]! + veor q2,q2,q14 + veor q3,q3,q15 + vld1.8 {q14,q15},[r12]! + + veor q4,q4,q12 + veor q5,q5,q13 + vst1.8 {q0,q1},[r14]! + veor q6,q6,q14 + vst1.8 {q2,q3},[r14]! + veor q7,q7,q15 + vst1.8 {q4,q5},[r14]! + vst1.8 {q6,q7},[r14]! + + beq .Ldone_neon + + add r8,sp,#4*(8) + vst1.8 {q8,q9},[sp] + add r10,sp,#4*(0) + vst1.8 {q10,q11},[r8] + sub r11,r11,#64*2 @ len-=64*2 + b .Loop_tail_neon + +.align 4 +.L192_or_more_neon: + vld1.8 {q12,q13},[r12]! + vld1.8 {q14,q15},[r12]! + veor q0,q0,q12 + veor q1,q1,q13 + vld1.8 {q12,q13},[r12]! + veor q2,q2,q14 + veor q3,q3,q15 + vld1.8 {q14,q15},[r12]! + + veor q4,q4,q12 + veor q5,q5,q13 + vld1.8 {q12,q13},[r12]! + veor q6,q6,q14 + vst1.8 {q0,q1},[r14]! + veor q7,q7,q15 + vld1.8 {q14,q15},[r12]! + + veor q8,q8,q12 + vst1.8 {q2,q3},[r14]! + veor q9,q9,q13 + vst1.8 {q4,q5},[r14]! + veor q10,q10,q14 + vst1.8 {q6,q7},[r14]! + veor q11,q11,q15 + vst1.8 {q8,q9},[r14]! + vst1.8 {q10,q11},[r14]! 
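Each of the `.L64`/`.L128`/`.L192_or_more_neon` branches above ends the same way once fewer than 64 bytes remain: the unconsumed keystream block is parked in the stack buffer and the `.Loop_tail_neon` byte loop below XORs out the remainder, so no load or store ever runs past the caller's buffers regardless of length or alignment. A hedged C equivalent (the helper name and 64-byte-block signature are mine, not the patch's):

    #include <stddef.h>
    #include <stdint.h>

    /* Consume the final len % 64 bytes from a keystream block parked
     * on the stack -- the ldrb/eor/strb loop in .Loop_tail_neon. */
    static void ctr32_tail(uint8_t *out, const uint8_t *in, size_t len,
                           const uint8_t keystream[64]) {
      for (size_t i = 0; i < len; i++)
        out[i] = in[i] ^ keystream[i];
    }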
+ + beq .Ldone_neon + + ldmia sp,{r8,r9,r10,r11} @ load key material + add r0,r0,r8 @ accumulate key material + add r8,sp,#4*(4) + add r1,r1,r9 + add r2,r2,r10 + add r3,r3,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + + add r4,r4,r8 @ accumulate key material + add r8,sp,#4*(8) + add r5,r5,r9 + add r6,r6,r10 + add r7,r7,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material +# ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +# endif + stmia sp,{r0,r1,r2,r3,r4,r5,r6,r7} + add r0,sp,#4*(16+8) + + ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half + + add r0,r0,r8 @ accumulate key material + add r8,sp,#4*(12) + add r1,r1,r9 + add r2,r2,r10 + add r3,r3,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + + add r4,r4,r8 @ accumulate key material + add r8,sp,#4*(8) + add r5,r5,r9 + add r4,r4,#3 @ counter+3 + add r6,r6,r10 + add r7,r7,r11 + ldr r11,[sp,#4*(32+2)] @ re-load len +# ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +# endif + stmia r8,{r0,r1,r2,r3,r4,r5,r6,r7} + add r10,sp,#4*(0) + sub r11,r11,#64*3 @ len-=64*3 + +.Loop_tail_neon: + ldrb r8,[r10],#1 @ read buffer on stack + ldrb r9,[r12],#1 @ read input + subs r11,r11,#1 + eor r8,r8,r9 + strb r8,[r14],#1 @ store ouput + bne .Loop_tail_neon + +.Ldone_neon: + add sp,sp,#4*(32+4) + vldmia sp,{d8,d9,d10,d11,d12,d13,d14,d15} + add sp,sp,#4*(16+3) + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} +.size ChaCha20_neon,.-ChaCha20_neon +.comm OPENSSL_armcap_P,4,4 +#endif +#endif diff --git a/packager/third_party/boringssl/linux-arm/crypto/modes/ghash-armv4.S b/packager/third_party/boringssl/linux-arm/crypto/modes/ghash-armv4.S index c6f025d759..791b28906c 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/modes/ghash-armv4.S +++ b/packager/third_party/boringssl/linux-arm/crypto/modes/ghash-armv4.S @@ -1,13 +1,12 @@ #if defined(__arm__) -#if defined(__arm__) -#include "arm_arch.h" +#include .syntax unified .text .code 32 -#ifdef __APPLE__ +#ifdef __clang__ #define ldrplb ldrbpl #define ldrneb ldrbne #endif @@ -536,6 +535,4 @@ gcm_ghash_neon: .byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 - #endif -#endif \ No newline at end of file diff --git a/packager/third_party/boringssl/linux-arm/crypto/modes/ghashv8-armx32.S b/packager/third_party/boringssl/linux-arm/crypto/modes/ghashv8-armx32.S index bdbbae90d4..0e1e631486 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/modes/ghashv8-armx32.S +++ b/packager/third_party/boringssl/linux-arm/crypto/modes/ghashv8-armx32.S @@ -1,10 +1,11 @@ #if defined(__arm__) -#include "arm_arch.h" +#include .text .fpu neon .code 32 .globl gcm_init_v8 +.hidden gcm_init_v8 .type gcm_init_v8,%function .align 4 gcm_init_v8: @@ -55,6 +56,7 @@ gcm_init_v8: bx lr .size gcm_init_v8,.-gcm_init_v8 .globl gcm_gmult_v8 +.hidden gcm_gmult_v8 .type gcm_gmult_v8,%function .align 4 gcm_gmult_v8: @@ -67,10 +69,10 @@ gcm_gmult_v8: #endif vext.8 q3,q9,q9,#8 -.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo +.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo veor q9,q9,q3 @ Karatsuba pre-processing -.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi -.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) +.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi +.byte 0xa2,0x2e,0xaa,0xf2 @ pmull 
q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) vext.8 q9,q0,q2,#8 @ Karatsuba post-processing veor q10,q0,q2 @@ -96,6 +98,7 @@ gcm_gmult_v8: bx lr .size gcm_gmult_v8,.-gcm_gmult_v8 .globl gcm_ghash_v8 +.hidden gcm_ghash_v8 .type gcm_ghash_v8,%function .align 4 gcm_ghash_v8: @@ -135,7 +138,7 @@ gcm_ghash_v8: #endif vext.8 q7,q9,q9,#8 veor q3,q3,q0 @ I[i]^=Xi -.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1 +.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1 veor q9,q9,q7 @ Karatsuba pre-processing .byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7 b .Loop_mod2x_v8 @@ -144,14 +147,14 @@ gcm_ghash_v8: .Loop_mod2x_v8: vext.8 q10,q3,q3,#8 subs r3,r3,#32 @ is there more data? -.byte 0x86,0x0e,0xac,0xf2 @ pmull q0,q14,q3 @ H^2.lo·Xi.lo +.byte 0x86,0x0e,0xac,0xf2 @ pmull q0,q14,q3 @ H^2.lo·Xi.lo movlo r12,#0 @ is it time to zero r12? .byte 0xa2,0xae,0xaa,0xf2 @ pmull q5,q13,q9 veor q10,q10,q3 @ Karatsuba pre-processing -.byte 0x87,0x4e,0xad,0xf2 @ pmull2 q2,q14,q3 @ H^2.hi·Xi.hi +.byte 0x87,0x4e,0xad,0xf2 @ pmull2 q2,q14,q3 @ H^2.hi·Xi.hi veor q0,q0,q4 @ accumulate -.byte 0xa5,0x2e,0xab,0xf2 @ pmull2 q1,q13,q10 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) +.byte 0xa5,0x2e,0xab,0xf2 @ pmull2 q1,q13,q10 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) vld1.64 {q8},[r2],r12 @ load [rotated] I[i+2] veor q2,q2,q6 @@ -176,7 +179,7 @@ gcm_ghash_v8: vext.8 q7,q9,q9,#8 vext.8 q3,q8,q8,#8 veor q0,q1,q10 -.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1 +.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1 veor q3,q3,q2 @ accumulate q3 early vext.8 q10,q0,q0,#8 @ 2nd phase of reduction @@ -197,10 +200,10 @@ gcm_ghash_v8: veor q3,q3,q0 @ inp^=Xi veor q9,q8,q10 @ q9 is rotated inp^Xi -.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo +.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo veor q9,q9,q3 @ Karatsuba pre-processing -.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi -.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) +.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi +.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) vext.8 q9,q0,q2,#8 @ Karatsuba post-processing veor q10,q0,q2 @@ -230,4 +233,4 @@ gcm_ghash_v8: .byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 -#endif \ No newline at end of file +#endif diff --git a/packager/third_party/boringssl/linux-arm/crypto/sha/sha1-armv4-large.S b/packager/third_party/boringssl/linux-arm/crypto/sha/sha1-armv4-large.S index 4911458b26..36955faf19 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/sha/sha1-armv4-large.S +++ b/packager/third_party/boringssl/linux-arm/crypto/sha/sha1-armv4-large.S @@ -1,10 +1,11 @@ #if defined(__arm__) -#include "arm_arch.h" +#include .text .code 32 .globl sha1_block_data_order +.hidden sha1_block_data_order .type sha1_block_data_order,%function .align 5 @@ -1459,4 +1460,4 @@ sha1_block_data_order_armv8: .comm OPENSSL_armcap_P,4,4 .hidden OPENSSL_armcap_P #endif -#endif \ No newline at end of file +#endif diff --git a/packager/third_party/boringssl/linux-arm/crypto/sha/sha256-armv4.S b/packager/third_party/boringssl/linux-arm/crypto/sha/sha256-armv4.S index ac9f2f164d..6040041322 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/sha/sha256-armv4.S +++ b/packager/third_party/boringssl/linux-arm/crypto/sha/sha256-armv4.S @@ -38,7 +38,7 @@ @ Add ARMv8 code path performing at 2.0 cpb on Apple A7. 
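A recurring change in this patch (visible in armv4-mont.S above and sha512-armv4.S below) is swapping the magic `tst rN,#1` for `tst rN,#ARMV7_NEON` when probing OPENSSL_armcap_P; ARMV7_NEON is bit 0 in arm_arch.h, so the instruction encoding is unchanged and only the readability improves. The dispatch pattern all of these generated files share looks roughly like this in C (the function names are placeholders for the scalar and NEON asm bodies, not exports of this patch):

    #include <stddef.h>
    #include <stdint.h>

    extern uint32_t OPENSSL_armcap_P;   /* feature bits, probed once */
    #define ARMV7_NEON (1u << 0)        /* value per arm_arch.h */

    /* Placeholder declarations standing in for the asm code paths. */
    void sha256_scalar(uint32_t *state, const uint8_t *in, size_t blocks);
    void sha256_neon(uint32_t *state, const uint8_t *in, size_t blocks);

    void sha256_block(uint32_t *state, const uint8_t *in, size_t blocks) {
      if (OPENSSL_armcap_P & ARMV7_NEON)
        sha256_neon(state, in, blocks);   /* the `bne .LNEON` branch */
      else
        sha256_scalar(state, in, blocks);
    }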
#ifndef __KERNEL__ -# include "arm_arch.h" +# include #else # define __ARM_ARCH__ __LINUX_ARM_ARCH__ # define __ARM_MAX_ARCH__ 7 @@ -85,6 +85,7 @@ K256: .align 5 .globl sha256_block_data_order +.hidden sha256_block_data_order .type sha256_block_data_order,%function sha256_block_data_order: .Lsha256_block_data_order: @@ -1875,6 +1876,7 @@ sha256_block_data_order: .fpu neon .globl sha256_block_data_order_neon +.hidden sha256_block_data_order_neon .type sha256_block_data_order_neon,%function .align 4 sha256_block_data_order_neon: @@ -2815,4 +2817,4 @@ sha256_block_data_order_armv8: .comm OPENSSL_armcap_P,4,4 .hidden OPENSSL_armcap_P #endif -#endif \ No newline at end of file +#endif diff --git a/packager/third_party/boringssl/linux-arm/crypto/sha/sha512-armv4.S b/packager/third_party/boringssl/linux-arm/crypto/sha/sha512-armv4.S index c794f87fcf..93a7bf8566 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/sha/sha512-armv4.S +++ b/packager/third_party/boringssl/linux-arm/crypto/sha/sha512-armv4.S @@ -47,7 +47,7 @@ @ was reflected in below two parameters as 0 and 4. Now caller is @ expected to maintain native byte order for whole 64-bit values. #ifndef __KERNEL__ -# include "arm_arch.h" +# include # define VFP_ABI_PUSH vstmdb sp!,{d8-d15} # define VFP_ABI_POP vldmia sp!,{d8-d15} #else @@ -133,6 +133,7 @@ K512: #endif .globl sha512_block_data_order +.hidden sha512_block_data_order .type sha512_block_data_order,%function sha512_block_data_order: .Lsha512_block_data_order: @@ -147,7 +148,7 @@ sha512_block_data_order: #ifdef __APPLE__ ldr r12,[r12] #endif - tst r12,#1 + tst r12,#ARMV7_NEON bne .LNEON #endif add r2,r1,r2,lsl#7 @ len to point at the end of inp @@ -533,6 +534,7 @@ sha512_block_data_order: .fpu neon .globl sha512_block_data_order_neon +.hidden sha512_block_data_order_neon .type sha512_block_data_order_neon,%function .align 4 sha512_block_data_order_neon: @@ -1866,4 +1868,4 @@ sha512_block_data_order_neon: .comm OPENSSL_armcap_P,4,4 .hidden OPENSSL_armcap_P #endif -#endif \ No newline at end of file +#endif diff --git a/packager/third_party/boringssl/linux-x86/crypto/chacha/chacha-x86.S b/packager/third_party/boringssl/linux-x86/crypto/chacha/chacha-x86.S new file mode 100644 index 0000000000..d3c39ace9b --- /dev/null +++ b/packager/third_party/boringssl/linux-x86/crypto/chacha/chacha-x86.S @@ -0,0 +1,969 @@ +#if defined(__i386__) +.file "chacha-x86.S" +.text +.globl ChaCha20_ctr32 +.hidden ChaCha20_ctr32 +.type ChaCha20_ctr32,@function +.align 16 +ChaCha20_ctr32: +.L_ChaCha20_ctr32_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + xorl %eax,%eax + cmpl 28(%esp),%eax + je .L000no_data + call .Lpic_point +.Lpic_point: + popl %eax + leal OPENSSL_ia32cap_P-.Lpic_point(%eax),%ebp + testl $16777216,(%ebp) + jz .L001x86 + testl $512,4(%ebp) + jz .L001x86 + jmp .Lssse3_shortcut +.L001x86: + movl 32(%esp),%esi + movl 36(%esp),%edi + subl $132,%esp + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl %eax,80(%esp) + movl %ebx,84(%esp) + movl %ecx,88(%esp) + movl %edx,92(%esp) + movl 16(%esi),%eax + movl 20(%esi),%ebx + movl 24(%esi),%ecx + movl 28(%esi),%edx + movl %eax,96(%esp) + movl %ebx,100(%esp) + movl %ecx,104(%esp) + movl %edx,108(%esp) + movl (%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx + subl $1,%eax + movl %eax,112(%esp) + movl %ebx,116(%esp) + movl %ecx,120(%esp) + movl %edx,124(%esp) + jmp .L002entry +.align 16 +.L003outer_loop: + movl %ebx,156(%esp) + movl %eax,152(%esp) + movl %ecx,160(%esp) 
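The bare decimal immediates loaded at `.L002entry` just below (1634760805, 857760878, 2036477234, 1797285236) are the ChaCha sigma constant "expand 32-byte k" read as four little-endian 32-bit words, the same values chacha-armv4.S spells in hex at `.Lsigma`. A compilable check (assumes a little-endian host):

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    int main(void) {
      const char sigma[17] = "expand 32-byte k";  /* 16 chars + NUL */
      uint32_t w[4];
      memcpy(w, sigma, 16);          /* reinterpret as LE words */
      assert(w[0] == 1634760805u);   /* 0x61707865 "expa" */
      assert(w[1] == 857760878u);    /* 0x3320646e "nd 3" */
      assert(w[2] == 2036477234u);   /* 0x79622d32 "2-by" */
      assert(w[3] == 1797285236u);   /* 0x6b206574 "te k" */
      return 0;
    }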
+.L002entry: + movl $1634760805,%eax + movl $857760878,4(%esp) + movl $2036477234,8(%esp) + movl $1797285236,12(%esp) + movl 84(%esp),%ebx + movl 88(%esp),%ebp + movl 104(%esp),%ecx + movl 108(%esp),%esi + movl 116(%esp),%edx + movl 120(%esp),%edi + movl %ebx,20(%esp) + movl %ebp,24(%esp) + movl %ecx,40(%esp) + movl %esi,44(%esp) + movl %edx,52(%esp) + movl %edi,56(%esp) + movl 92(%esp),%ebx + movl 124(%esp),%edi + movl 112(%esp),%edx + movl 80(%esp),%ebp + movl 96(%esp),%ecx + movl 100(%esp),%esi + addl $1,%edx + movl %ebx,28(%esp) + movl %edi,60(%esp) + movl %edx,112(%esp) + movl $10,%ebx + jmp .L004loop +.align 16 +.L004loop: + addl %ebp,%eax + movl %ebx,128(%esp) + movl %ebp,%ebx + xorl %eax,%edx + roll $16,%edx + addl %edx,%ecx + xorl %ecx,%ebx + movl 52(%esp),%edi + roll $12,%ebx + movl 20(%esp),%ebp + addl %ebx,%eax + xorl %eax,%edx + movl %eax,(%esp) + roll $8,%edx + movl 4(%esp),%eax + addl %edx,%ecx + movl %edx,48(%esp) + xorl %ecx,%ebx + addl %ebp,%eax + roll $7,%ebx + xorl %eax,%edi + movl %ecx,32(%esp) + roll $16,%edi + movl %ebx,16(%esp) + addl %edi,%esi + movl 40(%esp),%ecx + xorl %esi,%ebp + movl 56(%esp),%edx + roll $12,%ebp + movl 24(%esp),%ebx + addl %ebp,%eax + xorl %eax,%edi + movl %eax,4(%esp) + roll $8,%edi + movl 8(%esp),%eax + addl %edi,%esi + movl %edi,52(%esp) + xorl %esi,%ebp + addl %ebx,%eax + roll $7,%ebp + xorl %eax,%edx + movl %esi,36(%esp) + roll $16,%edx + movl %ebp,20(%esp) + addl %edx,%ecx + movl 44(%esp),%esi + xorl %ecx,%ebx + movl 60(%esp),%edi + roll $12,%ebx + movl 28(%esp),%ebp + addl %ebx,%eax + xorl %eax,%edx + movl %eax,8(%esp) + roll $8,%edx + movl 12(%esp),%eax + addl %edx,%ecx + movl %edx,56(%esp) + xorl %ecx,%ebx + addl %ebp,%eax + roll $7,%ebx + xorl %eax,%edi + roll $16,%edi + movl %ebx,24(%esp) + addl %edi,%esi + xorl %esi,%ebp + roll $12,%ebp + movl 20(%esp),%ebx + addl %ebp,%eax + xorl %eax,%edi + movl %eax,12(%esp) + roll $8,%edi + movl (%esp),%eax + addl %edi,%esi + movl %edi,%edx + xorl %esi,%ebp + addl %ebx,%eax + roll $7,%ebp + xorl %eax,%edx + roll $16,%edx + movl %ebp,28(%esp) + addl %edx,%ecx + xorl %ecx,%ebx + movl 48(%esp),%edi + roll $12,%ebx + movl 24(%esp),%ebp + addl %ebx,%eax + xorl %eax,%edx + movl %eax,(%esp) + roll $8,%edx + movl 4(%esp),%eax + addl %edx,%ecx + movl %edx,60(%esp) + xorl %ecx,%ebx + addl %ebp,%eax + roll $7,%ebx + xorl %eax,%edi + movl %ecx,40(%esp) + roll $16,%edi + movl %ebx,20(%esp) + addl %edi,%esi + movl 32(%esp),%ecx + xorl %esi,%ebp + movl 52(%esp),%edx + roll $12,%ebp + movl 28(%esp),%ebx + addl %ebp,%eax + xorl %eax,%edi + movl %eax,4(%esp) + roll $8,%edi + movl 8(%esp),%eax + addl %edi,%esi + movl %edi,48(%esp) + xorl %esi,%ebp + addl %ebx,%eax + roll $7,%ebp + xorl %eax,%edx + movl %esi,44(%esp) + roll $16,%edx + movl %ebp,24(%esp) + addl %edx,%ecx + movl 36(%esp),%esi + xorl %ecx,%ebx + movl 56(%esp),%edi + roll $12,%ebx + movl 16(%esp),%ebp + addl %ebx,%eax + xorl %eax,%edx + movl %eax,8(%esp) + roll $8,%edx + movl 12(%esp),%eax + addl %edx,%ecx + movl %edx,52(%esp) + xorl %ecx,%ebx + addl %ebp,%eax + roll $7,%ebx + xorl %eax,%edi + roll $16,%edi + movl %ebx,28(%esp) + addl %edi,%esi + xorl %esi,%ebp + movl 48(%esp),%edx + roll $12,%ebp + movl 128(%esp),%ebx + addl %ebp,%eax + xorl %eax,%edi + movl %eax,12(%esp) + roll $8,%edi + movl (%esp),%eax + addl %edi,%esi + movl %edi,56(%esp) + xorl %esi,%ebp + roll $7,%ebp + decl %ebx + jnz .L004loop + movl 160(%esp),%ebx + addl $1634760805,%eax + addl 80(%esp),%ebp + addl 96(%esp),%ecx + addl 100(%esp),%esi + cmpl $64,%ebx + jb .L005tail + movl 
156(%esp),%ebx + addl 112(%esp),%edx + addl 120(%esp),%edi + xorl (%ebx),%eax + xorl 16(%ebx),%ebp + movl %eax,(%esp) + movl 152(%esp),%eax + xorl 32(%ebx),%ecx + xorl 36(%ebx),%esi + xorl 48(%ebx),%edx + xorl 56(%ebx),%edi + movl %ebp,16(%eax) + movl %ecx,32(%eax) + movl %esi,36(%eax) + movl %edx,48(%eax) + movl %edi,56(%eax) + movl 4(%esp),%ebp + movl 8(%esp),%ecx + movl 12(%esp),%esi + movl 20(%esp),%edx + movl 24(%esp),%edi + addl $857760878,%ebp + addl $2036477234,%ecx + addl $1797285236,%esi + addl 84(%esp),%edx + addl 88(%esp),%edi + xorl 4(%ebx),%ebp + xorl 8(%ebx),%ecx + xorl 12(%ebx),%esi + xorl 20(%ebx),%edx + xorl 24(%ebx),%edi + movl %ebp,4(%eax) + movl %ecx,8(%eax) + movl %esi,12(%eax) + movl %edx,20(%eax) + movl %edi,24(%eax) + movl 28(%esp),%ebp + movl 40(%esp),%ecx + movl 44(%esp),%esi + movl 52(%esp),%edx + movl 60(%esp),%edi + addl 92(%esp),%ebp + addl 104(%esp),%ecx + addl 108(%esp),%esi + addl 116(%esp),%edx + addl 124(%esp),%edi + xorl 28(%ebx),%ebp + xorl 40(%ebx),%ecx + xorl 44(%ebx),%esi + xorl 52(%ebx),%edx + xorl 60(%ebx),%edi + leal 64(%ebx),%ebx + movl %ebp,28(%eax) + movl (%esp),%ebp + movl %ecx,40(%eax) + movl 160(%esp),%ecx + movl %esi,44(%eax) + movl %edx,52(%eax) + movl %edi,60(%eax) + movl %ebp,(%eax) + leal 64(%eax),%eax + subl $64,%ecx + jnz .L003outer_loop + jmp .L006done +.L005tail: + addl 112(%esp),%edx + addl 120(%esp),%edi + movl %eax,(%esp) + movl %ebp,16(%esp) + movl %ecx,32(%esp) + movl %esi,36(%esp) + movl %edx,48(%esp) + movl %edi,56(%esp) + movl 4(%esp),%ebp + movl 8(%esp),%ecx + movl 12(%esp),%esi + movl 20(%esp),%edx + movl 24(%esp),%edi + addl $857760878,%ebp + addl $2036477234,%ecx + addl $1797285236,%esi + addl 84(%esp),%edx + addl 88(%esp),%edi + movl %ebp,4(%esp) + movl %ecx,8(%esp) + movl %esi,12(%esp) + movl %edx,20(%esp) + movl %edi,24(%esp) + movl 28(%esp),%ebp + movl 40(%esp),%ecx + movl 44(%esp),%esi + movl 52(%esp),%edx + movl 60(%esp),%edi + addl 92(%esp),%ebp + addl 104(%esp),%ecx + addl 108(%esp),%esi + addl 116(%esp),%edx + addl 124(%esp),%edi + movl %ebp,28(%esp) + movl 156(%esp),%ebp + movl %ecx,40(%esp) + movl 152(%esp),%ecx + movl %esi,44(%esp) + xorl %esi,%esi + movl %edx,52(%esp) + movl %edi,60(%esp) + xorl %eax,%eax + xorl %edx,%edx +.L007tail_loop: + movb (%esi,%ebp,1),%al + movb (%esp,%esi,1),%dl + leal 1(%esi),%esi + xorb %dl,%al + movb %al,-1(%ecx,%esi,1) + decl %ebx + jnz .L007tail_loop +.L006done: + addl $132,%esp +.L000no_data: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size ChaCha20_ctr32,.-.L_ChaCha20_ctr32_begin +.globl ChaCha20_ssse3 +.hidden ChaCha20_ssse3 +.type ChaCha20_ssse3,@function +.align 16 +ChaCha20_ssse3: +.L_ChaCha20_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi +.Lssse3_shortcut: + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%ecx + movl 32(%esp),%edx + movl 36(%esp),%ebx + movl %esp,%ebp + subl $524,%esp + andl $-64,%esp + movl %ebp,512(%esp) + leal .Lssse3_data-.Lpic_point(%eax),%eax + movdqu (%ebx),%xmm3 + cmpl $256,%ecx + jb .L0081x + movl %edx,516(%esp) + movl %ebx,520(%esp) + subl $256,%ecx + leal 384(%esp),%ebp + movdqu (%edx),%xmm7 + pshufd $0,%xmm3,%xmm0 + pshufd $85,%xmm3,%xmm1 + pshufd $170,%xmm3,%xmm2 + pshufd $255,%xmm3,%xmm3 + paddd 48(%eax),%xmm0 + pshufd $0,%xmm7,%xmm4 + pshufd $85,%xmm7,%xmm5 + psubd 64(%eax),%xmm0 + pshufd $170,%xmm7,%xmm6 + pshufd $255,%xmm7,%xmm7 + movdqa %xmm0,64(%ebp) + movdqa %xmm1,80(%ebp) + movdqa %xmm2,96(%ebp) + movdqa %xmm3,112(%ebp) + movdqu 16(%edx),%xmm3 + movdqa %xmm4,-64(%ebp) + movdqa %xmm5,-48(%ebp) + 
movdqa %xmm6,-32(%ebp) + movdqa %xmm7,-16(%ebp) + movdqa 32(%eax),%xmm7 + leal 128(%esp),%ebx + pshufd $0,%xmm3,%xmm0 + pshufd $85,%xmm3,%xmm1 + pshufd $170,%xmm3,%xmm2 + pshufd $255,%xmm3,%xmm3 + pshufd $0,%xmm7,%xmm4 + pshufd $85,%xmm7,%xmm5 + pshufd $170,%xmm7,%xmm6 + pshufd $255,%xmm7,%xmm7 + movdqa %xmm0,(%ebp) + movdqa %xmm1,16(%ebp) + movdqa %xmm2,32(%ebp) + movdqa %xmm3,48(%ebp) + movdqa %xmm4,-128(%ebp) + movdqa %xmm5,-112(%ebp) + movdqa %xmm6,-96(%ebp) + movdqa %xmm7,-80(%ebp) + leal 128(%esi),%esi + leal 128(%edi),%edi + jmp .L009outer_loop +.align 16 +.L009outer_loop: + movdqa -112(%ebp),%xmm1 + movdqa -96(%ebp),%xmm2 + movdqa -80(%ebp),%xmm3 + movdqa -48(%ebp),%xmm5 + movdqa -32(%ebp),%xmm6 + movdqa -16(%ebp),%xmm7 + movdqa %xmm1,-112(%ebx) + movdqa %xmm2,-96(%ebx) + movdqa %xmm3,-80(%ebx) + movdqa %xmm5,-48(%ebx) + movdqa %xmm6,-32(%ebx) + movdqa %xmm7,-16(%ebx) + movdqa 32(%ebp),%xmm2 + movdqa 48(%ebp),%xmm3 + movdqa 64(%ebp),%xmm4 + movdqa 80(%ebp),%xmm5 + movdqa 96(%ebp),%xmm6 + movdqa 112(%ebp),%xmm7 + paddd 64(%eax),%xmm4 + movdqa %xmm2,32(%ebx) + movdqa %xmm3,48(%ebx) + movdqa %xmm4,64(%ebx) + movdqa %xmm5,80(%ebx) + movdqa %xmm6,96(%ebx) + movdqa %xmm7,112(%ebx) + movdqa %xmm4,64(%ebp) + movdqa -128(%ebp),%xmm0 + movdqa %xmm4,%xmm6 + movdqa -64(%ebp),%xmm3 + movdqa (%ebp),%xmm4 + movdqa 16(%ebp),%xmm5 + movl $10,%edx + nop +.align 16 +.L010loop: + paddd %xmm3,%xmm0 + movdqa %xmm3,%xmm2 + pxor %xmm0,%xmm6 + pshufb (%eax),%xmm6 + paddd %xmm6,%xmm4 + pxor %xmm4,%xmm2 + movdqa -48(%ebx),%xmm3 + movdqa %xmm2,%xmm1 + pslld $12,%xmm2 + psrld $20,%xmm1 + por %xmm1,%xmm2 + movdqa -112(%ebx),%xmm1 + paddd %xmm2,%xmm0 + movdqa 80(%ebx),%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm0,-128(%ebx) + pshufb 16(%eax),%xmm6 + paddd %xmm6,%xmm4 + movdqa %xmm6,64(%ebx) + pxor %xmm4,%xmm2 + paddd %xmm3,%xmm1 + movdqa %xmm2,%xmm0 + pslld $7,%xmm2 + psrld $25,%xmm0 + pxor %xmm1,%xmm7 + por %xmm0,%xmm2 + movdqa %xmm4,(%ebx) + pshufb (%eax),%xmm7 + movdqa %xmm2,-64(%ebx) + paddd %xmm7,%xmm5 + movdqa 32(%ebx),%xmm4 + pxor %xmm5,%xmm3 + movdqa -32(%ebx),%xmm2 + movdqa %xmm3,%xmm0 + pslld $12,%xmm3 + psrld $20,%xmm0 + por %xmm0,%xmm3 + movdqa -96(%ebx),%xmm0 + paddd %xmm3,%xmm1 + movdqa 96(%ebx),%xmm6 + pxor %xmm1,%xmm7 + movdqa %xmm1,-112(%ebx) + pshufb 16(%eax),%xmm7 + paddd %xmm7,%xmm5 + movdqa %xmm7,80(%ebx) + pxor %xmm5,%xmm3 + paddd %xmm2,%xmm0 + movdqa %xmm3,%xmm1 + pslld $7,%xmm3 + psrld $25,%xmm1 + pxor %xmm0,%xmm6 + por %xmm1,%xmm3 + movdqa %xmm5,16(%ebx) + pshufb (%eax),%xmm6 + movdqa %xmm3,-48(%ebx) + paddd %xmm6,%xmm4 + movdqa 48(%ebx),%xmm5 + pxor %xmm4,%xmm2 + movdqa -16(%ebx),%xmm3 + movdqa %xmm2,%xmm1 + pslld $12,%xmm2 + psrld $20,%xmm1 + por %xmm1,%xmm2 + movdqa -80(%ebx),%xmm1 + paddd %xmm2,%xmm0 + movdqa 112(%ebx),%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm0,-96(%ebx) + pshufb 16(%eax),%xmm6 + paddd %xmm6,%xmm4 + movdqa %xmm6,96(%ebx) + pxor %xmm4,%xmm2 + paddd %xmm3,%xmm1 + movdqa %xmm2,%xmm0 + pslld $7,%xmm2 + psrld $25,%xmm0 + pxor %xmm1,%xmm7 + por %xmm0,%xmm2 + pshufb (%eax),%xmm7 + movdqa %xmm2,-32(%ebx) + paddd %xmm7,%xmm5 + pxor %xmm5,%xmm3 + movdqa -48(%ebx),%xmm2 + movdqa %xmm3,%xmm0 + pslld $12,%xmm3 + psrld $20,%xmm0 + por %xmm0,%xmm3 + movdqa -128(%ebx),%xmm0 + paddd %xmm3,%xmm1 + pxor %xmm1,%xmm7 + movdqa %xmm1,-80(%ebx) + pshufb 16(%eax),%xmm7 + paddd %xmm7,%xmm5 + movdqa %xmm7,%xmm6 + pxor %xmm5,%xmm3 + paddd %xmm2,%xmm0 + movdqa %xmm3,%xmm1 + pslld $7,%xmm3 + psrld $25,%xmm1 + pxor %xmm0,%xmm6 + por %xmm1,%xmm3 + pshufb (%eax),%xmm6 + movdqa %xmm3,-16(%ebx) + paddd 
%xmm6,%xmm4 + pxor %xmm4,%xmm2 + movdqa -32(%ebx),%xmm3 + movdqa %xmm2,%xmm1 + pslld $12,%xmm2 + psrld $20,%xmm1 + por %xmm1,%xmm2 + movdqa -112(%ebx),%xmm1 + paddd %xmm2,%xmm0 + movdqa 64(%ebx),%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm0,-128(%ebx) + pshufb 16(%eax),%xmm6 + paddd %xmm6,%xmm4 + movdqa %xmm6,112(%ebx) + pxor %xmm4,%xmm2 + paddd %xmm3,%xmm1 + movdqa %xmm2,%xmm0 + pslld $7,%xmm2 + psrld $25,%xmm0 + pxor %xmm1,%xmm7 + por %xmm0,%xmm2 + movdqa %xmm4,32(%ebx) + pshufb (%eax),%xmm7 + movdqa %xmm2,-48(%ebx) + paddd %xmm7,%xmm5 + movdqa (%ebx),%xmm4 + pxor %xmm5,%xmm3 + movdqa -16(%ebx),%xmm2 + movdqa %xmm3,%xmm0 + pslld $12,%xmm3 + psrld $20,%xmm0 + por %xmm0,%xmm3 + movdqa -96(%ebx),%xmm0 + paddd %xmm3,%xmm1 + movdqa 80(%ebx),%xmm6 + pxor %xmm1,%xmm7 + movdqa %xmm1,-112(%ebx) + pshufb 16(%eax),%xmm7 + paddd %xmm7,%xmm5 + movdqa %xmm7,64(%ebx) + pxor %xmm5,%xmm3 + paddd %xmm2,%xmm0 + movdqa %xmm3,%xmm1 + pslld $7,%xmm3 + psrld $25,%xmm1 + pxor %xmm0,%xmm6 + por %xmm1,%xmm3 + movdqa %xmm5,48(%ebx) + pshufb (%eax),%xmm6 + movdqa %xmm3,-32(%ebx) + paddd %xmm6,%xmm4 + movdqa 16(%ebx),%xmm5 + pxor %xmm4,%xmm2 + movdqa -64(%ebx),%xmm3 + movdqa %xmm2,%xmm1 + pslld $12,%xmm2 + psrld $20,%xmm1 + por %xmm1,%xmm2 + movdqa -80(%ebx),%xmm1 + paddd %xmm2,%xmm0 + movdqa 96(%ebx),%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm0,-96(%ebx) + pshufb 16(%eax),%xmm6 + paddd %xmm6,%xmm4 + movdqa %xmm6,80(%ebx) + pxor %xmm4,%xmm2 + paddd %xmm3,%xmm1 + movdqa %xmm2,%xmm0 + pslld $7,%xmm2 + psrld $25,%xmm0 + pxor %xmm1,%xmm7 + por %xmm0,%xmm2 + pshufb (%eax),%xmm7 + movdqa %xmm2,-16(%ebx) + paddd %xmm7,%xmm5 + pxor %xmm5,%xmm3 + movdqa %xmm3,%xmm0 + pslld $12,%xmm3 + psrld $20,%xmm0 + por %xmm0,%xmm3 + movdqa -128(%ebx),%xmm0 + paddd %xmm3,%xmm1 + movdqa 64(%ebx),%xmm6 + pxor %xmm1,%xmm7 + movdqa %xmm1,-80(%ebx) + pshufb 16(%eax),%xmm7 + paddd %xmm7,%xmm5 + movdqa %xmm7,96(%ebx) + pxor %xmm5,%xmm3 + movdqa %xmm3,%xmm1 + pslld $7,%xmm3 + psrld $25,%xmm1 + por %xmm1,%xmm3 + decl %edx + jnz .L010loop + movdqa %xmm3,-64(%ebx) + movdqa %xmm4,(%ebx) + movdqa %xmm5,16(%ebx) + movdqa %xmm6,64(%ebx) + movdqa %xmm7,96(%ebx) + movdqa -112(%ebx),%xmm1 + movdqa -96(%ebx),%xmm2 + movdqa -80(%ebx),%xmm3 + paddd -128(%ebp),%xmm0 + paddd -112(%ebp),%xmm1 + paddd -96(%ebp),%xmm2 + paddd -80(%ebp),%xmm3 + movdqa %xmm0,%xmm6 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm6 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm6,%xmm3 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + movdqu -128(%esi),%xmm4 + movdqu -64(%esi),%xmm5 + movdqu (%esi),%xmm2 + movdqu 64(%esi),%xmm7 + leal 16(%esi),%esi + pxor %xmm0,%xmm4 + movdqa -64(%ebx),%xmm0 + pxor %xmm1,%xmm5 + movdqa -48(%ebx),%xmm1 + pxor %xmm2,%xmm6 + movdqa -32(%ebx),%xmm2 + pxor %xmm3,%xmm7 + movdqa -16(%ebx),%xmm3 + movdqu %xmm4,-128(%edi) + movdqu %xmm5,-64(%edi) + movdqu %xmm6,(%edi) + movdqu %xmm7,64(%edi) + leal 16(%edi),%edi + paddd -64(%ebp),%xmm0 + paddd -48(%ebp),%xmm1 + paddd -32(%ebp),%xmm2 + paddd -16(%ebp),%xmm3 + movdqa %xmm0,%xmm6 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm6 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm6,%xmm3 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + movdqu -128(%esi),%xmm4 + movdqu -64(%esi),%xmm5 + movdqu (%esi),%xmm2 + movdqu 64(%esi),%xmm7 + leal 16(%esi),%esi + pxor %xmm0,%xmm4 + movdqa (%ebx),%xmm0 + pxor %xmm1,%xmm5 + movdqa 
16(%ebx),%xmm1 + pxor %xmm2,%xmm6 + movdqa 32(%ebx),%xmm2 + pxor %xmm3,%xmm7 + movdqa 48(%ebx),%xmm3 + movdqu %xmm4,-128(%edi) + movdqu %xmm5,-64(%edi) + movdqu %xmm6,(%edi) + movdqu %xmm7,64(%edi) + leal 16(%edi),%edi + paddd (%ebp),%xmm0 + paddd 16(%ebp),%xmm1 + paddd 32(%ebp),%xmm2 + paddd 48(%ebp),%xmm3 + movdqa %xmm0,%xmm6 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm6 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm6,%xmm3 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + movdqu -128(%esi),%xmm4 + movdqu -64(%esi),%xmm5 + movdqu (%esi),%xmm2 + movdqu 64(%esi),%xmm7 + leal 16(%esi),%esi + pxor %xmm0,%xmm4 + movdqa 64(%ebx),%xmm0 + pxor %xmm1,%xmm5 + movdqa 80(%ebx),%xmm1 + pxor %xmm2,%xmm6 + movdqa 96(%ebx),%xmm2 + pxor %xmm3,%xmm7 + movdqa 112(%ebx),%xmm3 + movdqu %xmm4,-128(%edi) + movdqu %xmm5,-64(%edi) + movdqu %xmm6,(%edi) + movdqu %xmm7,64(%edi) + leal 16(%edi),%edi + paddd 64(%ebp),%xmm0 + paddd 80(%ebp),%xmm1 + paddd 96(%ebp),%xmm2 + paddd 112(%ebp),%xmm3 + movdqa %xmm0,%xmm6 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm6 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm6,%xmm3 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + movdqu -128(%esi),%xmm4 + movdqu -64(%esi),%xmm5 + movdqu (%esi),%xmm2 + movdqu 64(%esi),%xmm7 + leal 208(%esi),%esi + pxor %xmm0,%xmm4 + pxor %xmm1,%xmm5 + pxor %xmm2,%xmm6 + pxor %xmm3,%xmm7 + movdqu %xmm4,-128(%edi) + movdqu %xmm5,-64(%edi) + movdqu %xmm6,(%edi) + movdqu %xmm7,64(%edi) + leal 208(%edi),%edi + subl $256,%ecx + jnc .L009outer_loop + addl $256,%ecx + jz .L011done + movl 520(%esp),%ebx + leal -128(%esi),%esi + movl 516(%esp),%edx + leal -128(%edi),%edi + movd 64(%ebp),%xmm2 + movdqu (%ebx),%xmm3 + paddd 96(%eax),%xmm2 + pand 112(%eax),%xmm3 + por %xmm2,%xmm3 +.L0081x: + movdqa 32(%eax),%xmm0 + movdqu (%edx),%xmm1 + movdqu 16(%edx),%xmm2 + movdqa (%eax),%xmm6 + movdqa 16(%eax),%xmm7 + movl %ebp,48(%esp) + movdqa %xmm0,(%esp) + movdqa %xmm1,16(%esp) + movdqa %xmm2,32(%esp) + movdqa %xmm3,48(%esp) + movl $10,%edx + jmp .L012loop1x +.align 16 +.L013outer1x: + movdqa 80(%eax),%xmm3 + movdqa (%esp),%xmm0 + movdqa 16(%esp),%xmm1 + movdqa 32(%esp),%xmm2 + paddd 48(%esp),%xmm3 + movl $10,%edx + movdqa %xmm3,48(%esp) + jmp .L012loop1x +.align 16 +.L012loop1x: + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,222 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $20,%xmm1 + pslld $12,%xmm4 + por %xmm4,%xmm1 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,223 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $25,%xmm1 + pslld $7,%xmm4 + por %xmm4,%xmm1 + pshufd $78,%xmm2,%xmm2 + pshufd $57,%xmm1,%xmm1 + pshufd $147,%xmm3,%xmm3 + nop + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,222 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $20,%xmm1 + pslld $12,%xmm4 + por %xmm4,%xmm1 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,223 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $25,%xmm1 + pslld $7,%xmm4 + por %xmm4,%xmm1 + pshufd $78,%xmm2,%xmm2 + pshufd $147,%xmm1,%xmm1 + pshufd $57,%xmm3,%xmm3 + decl %edx + jnz .L012loop1x + paddd (%esp),%xmm0 + paddd 16(%esp),%xmm1 + paddd 32(%esp),%xmm2 + paddd 48(%esp),%xmm3 + cmpl $64,%ecx + jb .L014tail + movdqu (%esi),%xmm4 + movdqu 16(%esi),%xmm5 + pxor %xmm4,%xmm0 + movdqu 32(%esi),%xmm4 
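For orientation while auditing the generated ChaCha20_ssse3 code above: the paired pslld/psrld/por sequences (12/20 and 7/25) and the pshufb shuffles against the .Lssse3_data byte masks are the four rotations of the standard ChaCha quarter round (RFC 7539). A minimal C sketch of one quarter round, with illustrative names that do not appear in this patch:

    #include <stdint.h>

    static uint32_t rotl32(uint32_t v, int n) {
        return (v << n) | (v >> (32 - n));
    }

    /* One ChaCha quarter round. The SSSE3 code runs four of these in
       parallel per xmm register, using pshufb for the 16- and 8-bit
       rotates and pslld/psrld/por for the 12- and 7-bit ones. */
    static void chacha_quarter_round(uint32_t *a, uint32_t *b,
                                     uint32_t *c, uint32_t *d) {
        *a += *b; *d ^= *a; *d = rotl32(*d, 16);
        *c += *d; *b ^= *c; *b = rotl32(*b, 12);
        *a += *b; *d ^= *a; *d = rotl32(*d, 8);
        *c += *d; *b ^= *c; *b = rotl32(*b, 7);
    }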
+ pxor %xmm5,%xmm1 + movdqu 48(%esi),%xmm5 + pxor %xmm4,%xmm2 + pxor %xmm5,%xmm3 + leal 64(%esi),%esi + movdqu %xmm0,(%edi) + movdqu %xmm1,16(%edi) + movdqu %xmm2,32(%edi) + movdqu %xmm3,48(%edi) + leal 64(%edi),%edi + subl $64,%ecx + jnz .L013outer1x + jmp .L011done +.L014tail: + movdqa %xmm0,(%esp) + movdqa %xmm1,16(%esp) + movdqa %xmm2,32(%esp) + movdqa %xmm3,48(%esp) + xorl %eax,%eax + xorl %edx,%edx + xorl %ebp,%ebp +.L015tail_loop: + movb (%esp,%ebp,1),%al + movb (%esi,%ebp,1),%dl + leal 1(%ebp),%ebp + xorb %dl,%al + movb %al,-1(%edi,%ebp,1) + decl %ecx + jnz .L015tail_loop +.L011done: + movl 512(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size ChaCha20_ssse3,.-.L_ChaCha20_ssse3_begin +.align 64 +.Lssse3_data: +.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 +.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 +.long 1634760805,857760878,2036477234,1797285236 +.long 0,1,2,3 +.long 4,4,4,4 +.long 1,0,0,0 +.long 4,0,0,0 +.long 0,-1,-1,-1 +.align 64 +.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54 +.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 +.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 +.byte 114,103,62,0 +#endif diff --git a/packager/third_party/boringssl/linux-x86/crypto/rc4/rc4-586.S b/packager/third_party/boringssl/linux-x86/crypto/rc4/rc4-586.S index a5cce47c09..d245589eca 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/rc4/rc4-586.S +++ b/packager/third_party/boringssl/linux-x86/crypto/rc4/rc4-586.S @@ -347,39 +347,4 @@ asm_RC4_set_key: popl %ebp ret .size asm_RC4_set_key,.-.L_asm_RC4_set_key_begin -.globl RC4_options -.hidden RC4_options -.type RC4_options,@function -.align 16 -RC4_options: -.L_RC4_options_begin: - call .L018pic_point -.L018pic_point: - popl %eax - leal .L019opts-.L018pic_point(%eax),%eax - call .L020PIC_me_up -.L020PIC_me_up: - popl %edx - leal OPENSSL_ia32cap_P-.L020PIC_me_up(%edx),%edx - movl (%edx),%edx - btl $20,%edx - jc .L0211xchar - btl $26,%edx - jnc .L022ret - addl $25,%eax - ret -.L0211xchar: - addl $12,%eax -.L022ret: - ret -.align 64 -.L019opts: -.byte 114,99,52,40,52,120,44,105,110,116,41,0 -.byte 114,99,52,40,49,120,44,99,104,97,114,41,0 -.byte 114,99,52,40,56,120,44,109,109,120,41,0 -.byte 82,67,52,32,102,111,114,32,120,56,54,44,32,67,82,89 -.byte 80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114 -.byte 111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 64 -.size RC4_options,.-.L_RC4_options_begin #endif diff --git a/packager/third_party/boringssl/linux-x86/crypto/sha/sha1-586.S b/packager/third_party/boringssl/linux-x86/crypto/sha/sha1-586.S index 808ccac517..58d0bc1277 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/sha/sha1-586.S +++ b/packager/third_party/boringssl/linux-x86/crypto/sha/sha1-586.S @@ -23,8 +23,11 @@ sha1_block_data_order: movl 8(%esi),%ecx testl $16777216,%eax jz .L001x86 - testl $536870912,%ecx - jnz .Lshaext_shortcut + andl $268435456,%edx + andl $1073741824,%eax + orl %edx,%eax + cmpl $1342177280,%eax + je .Lavx_shortcut jmp .Lssse3_shortcut .align 16 .L001x86: @@ -1393,177 +1396,6 @@ sha1_block_data_order: popl %ebp ret .size sha1_block_data_order,.-.L_sha1_block_data_order_begin -.hidden _sha1_block_data_order_shaext -.type _sha1_block_data_order_shaext,@function -.align 16 -_sha1_block_data_order_shaext: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - call .L003pic_point -.L003pic_point: - popl %ebp - leal .LK_XX_XX-.L003pic_point(%ebp),%ebp -.Lshaext_shortcut: - movl 20(%esp),%edi - movl %esp,%ebx - movl 24(%esp),%esi - 
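The updated sha1_block_data_order entry above drops the SHA-NI (shaext) shortcut and instead gates a new AVX path on two OPENSSL_ia32cap_P bits: one capability word is masked with 0x10000000 (268435456), the other with 0x40000000 (1073741824), and the code jumps to .Lavx_shortcut only when both bits are present (0x50000000, i.e. 1342177280), otherwise falling through to the SSSE3 path. A hedged C rendering of that dispatch; the constants are copied from the diff, while the bit interpretations (bit 28 is the usual CPUID AVX flag; bit 30 of the first word is a synthetic flag in OpenSSL-style capability vectors) are a reading of the constants, not something the patch states:

    #include <stdint.h>

    /* Illustrative mirror of the capability test; cap0 and cap1 stand
       for the two OPENSSL_ia32cap_P words loaded into %eax and %edx. */
    static int sha1_use_avx(uint32_t cap0, uint32_t cap1) {
        uint32_t bits = (cap1 & 0x10000000u)   /* andl $268435456,%edx  */
                      | (cap0 & 0x40000000u);  /* andl $1073741824,%eax */
        return bits == 0x50000000u;            /* cmpl $1342177280 / je */
    }

A similar 0x50000000 comparison gates the .L004AVX branch in the sha256-586.S hunk further down.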
movl 28(%esp),%ecx - subl $32,%esp - movdqu (%edi),%xmm0 - movd 16(%edi),%xmm1 - andl $-32,%esp - movdqa 80(%ebp),%xmm3 - movdqu (%esi),%xmm4 - pshufd $27,%xmm0,%xmm0 - movdqu 16(%esi),%xmm5 - pshufd $27,%xmm1,%xmm1 - movdqu 32(%esi),%xmm6 -.byte 102,15,56,0,227 - movdqu 48(%esi),%xmm7 -.byte 102,15,56,0,235 -.byte 102,15,56,0,243 -.byte 102,15,56,0,251 - jmp .L004loop_shaext -.align 16 -.L004loop_shaext: - decl %ecx - leal 64(%esi),%eax - movdqa %xmm1,(%esp) - paddd %xmm4,%xmm1 - cmovnel %eax,%esi - movdqa %xmm0,16(%esp) -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,0 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,0 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,0 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,0 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,0 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,1 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,1 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,1 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,1 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,1 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,2 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,2 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,2 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,2 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,2 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,3 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 - movdqu (%esi),%xmm4 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,3 -.byte 15,56,200,213 - movdqu 16(%esi),%xmm5 -.byte 102,15,56,0,227 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,3 -.byte 15,56,200,206 - movdqu 32(%esi),%xmm6 -.byte 102,15,56,0,235 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,3 -.byte 15,56,200,215 - movdqu 48(%esi),%xmm7 -.byte 102,15,56,0,243 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,3 - movdqa (%esp),%xmm2 -.byte 102,15,56,0,251 -.byte 15,56,200,202 - paddd 16(%esp),%xmm0 - jnz .L004loop_shaext - pshufd $27,%xmm0,%xmm0 - pshufd $27,%xmm1,%xmm1 - movdqu %xmm0,(%edi) - movd %xmm1,16(%edi) - movl %ebx,%esp - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size _sha1_block_data_order_shaext,.-_sha1_block_data_order_shaext .hidden _sha1_block_data_order_ssse3 .type _sha1_block_data_order_ssse3,@function .align 16 @@ -1572,10 +1404,10 @@ _sha1_block_data_order_ssse3: pushl %ebx pushl %esi pushl %edi - call .L005pic_point -.L005pic_point: + call .L003pic_point +.L003pic_point: popl 
%ebp - leal .LK_XX_XX-.L005pic_point(%ebp),%ebp + leal .LK_XX_XX-.L003pic_point(%ebp),%ebp .Lssse3_shortcut: movdqa (%ebp),%xmm7 movdqa 16(%ebp),%xmm0 @@ -1628,9 +1460,9 @@ _sha1_block_data_order_ssse3: xorl %edx,%ebp pshufd $238,%xmm0,%xmm4 andl %ebp,%esi - jmp .L006loop + jmp .L004loop .align 16 -.L006loop: +.L004loop: rorl $2,%ebx xorl %edx,%esi movl %eax,%ebp @@ -2533,7 +2365,7 @@ _sha1_block_data_order_ssse3: addl %edx,%ecx movl 196(%esp),%ebp cmpl 200(%esp),%ebp - je .L007done + je .L005done movdqa 160(%esp),%xmm7 movdqa 176(%esp),%xmm6 movdqu (%ebp),%xmm0 @@ -2668,9 +2500,9 @@ _sha1_block_data_order_ssse3: pshufd $238,%xmm0,%xmm4 andl %ebx,%esi movl %ebp,%ebx - jmp .L006loop + jmp .L004loop .align 16 -.L007done: +.L005done: addl 16(%esp),%ebx xorl %edi,%esi movl %ecx,%ebp @@ -2784,6 +2616,1177 @@ _sha1_block_data_order_ssse3: popl %ebp ret .size _sha1_block_data_order_ssse3,.-_sha1_block_data_order_ssse3 +.hidden _sha1_block_data_order_avx +.type _sha1_block_data_order_avx,@function +.align 16 +_sha1_block_data_order_avx: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + call .L006pic_point +.L006pic_point: + popl %ebp + leal .LK_XX_XX-.L006pic_point(%ebp),%ebp +.Lavx_shortcut: + vzeroall + vmovdqa (%ebp),%xmm7 + vmovdqa 16(%ebp),%xmm0 + vmovdqa 32(%ebp),%xmm1 + vmovdqa 48(%ebp),%xmm2 + vmovdqa 64(%ebp),%xmm6 + movl 20(%esp),%edi + movl 24(%esp),%ebp + movl 28(%esp),%edx + movl %esp,%esi + subl $208,%esp + andl $-64,%esp + vmovdqa %xmm0,112(%esp) + vmovdqa %xmm1,128(%esp) + vmovdqa %xmm2,144(%esp) + shll $6,%edx + vmovdqa %xmm7,160(%esp) + addl %ebp,%edx + vmovdqa %xmm6,176(%esp) + addl $64,%ebp + movl %edi,192(%esp) + movl %ebp,196(%esp) + movl %edx,200(%esp) + movl %esi,204(%esp) + movl (%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx + movl 16(%edi),%edi + movl %ebx,%esi + vmovdqu -64(%ebp),%xmm0 + vmovdqu -48(%ebp),%xmm1 + vmovdqu -32(%ebp),%xmm2 + vmovdqu -16(%ebp),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + vpshufb %xmm6,%xmm1,%xmm1 + vpshufb %xmm6,%xmm2,%xmm2 + vmovdqa %xmm7,96(%esp) + vpshufb %xmm6,%xmm3,%xmm3 + vpaddd %xmm7,%xmm0,%xmm4 + vpaddd %xmm7,%xmm1,%xmm5 + vpaddd %xmm7,%xmm2,%xmm6 + vmovdqa %xmm4,(%esp) + movl %ecx,%ebp + vmovdqa %xmm5,16(%esp) + xorl %edx,%ebp + vmovdqa %xmm6,32(%esp) + andl %ebp,%esi + jmp .L007loop +.align 16 +.L007loop: + shrdl $2,%ebx,%ebx + xorl %edx,%esi + vpalignr $8,%xmm0,%xmm1,%xmm4 + movl %eax,%ebp + addl (%esp),%edi + vpaddd %xmm3,%xmm7,%xmm7 + vmovdqa %xmm0,64(%esp) + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrldq $4,%xmm3,%xmm6 + addl %esi,%edi + andl %ebx,%ebp + vpxor %xmm0,%xmm4,%xmm4 + xorl %ecx,%ebx + addl %eax,%edi + vpxor %xmm2,%xmm6,%xmm6 + shrdl $7,%eax,%eax + xorl %ecx,%ebp + vmovdqa %xmm7,48(%esp) + movl %edi,%esi + addl 4(%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 + xorl %ebx,%eax + shldl $5,%edi,%edi + addl %ebp,%edx + andl %eax,%esi + vpsrld $31,%xmm4,%xmm6 + xorl %ebx,%eax + addl %edi,%edx + shrdl $7,%edi,%edi + xorl %ebx,%esi + vpslldq $12,%xmm4,%xmm0 + vpaddd %xmm4,%xmm4,%xmm4 + movl %edx,%ebp + addl 8(%esp),%ecx + xorl %eax,%edi + shldl $5,%edx,%edx + vpsrld $30,%xmm0,%xmm7 + vpor %xmm6,%xmm4,%xmm4 + addl %esi,%ecx + andl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + vpslld $2,%xmm0,%xmm0 + shrdl $7,%edx,%edx + xorl %eax,%ebp + vpxor %xmm7,%xmm4,%xmm4 + movl %ecx,%esi + addl 12(%esp),%ebx + xorl %edi,%edx + shldl $5,%ecx,%ecx + vpxor %xmm0,%xmm4,%xmm4 + addl %ebp,%ebx + andl %edx,%esi + vmovdqa 96(%esp),%xmm0 + xorl %edi,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %edi,%esi + vpalignr 
$8,%xmm1,%xmm2,%xmm5 + movl %ebx,%ebp + addl 16(%esp),%eax + vpaddd %xmm4,%xmm0,%xmm0 + vmovdqa %xmm1,80(%esp) + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrldq $4,%xmm4,%xmm7 + addl %esi,%eax + andl %ecx,%ebp + vpxor %xmm1,%xmm5,%xmm5 + xorl %edx,%ecx + addl %ebx,%eax + vpxor %xmm3,%xmm7,%xmm7 + shrdl $7,%ebx,%ebx + xorl %edx,%ebp + vmovdqa %xmm0,(%esp) + movl %eax,%esi + addl 20(%esp),%edi + vpxor %xmm7,%xmm5,%xmm5 + xorl %ecx,%ebx + shldl $5,%eax,%eax + addl %ebp,%edi + andl %ebx,%esi + vpsrld $31,%xmm5,%xmm7 + xorl %ecx,%ebx + addl %eax,%edi + shrdl $7,%eax,%eax + xorl %ecx,%esi + vpslldq $12,%xmm5,%xmm1 + vpaddd %xmm5,%xmm5,%xmm5 + movl %edi,%ebp + addl 24(%esp),%edx + xorl %ebx,%eax + shldl $5,%edi,%edi + vpsrld $30,%xmm1,%xmm0 + vpor %xmm7,%xmm5,%xmm5 + addl %esi,%edx + andl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + vpslld $2,%xmm1,%xmm1 + shrdl $7,%edi,%edi + xorl %ebx,%ebp + vpxor %xmm0,%xmm5,%xmm5 + movl %edx,%esi + addl 28(%esp),%ecx + xorl %eax,%edi + shldl $5,%edx,%edx + vpxor %xmm1,%xmm5,%xmm5 + addl %ebp,%ecx + andl %edi,%esi + vmovdqa 112(%esp),%xmm1 + xorl %eax,%edi + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + vpalignr $8,%xmm2,%xmm3,%xmm6 + movl %ecx,%ebp + addl 32(%esp),%ebx + vpaddd %xmm5,%xmm1,%xmm1 + vmovdqa %xmm2,96(%esp) + xorl %edi,%edx + shldl $5,%ecx,%ecx + vpsrldq $4,%xmm5,%xmm0 + addl %esi,%ebx + andl %edx,%ebp + vpxor %xmm2,%xmm6,%xmm6 + xorl %edi,%edx + addl %ecx,%ebx + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%ecx,%ecx + xorl %edi,%ebp + vmovdqa %xmm1,16(%esp) + movl %ebx,%esi + addl 36(%esp),%eax + vpxor %xmm0,%xmm6,%xmm6 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + addl %ebp,%eax + andl %ecx,%esi + vpsrld $31,%xmm6,%xmm0 + xorl %edx,%ecx + addl %ebx,%eax + shrdl $7,%ebx,%ebx + xorl %edx,%esi + vpslldq $12,%xmm6,%xmm2 + vpaddd %xmm6,%xmm6,%xmm6 + movl %eax,%ebp + addl 40(%esp),%edi + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm1 + vpor %xmm0,%xmm6,%xmm6 + addl %esi,%edi + andl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + vpslld $2,%xmm2,%xmm2 + vmovdqa 64(%esp),%xmm0 + shrdl $7,%eax,%eax + xorl %ecx,%ebp + vpxor %xmm1,%xmm6,%xmm6 + movl %edi,%esi + addl 44(%esp),%edx + xorl %ebx,%eax + shldl $5,%edi,%edi + vpxor %xmm2,%xmm6,%xmm6 + addl %ebp,%edx + andl %eax,%esi + vmovdqa 112(%esp),%xmm2 + xorl %ebx,%eax + addl %edi,%edx + shrdl $7,%edi,%edi + xorl %ebx,%esi + vpalignr $8,%xmm3,%xmm4,%xmm7 + movl %edx,%ebp + addl 48(%esp),%ecx + vpaddd %xmm6,%xmm2,%xmm2 + vmovdqa %xmm3,64(%esp) + xorl %eax,%edi + shldl $5,%edx,%edx + vpsrldq $4,%xmm6,%xmm1 + addl %esi,%ecx + andl %edi,%ebp + vpxor %xmm3,%xmm7,%xmm7 + xorl %eax,%edi + addl %edx,%ecx + vpxor %xmm5,%xmm1,%xmm1 + shrdl $7,%edx,%edx + xorl %eax,%ebp + vmovdqa %xmm2,32(%esp) + movl %ecx,%esi + addl 52(%esp),%ebx + vpxor %xmm1,%xmm7,%xmm7 + xorl %edi,%edx + shldl $5,%ecx,%ecx + addl %ebp,%ebx + andl %edx,%esi + vpsrld $31,%xmm7,%xmm1 + xorl %edi,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %edi,%esi + vpslldq $12,%xmm7,%xmm3 + vpaddd %xmm7,%xmm7,%xmm7 + movl %ebx,%ebp + addl 56(%esp),%eax + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm2 + vpor %xmm1,%xmm7,%xmm7 + addl %esi,%eax + andl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + vpslld $2,%xmm3,%xmm3 + vmovdqa 80(%esp),%xmm1 + shrdl $7,%ebx,%ebx + xorl %edx,%ebp + vpxor %xmm2,%xmm7,%xmm7 + movl %eax,%esi + addl 60(%esp),%edi + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpxor %xmm3,%xmm7,%xmm7 + addl %ebp,%edi + andl %ebx,%esi + vmovdqa 112(%esp),%xmm3 + xorl %ecx,%ebx + addl %eax,%edi + vpalignr $8,%xmm6,%xmm7,%xmm2 + vpxor 
%xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + xorl %ecx,%esi + movl %edi,%ebp + addl (%esp),%edx + vpxor %xmm1,%xmm0,%xmm0 + vmovdqa %xmm4,80(%esp) + xorl %ebx,%eax + shldl $5,%edi,%edi + vmovdqa %xmm3,%xmm4 + vpaddd %xmm7,%xmm3,%xmm3 + addl %esi,%edx + andl %eax,%ebp + vpxor %xmm2,%xmm0,%xmm0 + xorl %ebx,%eax + addl %edi,%edx + shrdl $7,%edi,%edi + xorl %ebx,%ebp + vpsrld $30,%xmm0,%xmm2 + vmovdqa %xmm3,48(%esp) + movl %edx,%esi + addl 4(%esp),%ecx + xorl %eax,%edi + shldl $5,%edx,%edx + vpslld $2,%xmm0,%xmm0 + addl %ebp,%ecx + andl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + movl %ecx,%ebp + addl 8(%esp),%ebx + vpor %xmm2,%xmm0,%xmm0 + xorl %edi,%edx + shldl $5,%ecx,%ecx + vmovdqa 96(%esp),%xmm2 + addl %esi,%ebx + andl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + addl 12(%esp),%eax + xorl %edi,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm7,%xmm0,%xmm3 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm5,96(%esp) + addl %esi,%edi + xorl %ecx,%ebp + vmovdqa %xmm4,%xmm5 + vpaddd %xmm0,%xmm4,%xmm4 + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpxor %xmm3,%xmm1,%xmm1 + addl 20(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + vpsrld $30,%xmm1,%xmm3 + vmovdqa %xmm4,(%esp) + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + vpslld $2,%xmm1,%xmm1 + addl 24(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + vpor %xmm3,%xmm1,%xmm1 + addl 28(%esp),%ebx + xorl %edi,%ebp + vmovdqa 64(%esp),%xmm3 + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpalignr $8,%xmm0,%xmm1,%xmm4 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + vpxor %xmm3,%xmm2,%xmm2 + vmovdqa %xmm6,64(%esp) + addl %esi,%eax + xorl %edx,%ebp + vmovdqa 128(%esp),%xmm6 + vpaddd %xmm1,%xmm5,%xmm5 + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpxor %xmm4,%xmm2,%xmm2 + addl 36(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm4 + vmovdqa %xmm5,16(%esp) + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpslld $2,%xmm2,%xmm2 + addl 40(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + vpor %xmm4,%xmm2,%xmm2 + addl 44(%esp),%ecx + xorl %eax,%ebp + vmovdqa 80(%esp),%xmm4 + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + vpalignr $8,%xmm1,%xmm2,%xmm5 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + vpxor %xmm4,%xmm3,%xmm3 + vmovdqa %xmm7,80(%esp) + addl %esi,%ebx + xorl %edi,%ebp + vmovdqa %xmm6,%xmm7 + vpaddd %xmm2,%xmm6,%xmm6 + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpxor %xmm5,%xmm3,%xmm3 + addl 52(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm5 + vmovdqa %xmm6,32(%esp) + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpslld $2,%xmm3,%xmm3 + addl 56(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ecx,%ebp + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpor %xmm5,%xmm3,%xmm3 + addl 60(%esp),%edx + xorl %ebx,%ebp + vmovdqa 96(%esp),%xmm5 + 
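In the scalar half of these interleaved AVX rounds, the xorl/andl/addl pattern is the standard SHA-1 round update a' = rotl(a,5) + f(b,c,d) + e + k + w[i]: the shldl $5 gives the rotl-5 use of a, and the later shrdl $7 nets a rotr-2 (= rotl 30) as that variable becomes the next round's b. Only the boolean function f changes across the 80 rounds; the FIPS 180-4 definitions in C (standard algorithm, not code from this file):

    #include <stdint.h>

    /* SHA-1 stage functions: rounds 0-19 use Ch, rounds 20-39 and
       60-79 use Parity, rounds 40-59 use Maj. */
    static uint32_t sha1_ch(uint32_t b, uint32_t c, uint32_t d) {
        return (b & c) | (~b & d);
    }
    static uint32_t sha1_parity(uint32_t b, uint32_t c, uint32_t d) {
        return b ^ c ^ d;
    }
    static uint32_t sha1_maj(uint32_t b, uint32_t c, uint32_t d) {
        return (b & c) | (b & d) | (c & d);
    }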
movl %edi,%esi + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + vpalignr $8,%xmm2,%xmm3,%xmm6 + vpxor %xmm0,%xmm4,%xmm4 + addl (%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + vpxor %xmm5,%xmm4,%xmm4 + vmovdqa %xmm0,96(%esp) + addl %esi,%ecx + xorl %eax,%ebp + vmovdqa %xmm7,%xmm0 + vpaddd %xmm3,%xmm7,%xmm7 + shrdl $7,%edi,%edi + addl %edx,%ecx + vpxor %xmm6,%xmm4,%xmm4 + addl 4(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + vpsrld $30,%xmm4,%xmm6 + vmovdqa %xmm7,48(%esp) + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpslld $2,%xmm4,%xmm4 + addl 8(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpor %xmm6,%xmm4,%xmm4 + addl 12(%esp),%edi + xorl %ecx,%ebp + vmovdqa 64(%esp),%xmm6 + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpalignr $8,%xmm3,%xmm4,%xmm7 + vpxor %xmm1,%xmm5,%xmm5 + addl 16(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + vpxor %xmm6,%xmm5,%xmm5 + vmovdqa %xmm1,64(%esp) + addl %esi,%edx + xorl %ebx,%ebp + vmovdqa %xmm0,%xmm1 + vpaddd %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + addl %edi,%edx + vpxor %xmm7,%xmm5,%xmm5 + addl 20(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + vpsrld $30,%xmm5,%xmm7 + vmovdqa %xmm0,(%esp) + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + vpslld $2,%xmm5,%xmm5 + addl 24(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpor %xmm7,%xmm5,%xmm5 + addl 28(%esp),%eax + vmovdqa 80(%esp),%xmm7 + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%ebp + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm4,%xmm5,%xmm0 + vpxor %xmm2,%xmm6,%xmm6 + addl 32(%esp),%edi + andl %ecx,%esi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vpxor %xmm7,%xmm6,%xmm6 + vmovdqa %xmm2,80(%esp) + movl %eax,%ebp + xorl %ecx,%esi + vmovdqa %xmm1,%xmm2 + vpaddd %xmm5,%xmm1,%xmm1 + shldl $5,%eax,%eax + addl %esi,%edi + vpxor %xmm0,%xmm6,%xmm6 + xorl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + addl 36(%esp),%edx + vpsrld $30,%xmm6,%xmm0 + vmovdqa %xmm1,16(%esp) + andl %ebx,%ebp + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %edi,%esi + vpslld $2,%xmm6,%xmm6 + xorl %ebx,%ebp + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %edi,%edx + addl 40(%esp),%ecx + andl %eax,%esi + vpor %xmm0,%xmm6,%xmm6 + xorl %ebx,%eax + shrdl $7,%edi,%edi + vmovdqa 96(%esp),%xmm0 + movl %edx,%ebp + xorl %eax,%esi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + addl 44(%esp),%ebx + andl %edi,%ebp + xorl %eax,%edi + shrdl $7,%edx,%edx + movl %ecx,%esi + xorl %edi,%ebp + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edx,%esi + xorl %edi,%edx + addl %ecx,%ebx + vpalignr $8,%xmm5,%xmm6,%xmm1 + vpxor %xmm3,%xmm7,%xmm7 + addl 48(%esp),%eax + andl %edx,%esi + xorl %edi,%edx + shrdl $7,%ecx,%ecx + vpxor %xmm0,%xmm7,%xmm7 + vmovdqa %xmm3,96(%esp) + movl %ebx,%ebp + xorl %edx,%esi + vmovdqa 144(%esp),%xmm3 + vpaddd %xmm6,%xmm2,%xmm2 + shldl $5,%ebx,%ebx + addl %esi,%eax + vpxor %xmm1,%xmm7,%xmm7 + xorl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + addl 52(%esp),%edi + vpsrld $30,%xmm7,%xmm1 + vmovdqa %xmm2,32(%esp) + andl %ecx,%ebp + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + 
vpslld $2,%xmm7,%xmm7 + xorl %ecx,%ebp + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%edi + addl 56(%esp),%edx + andl %ebx,%esi + vpor %xmm1,%xmm7,%xmm7 + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vmovdqa 64(%esp),%xmm1 + movl %edi,%ebp + xorl %ebx,%esi + shldl $5,%edi,%edi + addl %esi,%edx + xorl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + addl 60(%esp),%ecx + andl %eax,%ebp + xorl %ebx,%eax + shrdl $7,%edi,%edi + movl %edx,%esi + xorl %eax,%ebp + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + vpalignr $8,%xmm6,%xmm7,%xmm2 + vpxor %xmm4,%xmm0,%xmm0 + addl (%esp),%ebx + andl %edi,%esi + xorl %eax,%edi + shrdl $7,%edx,%edx + vpxor %xmm1,%xmm0,%xmm0 + vmovdqa %xmm4,64(%esp) + movl %ecx,%ebp + xorl %edi,%esi + vmovdqa %xmm3,%xmm4 + vpaddd %xmm7,%xmm3,%xmm3 + shldl $5,%ecx,%ecx + addl %esi,%ebx + vpxor %xmm2,%xmm0,%xmm0 + xorl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + addl 4(%esp),%eax + vpsrld $30,%xmm0,%xmm2 + vmovdqa %xmm3,48(%esp) + andl %edx,%ebp + xorl %edi,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + vpslld $2,%xmm0,%xmm0 + xorl %edx,%ebp + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + addl 8(%esp),%edi + andl %ecx,%esi + vpor %xmm2,%xmm0,%xmm0 + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vmovdqa 80(%esp),%xmm2 + movl %eax,%ebp + xorl %ecx,%esi + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + addl 12(%esp),%edx + andl %ebx,%ebp + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %edi,%esi + xorl %ebx,%ebp + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %edi,%edx + vpalignr $8,%xmm7,%xmm0,%xmm3 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%esp),%ecx + andl %eax,%esi + xorl %ebx,%eax + shrdl $7,%edi,%edi + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm5,80(%esp) + movl %edx,%ebp + xorl %eax,%esi + vmovdqa %xmm4,%xmm5 + vpaddd %xmm0,%xmm4,%xmm4 + shldl $5,%edx,%edx + addl %esi,%ecx + vpxor %xmm3,%xmm1,%xmm1 + xorl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + addl 20(%esp),%ebx + vpsrld $30,%xmm1,%xmm3 + vmovdqa %xmm4,(%esp) + andl %edi,%ebp + xorl %eax,%edi + shrdl $7,%edx,%edx + movl %ecx,%esi + vpslld $2,%xmm1,%xmm1 + xorl %edi,%ebp + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edx,%esi + xorl %edi,%edx + addl %ecx,%ebx + addl 24(%esp),%eax + andl %edx,%esi + vpor %xmm3,%xmm1,%xmm1 + xorl %edi,%edx + shrdl $7,%ecx,%ecx + vmovdqa 96(%esp),%xmm3 + movl %ebx,%ebp + xorl %edx,%esi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + addl 28(%esp),%edi + andl %ecx,%ebp + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + xorl %ecx,%ebp + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%edi + vpalignr $8,%xmm0,%xmm1,%xmm4 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%esp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vpxor %xmm3,%xmm2,%xmm2 + vmovdqa %xmm6,96(%esp) + movl %edi,%ebp + xorl %ebx,%esi + vmovdqa %xmm5,%xmm6 + vpaddd %xmm1,%xmm5,%xmm5 + shldl $5,%edi,%edi + addl %esi,%edx + vpxor %xmm4,%xmm2,%xmm2 + xorl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + addl 36(%esp),%ecx + vpsrld $30,%xmm2,%xmm4 + vmovdqa %xmm5,16(%esp) + andl %eax,%ebp + xorl %ebx,%eax + shrdl $7,%edi,%edi + movl %edx,%esi + vpslld $2,%xmm2,%xmm2 + xorl %eax,%ebp + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + addl 40(%esp),%ebx + andl %edi,%esi + vpor %xmm4,%xmm2,%xmm2 + xorl %eax,%edi + shrdl $7,%edx,%edx + vmovdqa 
64(%esp),%xmm4 + movl %ecx,%ebp + xorl %edi,%esi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + addl 44(%esp),%eax + andl %edx,%ebp + xorl %edi,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%ebp + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + addl %ebx,%eax + vpalignr $8,%xmm1,%xmm2,%xmm5 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + vpxor %xmm4,%xmm3,%xmm3 + vmovdqa %xmm7,64(%esp) + addl %esi,%edi + xorl %ecx,%ebp + vmovdqa %xmm6,%xmm7 + vpaddd %xmm2,%xmm6,%xmm6 + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpxor %xmm5,%xmm3,%xmm3 + addl 52(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + vpsrld $30,%xmm3,%xmm5 + vmovdqa %xmm6,32(%esp) + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + vpslld $2,%xmm3,%xmm3 + addl 56(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + vpor %xmm5,%xmm3,%xmm3 + addl 60(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl (%esp),%eax + vpaddd %xmm3,%xmm7,%xmm7 + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + vmovdqa %xmm7,48(%esp) + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 4(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 8(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + addl 12(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + movl 196(%esp),%ebp + cmpl 200(%esp),%ebp + je .L008done + vmovdqa 160(%esp),%xmm7 + vmovdqa 176(%esp),%xmm6 + vmovdqu (%ebp),%xmm0 + vmovdqu 16(%ebp),%xmm1 + vmovdqu 32(%ebp),%xmm2 + vmovdqu 48(%ebp),%xmm3 + addl $64,%ebp + vpshufb %xmm6,%xmm0,%xmm0 + movl %ebp,196(%esp) + vmovdqa %xmm7,96(%esp) + addl 16(%esp),%ebx + xorl %edi,%esi + vpshufb %xmm6,%xmm1,%xmm1 + movl %ecx,%ebp + shldl $5,%ecx,%ecx + vpaddd %xmm7,%xmm0,%xmm4 + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + vmovdqa %xmm4,(%esp) + addl 20(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ecx,%ebp + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 28(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + addl 32(%esp),%ecx + xorl %eax,%esi + vpshufb %xmm6,%xmm2,%xmm2 + movl %edx,%ebp + shldl $5,%edx,%edx + vpaddd %xmm7,%xmm1,%xmm5 + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + vmovdqa %xmm5,16(%esp) + addl 36(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 48(%esp),%edx + xorl %ebx,%esi + vpshufb %xmm6,%xmm3,%xmm3 + movl 
%edi,%ebp + shldl $5,%edi,%edi + vpaddd %xmm7,%xmm2,%xmm6 + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + vmovdqa %xmm6,32(%esp) + addl 52(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + addl 56(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + movl 192(%esp),%ebp + addl (%ebp),%eax + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,%ebx + movl %ecx,8(%ebp) + xorl %edx,%ebx + movl %edx,12(%ebp) + movl %edi,16(%ebp) + movl %esi,%ebp + andl %ebx,%esi + movl %ebp,%ebx + jmp .L007loop +.align 16 +.L008done: + addl 16(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 20(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ecx,%ebp + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 28(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + addl 32(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + addl 36(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 48(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + addl 52(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + addl 56(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vzeroall + movl 192(%esp),%ebp + addl (%ebp),%eax + movl 204(%esp),%esp + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,8(%ebp) + movl %edx,12(%ebp) + movl %edi,16(%ebp) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size _sha1_block_data_order_avx,.-_sha1_block_data_order_avx .align 64 .LK_XX_XX: .long 1518500249,1518500249,1518500249,1518500249 diff --git a/packager/third_party/boringssl/linux-x86/crypto/sha/sha256-586.S b/packager/third_party/boringssl/linux-x86/crypto/sha/sha256-586.S index 08d948432b..38acbd8374 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/sha/sha256-586.S +++ b/packager/third_party/boringssl/linux-x86/crypto/sha/sha256-586.S @@ -37,11 +37,10 @@ sha256_block_data_order: jz .L003no_xmm andl $1073741824,%ecx andl 
$268435968,%ebx - testl $536870912,%edx - jnz .L004shaext orl %ebx,%ecx andl $1342177280,%ecx cmpl $1342177280,%ecx + je .L004AVX testl $512,%ebx jnz .L005SSSE3 .L003no_xmm: @@ -3167,204 +3166,6 @@ sha256_block_data_order: popl %ebp ret .align 32 -.L004shaext: - subl $32,%esp - movdqu (%esi),%xmm1 - leal 128(%ebp),%ebp - movdqu 16(%esi),%xmm2 - movdqa 128(%ebp),%xmm7 - pshufd $27,%xmm1,%xmm0 - pshufd $177,%xmm1,%xmm1 - pshufd $27,%xmm2,%xmm2 -.byte 102,15,58,15,202,8 - punpcklqdq %xmm0,%xmm2 - jmp .L010loop_shaext -.align 16 -.L010loop_shaext: - movdqu (%edi),%xmm3 - movdqu 16(%edi),%xmm4 - movdqu 32(%edi),%xmm5 -.byte 102,15,56,0,223 - movdqu 48(%edi),%xmm6 - movdqa %xmm2,16(%esp) - movdqa -128(%ebp),%xmm0 - paddd %xmm3,%xmm0 -.byte 102,15,56,0,231 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - nop - movdqa %xmm1,(%esp) -.byte 15,56,203,202 - movdqa -112(%ebp),%xmm0 - paddd %xmm4,%xmm0 -.byte 102,15,56,0,239 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - leal 64(%edi),%edi -.byte 15,56,204,220 -.byte 15,56,203,202 - movdqa -96(%ebp),%xmm0 - paddd %xmm5,%xmm0 -.byte 102,15,56,0,247 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - movdqa -80(%ebp),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa -64(%ebp),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa -48(%ebp),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 - nop - paddd %xmm7,%xmm6 -.byte 15,56,204,220 -.byte 15,56,203,202 - movdqa -32(%ebp),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,205,245 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - movdqa -16(%ebp),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa (%ebp),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa 16(%ebp),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 - nop - paddd %xmm7,%xmm6 -.byte 15,56,204,220 -.byte 15,56,203,202 - movdqa 32(%ebp),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,205,245 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - movdqa 48(%ebp),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa 64(%ebp),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa 80(%ebp),%xmm0 - 
paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 -.byte 15,56,203,202 - paddd %xmm7,%xmm6 - movdqa 96(%ebp),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 -.byte 15,56,205,245 - movdqa 128(%ebp),%xmm7 -.byte 15,56,203,202 - movdqa 112(%ebp),%xmm0 - paddd %xmm6,%xmm0 - nop -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - cmpl %edi,%eax - nop -.byte 15,56,203,202 - paddd 16(%esp),%xmm2 - paddd (%esp),%xmm1 - jnz .L010loop_shaext - pshufd $177,%xmm2,%xmm2 - pshufd $27,%xmm1,%xmm7 - pshufd $177,%xmm1,%xmm1 - punpckhqdq %xmm2,%xmm1 -.byte 102,15,58,15,215,8 - movl 44(%esp),%esp - movdqu %xmm1,(%esi) - movdqu %xmm2,16(%esi) - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.align 32 .L005SSSE3: leal -96(%esp),%esp movl (%esi),%eax @@ -3384,9 +3185,9 @@ sha256_block_data_order: movl %ecx,24(%esp) movl %esi,28(%esp) movdqa 256(%ebp),%xmm7 - jmp .L011grand_ssse3 + jmp .L010grand_ssse3 .align 16 -.L011grand_ssse3: +.L010grand_ssse3: movdqu (%edi),%xmm0 movdqu 16(%edi),%xmm1 movdqu 32(%edi),%xmm2 @@ -3409,9 +3210,9 @@ sha256_block_data_order: paddd %xmm3,%xmm7 movdqa %xmm6,64(%esp) movdqa %xmm7,80(%esp) - jmp .L012ssse3_00_47 + jmp .L011ssse3_00_47 .align 16 -.L012ssse3_00_47: +.L011ssse3_00_47: addl $64,%ebp movl %edx,%ecx movdqa %xmm1,%xmm4 @@ -4054,7 +3855,7 @@ sha256_block_data_order: addl %ecx,%eax movdqa %xmm6,80(%esp) cmpl $66051,64(%ebp) - jne .L012ssse3_00_47 + jne .L011ssse3_00_47 movl %edx,%ecx rorl $14,%edx movl 20(%esp),%esi @@ -4568,12 +4369,1193 @@ sha256_block_data_order: movdqa 64(%ebp),%xmm7 subl $192,%ebp cmpl 104(%esp),%edi - jb .L011grand_ssse3 + jb .L010grand_ssse3 movl 108(%esp),%esp popl %edi popl %esi popl %ebx popl %ebp ret +.align 32 +.L004AVX: + leal -96(%esp),%esp + vzeroall + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edi + movl %ebx,4(%esp) + xorl %ecx,%ebx + movl %ecx,8(%esp) + movl %edi,12(%esp) + movl 16(%esi),%edx + movl 20(%esi),%edi + movl 24(%esi),%ecx + movl 28(%esi),%esi + movl %edi,20(%esp) + movl 100(%esp),%edi + movl %ecx,24(%esp) + movl %esi,28(%esp) + vmovdqa 256(%ebp),%xmm7 + jmp .L012grand_avx +.align 32 +.L012grand_avx: + vmovdqu (%edi),%xmm0 + vmovdqu 16(%edi),%xmm1 + vmovdqu 32(%edi),%xmm2 + vmovdqu 48(%edi),%xmm3 + addl $64,%edi + vpshufb %xmm7,%xmm0,%xmm0 + movl %edi,100(%esp) + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd (%ebp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 16(%ebp),%xmm1,%xmm5 + vpaddd 32(%ebp),%xmm2,%xmm6 + vpaddd 48(%ebp),%xmm3,%xmm7 + vmovdqa %xmm4,32(%esp) + vmovdqa %xmm5,48(%esp) + vmovdqa %xmm6,64(%esp) + vmovdqa %xmm7,80(%esp) + jmp .L013avx_00_47 +.align 16 +.L013avx_00_47: + addl $64,%ebp + vpalignr $4,%xmm0,%xmm1,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + vpalignr $4,%xmm2,%xmm3,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + vpaddd %xmm7,%xmm0,%xmm0 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + vpshufd $250,%xmm3,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 32(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl 
%edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + vpaddd %xmm4,%xmm0,%xmm0 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 36(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + vpaddd %xmm7,%xmm0,%xmm0 + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm0,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 40(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm0,%xmm0 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + vpaddd (%ebp),%xmm0,%xmm6 + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,32(%esp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + vpalignr $4,%xmm3,%xmm0,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + vpaddd %xmm7,%xmm1,%xmm1 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + vpshufd $250,%xmm0,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 48(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl 
%edi,%edx + movl 16(%esp),%edi + vpaddd %xmm4,%xmm1,%xmm1 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 52(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + vpaddd %xmm7,%xmm1,%xmm1 + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm1,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 56(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm1,%xmm1 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + vpaddd 16(%ebp),%xmm1,%xmm6 + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,48(%esp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + vpalignr $4,%xmm0,%xmm1,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + vpaddd %xmm7,%xmm2,%xmm2 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + vpshufd $250,%xmm1,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 64(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + vpaddd %xmm4,%xmm2,%xmm2 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 68(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + 
addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + vpaddd %xmm7,%xmm2,%xmm2 + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm2,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 72(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm2,%xmm2 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + vpaddd 32(%ebp),%xmm2,%xmm6 + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,64(%esp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + vpalignr $4,%xmm1,%xmm2,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + vpaddd %xmm7,%xmm3,%xmm3 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + vpshufd $250,%xmm2,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 80(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + vpaddd %xmm4,%xmm3,%xmm3 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 84(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + vpaddd %xmm7,%xmm3,%xmm3 + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm3,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + 
vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 88(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm3,%xmm3 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + vpaddd 48(%ebp),%xmm3,%xmm6 + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,80(%esp) + cmpl $66051,64(%ebp) + jne .L013avx_00_47 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 32(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 36(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 40(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl 
%esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 48(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 52(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 56(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 64(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 68(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + 
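In this scalar tail of the SHA-256 AVX path, shrdl with the same register in both operands acts as a plain 32-bit rotate: the $14/$5/$6 chain with interleaved xorl composes to Sigma1(e) = ROTR6 ^ ROTR11 ^ ROTR25, the $9/$11/$2 chain to Sigma0(a) = ROTR2 ^ ROTR13 ^ ROTR22, and the surrounding andl/xorl pairs are Ch and Maj. The FIPS 180-4 definitions in C, for orientation only:

    #include <stdint.h>

    static uint32_t rotr32(uint32_t v, int n) {
        return (v >> n) | (v << (32 - n));
    }

    /* SHA-256 round primitives (FIPS 180-4). */
    static uint32_t Sigma0(uint32_t a) {
        return rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);
    }
    static uint32_t Sigma1(uint32_t e) {
        return rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
    }
    static uint32_t Ch(uint32_t e, uint32_t f, uint32_t g) {
        return (e & f) ^ (~e & g);
    }
    static uint32_t Maj(uint32_t a, uint32_t b, uint32_t c) {
        return (a & b) ^ (a & c) ^ (b & c);
    }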
xorl %edi,%eax + addl 20(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 72(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 80(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 84(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 88(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl 96(%esp),%esi + xorl %edi,%ebx + movl 12(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebx + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl %ebx,4(%esp) + xorl %edi,%ebx + movl %edi,8(%esp) + movl %ecx,12(%esp) + movl 20(%esp),%edi + movl 24(%esp),%ecx + addl 16(%esi),%edx + addl 20(%esi),%edi + addl 24(%esi),%ecx + movl %edx,16(%esi) + movl %edi,20(%esi) + movl %edi,20(%esp) + movl 28(%esp),%edi + movl %ecx,24(%esi) + addl 28(%esi),%edi + 
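# End of a 64-byte block in the sha256-586.S AVX path: the context pointer is reloaded from
# 96(%esp) and the eight working variables are added back into the hash state, with fresh
# copies spilled to 0-28(%esp) for the next iteration of .L012grand_avx.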
movl %ecx,24(%esp) + movl %edi,28(%esi) + movl %edi,28(%esp) + movl 100(%esp),%edi + vmovdqa 64(%ebp),%xmm7 + subl $192,%ebp + cmpl 104(%esp),%edi + jb .L012grand_avx + movl 108(%esp),%esp + vzeroall + popl %edi + popl %esi + popl %ebx + popl %ebp + ret .size sha256_block_data_order,.-.L_sha256_block_data_order_begin #endif diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/aes/aes-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/aes/aes-x86_64.S index 5f4b057ffa..361e84c77f 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/aes/aes-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/aes/aes-x86_64.S @@ -82,8 +82,8 @@ _x86_64_AES_encrypt: movl 0(%r14,%rdi,8),%edi movl 0(%r14,%rbp,8),%ebp - andl $65280,%edi - andl $65280,%ebp + andl $0x0000ff00,%edi + andl $0x0000ff00,%ebp xorl %edi,%r10d xorl %ebp,%r11d @@ -95,8 +95,8 @@ _x86_64_AES_encrypt: movl 0(%r14,%rsi,8),%esi movl 0(%r14,%rdi,8),%edi - andl $65280,%esi - andl $65280,%edi + andl $0x0000ff00,%esi + andl $0x0000ff00,%edi shrl $16,%ebx xorl %esi,%r12d xorl %edi,%r8d @@ -109,9 +109,9 @@ _x86_64_AES_encrypt: movl 0(%r14,%rdi,8),%edi movl 0(%r14,%rbp,8),%ebp - andl $16711680,%esi - andl $16711680,%edi - andl $16711680,%ebp + andl $0x00ff0000,%esi + andl $0x00ff0000,%edi + andl $0x00ff0000,%ebp xorl %esi,%r10d xorl %edi,%r11d @@ -124,9 +124,9 @@ _x86_64_AES_encrypt: movl 2(%r14,%rdi,8),%edi movl 2(%r14,%rbp,8),%ebp - andl $16711680,%esi - andl $4278190080,%edi - andl $4278190080,%ebp + andl $0x00ff0000,%esi + andl $0xff000000,%edi + andl $0xff000000,%ebp xorl %esi,%r8d xorl %edi,%r10d @@ -139,8 +139,8 @@ _x86_64_AES_encrypt: movl 2(%r14,%rdi,8),%edi movl 16+0(%r15),%eax - andl $4278190080,%esi - andl $4278190080,%edi + andl $0xff000000,%esi + andl $0xff000000,%edi xorl %esi,%r12d xorl %edi,%r8d @@ -242,8 +242,8 @@ _x86_64_AES_encrypt_compact: xorl %r8d,%edx cmpq 16(%rsp),%r15 je .Lenc_compact_done - movl $2155905152,%r10d - movl $2155905152,%r11d + movl $0x80808080,%r10d + movl $0x80808080,%r11d andl %eax,%r10d andl %ebx,%r11d movl %r10d,%esi @@ -254,10 +254,10 @@ _x86_64_AES_encrypt_compact: leal (%rbx,%rbx,1),%r9d subl %r10d,%esi subl %r11d,%edi - andl $4278124286,%r8d - andl $4278124286,%r9d - andl $454761243,%esi - andl $454761243,%edi + andl $0xfefefefe,%r8d + andl $0xfefefefe,%r9d + andl $0x1b1b1b1b,%esi + andl $0x1b1b1b1b,%edi movl %eax,%r10d movl %ebx,%r11d xorl %esi,%r8d @@ -265,9 +265,9 @@ _x86_64_AES_encrypt_compact: xorl %r8d,%eax xorl %r9d,%ebx - movl $2155905152,%r12d + movl $0x80808080,%r12d roll $24,%eax - movl $2155905152,%ebp + movl $0x80808080,%ebp roll $24,%ebx andl %ecx,%r12d andl %edx,%ebp @@ -290,10 +290,10 @@ _x86_64_AES_encrypt_compact: xorl %r10d,%eax xorl %r11d,%ebx - andl $4278124286,%r8d - andl $4278124286,%r9d - andl $454761243,%esi - andl $454761243,%edi + andl $0xfefefefe,%r8d + andl $0xfefefefe,%r9d + andl $0x1b1b1b1b,%esi + andl $0x1b1b1b1b,%edi movl %ecx,%r12d movl %edx,%ebp xorl %esi,%r8d @@ -345,7 +345,7 @@ asm_AES_encrypt: andq $-64,%rsp subq %rsp,%rcx negq %rcx - andq $960,%rcx + andq $0x3c0,%rcx subq %rcx,%rsp subq $32,%rsp @@ -370,7 +370,7 @@ asm_AES_encrypt: leaq .LAES_Te+2048(%rip),%r14 leaq 768(%rsp),%rbp subq %r14,%rbp - andq $768,%rbp + andq $0x300,%rbp leaq (%r14,%rbp,1),%r14 call _x86_64_AES_encrypt_compact @@ -791,7 +791,7 @@ asm_AES_decrypt: andq $-64,%rsp subq %rsp,%rcx negq %rcx - andq $960,%rcx + andq $0x3c0,%rcx subq %rcx,%rsp subq $32,%rsp @@ -816,7 +816,7 @@ asm_AES_decrypt: leaq .LAES_Td+2048(%rip),%r14 leaq 768(%rsp),%rbp subq 
%r14,%rbp - andq $768,%rbp + andq $0x300,%rbp leaq (%r14,%rbp,1),%r14 shrq $3,%rbp addq %rbp,%r14 @@ -1334,9 +1334,9 @@ asm_AES_cbc_encrypt: movq %r14,%r10 leaq 2304(%r14),%r11 movq %r15,%r12 - andq $4095,%r10 - andq $4095,%r11 - andq $4095,%r12 + andq $0xFFF,%r10 + andq $0xFFF,%r11 + andq $0xFFF,%r12 cmpq %r11,%r12 jb .Lcbc_te_break_out @@ -1345,7 +1345,7 @@ asm_AES_cbc_encrypt: jmp .Lcbc_te_ok .Lcbc_te_break_out: subq %r10,%r12 - andq $4095,%r12 + andq $0xFFF,%r12 addq $320,%r12 subq %r12,%r15 .align 4 @@ -1371,7 +1371,7 @@ asm_AES_cbc_encrypt: movq %r15,%r10 subq %r14,%r10 - andq $4095,%r10 + andq $0xfff,%r10 cmpq $2304,%r10 jb .Lcbc_do_ecopy cmpq $4096-248,%r10 @@ -1558,7 +1558,7 @@ asm_AES_cbc_encrypt: leaq -88-63(%rcx),%r10 subq %rbp,%r10 negq %r10 - andq $960,%r10 + andq $0x3c0,%r10 subq %r10,%rbp xchgq %rsp,%rbp @@ -1587,7 +1587,7 @@ asm_AES_cbc_encrypt: leaq 2048(%r14),%r14 leaq 768-8(%rsp),%rax subq %r14,%rax - andq $768,%rax + andq $0x300,%rax leaq (%r14,%rax,1),%r14 cmpq $0,%rbx diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/aes/aesni-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/aes/aesni-x86_64.S index 1d51d5b50e..5709a2d024 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/aes/aesni-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/aes/aesni-x86_64.S @@ -508,7 +508,7 @@ aesni_ecb_encrypt: testl %r8d,%r8d jz .Lecb_decrypt - cmpq $128,%rdx + cmpq $0x80,%rdx jb .Lecb_enc_tail movdqu (%rdi),%xmm2 @@ -520,7 +520,7 @@ aesni_ecb_encrypt: movdqu 96(%rdi),%xmm8 movdqu 112(%rdi),%xmm9 leaq 128(%rdi),%rdi - subq $128,%rdx + subq $0x80,%rdx jmp .Lecb_enc_loop8_enter .align 16 .Lecb_enc_loop8: @@ -548,7 +548,7 @@ aesni_ecb_encrypt: call _aesni_encrypt8 - subq $128,%rdx + subq $0x80,%rdx jnc .Lecb_enc_loop8 movups %xmm2,(%rsi) @@ -562,22 +562,22 @@ aesni_ecb_encrypt: movups %xmm8,96(%rsi) movups %xmm9,112(%rsi) leaq 128(%rsi),%rsi - addq $128,%rdx + addq $0x80,%rdx jz .Lecb_ret .Lecb_enc_tail: movups (%rdi),%xmm2 - cmpq $32,%rdx + cmpq $0x20,%rdx jb .Lecb_enc_one movups 16(%rdi),%xmm3 je .Lecb_enc_two movups 32(%rdi),%xmm4 - cmpq $64,%rdx + cmpq $0x40,%rdx jb .Lecb_enc_three movups 48(%rdi),%xmm5 je .Lecb_enc_four movups 64(%rdi),%xmm6 - cmpq $96,%rdx + cmpq $0x60,%rdx jb .Lecb_enc_five movups 80(%rdi),%xmm7 je .Lecb_enc_six @@ -651,7 +651,7 @@ aesni_ecb_encrypt: .align 16 .Lecb_decrypt: - cmpq $128,%rdx + cmpq $0x80,%rdx jb .Lecb_dec_tail movdqu (%rdi),%xmm2 @@ -663,7 +663,7 @@ aesni_ecb_encrypt: movdqu 96(%rdi),%xmm8 movdqu 112(%rdi),%xmm9 leaq 128(%rdi),%rdi - subq $128,%rdx + subq $0x80,%rdx jmp .Lecb_dec_loop8_enter .align 16 .Lecb_dec_loop8: @@ -692,7 +692,7 @@ aesni_ecb_encrypt: call _aesni_decrypt8 movups (%r11),%xmm0 - subq $128,%rdx + subq $0x80,%rdx jnc .Lecb_dec_loop8 movups %xmm2,(%rsi) @@ -714,22 +714,22 @@ aesni_ecb_encrypt: movups %xmm9,112(%rsi) pxor %xmm9,%xmm9 leaq 128(%rsi),%rsi - addq $128,%rdx + addq $0x80,%rdx jz .Lecb_ret .Lecb_dec_tail: movups (%rdi),%xmm2 - cmpq $32,%rdx + cmpq $0x20,%rdx jb .Lecb_dec_one movups 16(%rdi),%xmm3 je .Lecb_dec_two movups 32(%rdi),%xmm4 - cmpq $64,%rdx + cmpq $0x40,%rdx jb .Lecb_dec_three movups 48(%rdi),%xmm5 je .Lecb_dec_four movups 64(%rdi),%xmm6 - cmpq $96,%rdx + cmpq $0x60,%rdx jb .Lecb_dec_five movups 80(%rdi),%xmm7 je .Lecb_dec_six @@ -1607,7 +1607,7 @@ aesni_xts_encrypt: movdqa .Lxts_magic(%rip),%xmm8 movdqa %xmm2,%xmm15 - pshufd $95,%xmm2,%xmm9 + pshufd $0x5f,%xmm2,%xmm9 pxor %xmm0,%xmm1 movdqa %xmm9,%xmm14 paddd %xmm9,%xmm9 @@ -1706,7 +1706,7 @@ 
aesni_xts_encrypt: .byte 102,15,56,220,248 movups 64(%r11),%xmm0 movdqa %xmm8,80(%rsp) - pshufd $95,%xmm15,%xmm9 + pshufd $0x5f,%xmm15,%xmm9 jmp .Lxts_enc_loop6 .align 32 .Lxts_enc_loop6: @@ -1845,13 +1845,13 @@ aesni_xts_encrypt: jz .Lxts_enc_done pxor %xmm0,%xmm11 - cmpq $32,%rdx + cmpq $0x20,%rdx jb .Lxts_enc_one pxor %xmm0,%xmm12 je .Lxts_enc_two pxor %xmm0,%xmm13 - cmpq $64,%rdx + cmpq $0x40,%rdx jb .Lxts_enc_three pxor %xmm0,%xmm14 je .Lxts_enc_four @@ -2079,7 +2079,7 @@ aesni_xts_decrypt: movdqa .Lxts_magic(%rip),%xmm8 movdqa %xmm2,%xmm15 - pshufd $95,%xmm2,%xmm9 + pshufd $0x5f,%xmm2,%xmm9 pxor %xmm0,%xmm1 movdqa %xmm9,%xmm14 paddd %xmm9,%xmm9 @@ -2178,7 +2178,7 @@ aesni_xts_decrypt: .byte 102,15,56,222,248 movups 64(%r11),%xmm0 movdqa %xmm8,80(%rsp) - pshufd $95,%xmm15,%xmm9 + pshufd $0x5f,%xmm15,%xmm9 jmp .Lxts_dec_loop6 .align 32 .Lxts_dec_loop6: @@ -2318,13 +2318,13 @@ aesni_xts_decrypt: jz .Lxts_dec_done pxor %xmm0,%xmm12 - cmpq $32,%rdx + cmpq $0x20,%rdx jb .Lxts_dec_one pxor %xmm0,%xmm13 je .Lxts_dec_two pxor %xmm0,%xmm14 - cmpq $64,%rdx + cmpq $0x40,%rdx jb .Lxts_dec_three je .Lxts_dec_four @@ -2355,7 +2355,7 @@ aesni_xts_decrypt: pcmpgtd %xmm15,%xmm14 movdqu %xmm6,64(%rsi) leaq 80(%rsi),%rsi - pshufd $19,%xmm14,%xmm11 + pshufd $0x13,%xmm14,%xmm11 andq $15,%r9 jz .Lxts_dec_ret @@ -2645,7 +2645,7 @@ aesni_cbc_encrypt: leaq -8(%rax),%rbp movups (%r8),%xmm10 movl %r10d,%eax - cmpq $80,%rdx + cmpq $0x50,%rdx jbe .Lcbc_dec_tail movups (%rcx),%xmm0 @@ -2661,14 +2661,14 @@ aesni_cbc_encrypt: movdqu 80(%rdi),%xmm7 movdqa %xmm6,%xmm15 movl OPENSSL_ia32cap_P+4(%rip),%r9d - cmpq $112,%rdx + cmpq $0x70,%rdx jbe .Lcbc_dec_six_or_seven andl $71303168,%r9d - subq $80,%rdx + subq $0x50,%rdx cmpl $4194304,%r9d je .Lcbc_dec_loop6_enter - subq $32,%rdx + subq $0x20,%rdx leaq 112(%rcx),%rcx jmp .Lcbc_dec_loop8_enter .align 16 @@ -2683,7 +2683,7 @@ aesni_cbc_encrypt: movups 16-112(%rcx),%xmm1 pxor %xmm0,%xmm4 xorq %r11,%r11 - cmpq $112,%rdx + cmpq $0x70,%rdx pxor %xmm0,%xmm5 pxor %xmm0,%xmm6 pxor %xmm0,%xmm7 @@ -2868,21 +2868,21 @@ aesni_cbc_encrypt: movups %xmm8,96(%rsi) leaq 112(%rsi),%rsi - subq $128,%rdx + subq $0x80,%rdx ja .Lcbc_dec_loop8 movaps %xmm9,%xmm2 leaq -112(%rcx),%rcx - addq $112,%rdx + addq $0x70,%rdx jle .Lcbc_dec_clear_tail_collected movups %xmm9,(%rsi) leaq 16(%rsi),%rsi - cmpq $80,%rdx + cmpq $0x50,%rdx jbe .Lcbc_dec_tail movaps %xmm11,%xmm2 .Lcbc_dec_six_or_seven: - cmpq $96,%rdx + cmpq $0x60,%rdx ja .Lcbc_dec_seven movaps %xmm7,%xmm8 @@ -2975,33 +2975,33 @@ aesni_cbc_encrypt: movl %r10d,%eax movdqu %xmm6,64(%rsi) leaq 80(%rsi),%rsi - subq $96,%rdx + subq $0x60,%rdx ja .Lcbc_dec_loop6 movdqa %xmm7,%xmm2 - addq $80,%rdx + addq $0x50,%rdx jle .Lcbc_dec_clear_tail_collected movups %xmm7,(%rsi) leaq 16(%rsi),%rsi .Lcbc_dec_tail: movups (%rdi),%xmm2 - subq $16,%rdx + subq $0x10,%rdx jbe .Lcbc_dec_one movups 16(%rdi),%xmm3 movaps %xmm2,%xmm11 - subq $16,%rdx + subq $0x10,%rdx jbe .Lcbc_dec_two movups 32(%rdi),%xmm4 movaps %xmm3,%xmm12 - subq $16,%rdx + subq $0x10,%rdx jbe .Lcbc_dec_three movups 48(%rdi),%xmm5 movaps %xmm4,%xmm13 - subq $16,%rdx + subq $0x10,%rdx jbe .Lcbc_dec_four movups 64(%rdi),%xmm6 @@ -3026,7 +3026,7 @@ aesni_cbc_encrypt: movdqa %xmm6,%xmm2 pxor %xmm6,%xmm6 pxor %xmm7,%xmm7 - subq $16,%rdx + subq $0x10,%rdx jmp .Lcbc_dec_tail_collected .align 16 @@ -3345,7 +3345,7 @@ __aesni_set_encrypt_key: pslldq $4,%xmm0 pxor %xmm3,%xmm0 - pshufd $255,%xmm0,%xmm3 + pshufd $0xff,%xmm0,%xmm3 pxor %xmm1,%xmm3 pslldq $4,%xmm1 pxor %xmm1,%xmm3 @@ -3432,7 +3432,7 @@ 
__aesni_set_encrypt_key: decl %r10d jz .Ldone_key256 - pshufd $255,%xmm0,%xmm2 + pshufd $0xff,%xmm0,%xmm2 pxor %xmm3,%xmm3 .byte 102,15,56,221,211 diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/aes/bsaes-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/aes/bsaes-x86_64.S index 8cfa4df5ba..c5491ce4d0 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/aes/bsaes-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/aes/bsaes-x86_64.S @@ -327,45 +327,45 @@ _bsaes_encrypt8_bitslice: pxor %xmm2,%xmm5 decl %r10d jl .Lenc_done - pshufd $147,%xmm15,%xmm7 - pshufd $147,%xmm0,%xmm8 + pshufd $0x93,%xmm15,%xmm7 + pshufd $0x93,%xmm0,%xmm8 pxor %xmm7,%xmm15 - pshufd $147,%xmm3,%xmm9 + pshufd $0x93,%xmm3,%xmm9 pxor %xmm8,%xmm0 - pshufd $147,%xmm5,%xmm10 + pshufd $0x93,%xmm5,%xmm10 pxor %xmm9,%xmm3 - pshufd $147,%xmm2,%xmm11 + pshufd $0x93,%xmm2,%xmm11 pxor %xmm10,%xmm5 - pshufd $147,%xmm6,%xmm12 + pshufd $0x93,%xmm6,%xmm12 pxor %xmm11,%xmm2 - pshufd $147,%xmm1,%xmm13 + pshufd $0x93,%xmm1,%xmm13 pxor %xmm12,%xmm6 - pshufd $147,%xmm4,%xmm14 + pshufd $0x93,%xmm4,%xmm14 pxor %xmm13,%xmm1 pxor %xmm14,%xmm4 pxor %xmm15,%xmm8 pxor %xmm4,%xmm7 pxor %xmm4,%xmm8 - pshufd $78,%xmm15,%xmm15 + pshufd $0x4E,%xmm15,%xmm15 pxor %xmm0,%xmm9 - pshufd $78,%xmm0,%xmm0 + pshufd $0x4E,%xmm0,%xmm0 pxor %xmm2,%xmm12 pxor %xmm7,%xmm15 pxor %xmm6,%xmm13 pxor %xmm8,%xmm0 pxor %xmm5,%xmm11 - pshufd $78,%xmm2,%xmm7 + pshufd $0x4E,%xmm2,%xmm7 pxor %xmm1,%xmm14 - pshufd $78,%xmm6,%xmm8 + pshufd $0x4E,%xmm6,%xmm8 pxor %xmm3,%xmm10 - pshufd $78,%xmm5,%xmm2 + pshufd $0x4E,%xmm5,%xmm2 pxor %xmm4,%xmm10 - pshufd $78,%xmm4,%xmm6 + pshufd $0x4E,%xmm4,%xmm6 pxor %xmm4,%xmm11 - pshufd $78,%xmm1,%xmm5 + pshufd $0x4E,%xmm1,%xmm5 pxor %xmm11,%xmm7 - pshufd $78,%xmm3,%xmm1 + pshufd $0x4E,%xmm3,%xmm1 pxor %xmm12,%xmm8 pxor %xmm10,%xmm2 pxor %xmm14,%xmm6 @@ -799,24 +799,24 @@ _bsaes_decrypt8: decl %r10d jl .Ldec_done - pshufd $78,%xmm15,%xmm7 - pshufd $78,%xmm2,%xmm13 + pshufd $0x4E,%xmm15,%xmm7 + pshufd $0x4E,%xmm2,%xmm13 pxor %xmm15,%xmm7 - pshufd $78,%xmm4,%xmm14 + pshufd $0x4E,%xmm4,%xmm14 pxor %xmm2,%xmm13 - pshufd $78,%xmm0,%xmm8 + pshufd $0x4E,%xmm0,%xmm8 pxor %xmm4,%xmm14 - pshufd $78,%xmm5,%xmm9 + pshufd $0x4E,%xmm5,%xmm9 pxor %xmm0,%xmm8 - pshufd $78,%xmm3,%xmm10 + pshufd $0x4E,%xmm3,%xmm10 pxor %xmm5,%xmm9 pxor %xmm13,%xmm15 pxor %xmm13,%xmm0 - pshufd $78,%xmm1,%xmm11 + pshufd $0x4E,%xmm1,%xmm11 pxor %xmm3,%xmm10 pxor %xmm7,%xmm5 pxor %xmm8,%xmm3 - pshufd $78,%xmm6,%xmm12 + pshufd $0x4E,%xmm6,%xmm12 pxor %xmm1,%xmm11 pxor %xmm14,%xmm0 pxor %xmm9,%xmm1 @@ -830,45 +830,45 @@ _bsaes_decrypt8: pxor %xmm14,%xmm1 pxor %xmm14,%xmm6 pxor %xmm12,%xmm4 - pshufd $147,%xmm15,%xmm7 - pshufd $147,%xmm0,%xmm8 + pshufd $0x93,%xmm15,%xmm7 + pshufd $0x93,%xmm0,%xmm8 pxor %xmm7,%xmm15 - pshufd $147,%xmm5,%xmm9 + pshufd $0x93,%xmm5,%xmm9 pxor %xmm8,%xmm0 - pshufd $147,%xmm3,%xmm10 + pshufd $0x93,%xmm3,%xmm10 pxor %xmm9,%xmm5 - pshufd $147,%xmm1,%xmm11 + pshufd $0x93,%xmm1,%xmm11 pxor %xmm10,%xmm3 - pshufd $147,%xmm6,%xmm12 + pshufd $0x93,%xmm6,%xmm12 pxor %xmm11,%xmm1 - pshufd $147,%xmm2,%xmm13 + pshufd $0x93,%xmm2,%xmm13 pxor %xmm12,%xmm6 - pshufd $147,%xmm4,%xmm14 + pshufd $0x93,%xmm4,%xmm14 pxor %xmm13,%xmm2 pxor %xmm14,%xmm4 pxor %xmm15,%xmm8 pxor %xmm4,%xmm7 pxor %xmm4,%xmm8 - pshufd $78,%xmm15,%xmm15 + pshufd $0x4E,%xmm15,%xmm15 pxor %xmm0,%xmm9 - pshufd $78,%xmm0,%xmm0 + pshufd $0x4E,%xmm0,%xmm0 pxor %xmm1,%xmm12 pxor %xmm7,%xmm15 pxor %xmm6,%xmm13 pxor %xmm8,%xmm0 pxor %xmm3,%xmm11 - pshufd 
$78,%xmm1,%xmm7 + pshufd $0x4E,%xmm1,%xmm7 pxor %xmm2,%xmm14 - pshufd $78,%xmm6,%xmm8 + pshufd $0x4E,%xmm6,%xmm8 pxor %xmm5,%xmm10 - pshufd $78,%xmm3,%xmm1 + pshufd $0x4E,%xmm3,%xmm1 pxor %xmm4,%xmm10 - pshufd $78,%xmm4,%xmm6 + pshufd $0x4E,%xmm4,%xmm6 pxor %xmm4,%xmm11 - pshufd $78,%xmm2,%xmm3 + pshufd $0x4E,%xmm2,%xmm3 pxor %xmm11,%xmm7 - pshufd $78,%xmm5,%xmm2 + pshufd $0x4E,%xmm5,%xmm2 pxor %xmm12,%xmm8 pxor %xmm1,%xmm10 pxor %xmm14,%xmm6 @@ -1559,20 +1559,20 @@ bsaes_xts_encrypt: movdqa %xmm7,(%rax) andq $-16,%r14 - subq $128,%rsp + subq $0x80,%rsp movdqa 32(%rbp),%xmm6 pxor %xmm14,%xmm14 movdqa .Lxts_magic(%rip),%xmm12 pcmpgtd %xmm6,%xmm14 - subq $128,%r14 + subq $0x80,%r14 jc .Lxts_enc_short jmp .Lxts_enc_loop .align 16 .Lxts_enc_loop: - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm15 movdqa %xmm6,0(%rsp) @@ -1580,7 +1580,7 @@ bsaes_xts_encrypt: pand %xmm12,%xmm13 pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm0 movdqa %xmm6,16(%rsp) @@ -1589,7 +1589,7 @@ bsaes_xts_encrypt: pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 movdqu 0(%r12),%xmm7 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm1 movdqa %xmm6,32(%rsp) @@ -1599,7 +1599,7 @@ bsaes_xts_encrypt: pxor %xmm13,%xmm6 movdqu 16(%r12),%xmm8 pxor %xmm7,%xmm15 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm2 movdqa %xmm6,48(%rsp) @@ -1609,7 +1609,7 @@ bsaes_xts_encrypt: pxor %xmm13,%xmm6 movdqu 32(%r12),%xmm9 pxor %xmm8,%xmm0 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm3 movdqa %xmm6,64(%rsp) @@ -1619,7 +1619,7 @@ bsaes_xts_encrypt: pxor %xmm13,%xmm6 movdqu 48(%r12),%xmm10 pxor %xmm9,%xmm1 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm4 movdqa %xmm6,80(%rsp) @@ -1629,7 +1629,7 @@ bsaes_xts_encrypt: pxor %xmm13,%xmm6 movdqu 64(%r12),%xmm11 pxor %xmm10,%xmm2 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm5 movdqa %xmm6,96(%rsp) @@ -1673,20 +1673,20 @@ bsaes_xts_encrypt: pxor %xmm14,%xmm14 movdqa .Lxts_magic(%rip),%xmm12 pcmpgtd %xmm6,%xmm14 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 paddq %xmm6,%xmm6 pand %xmm12,%xmm13 pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 - subq $128,%r14 + subq $0x80,%r14 jnc .Lxts_enc_loop .Lxts_enc_short: - addq $128,%r14 + addq $0x80,%r14 jz .Lxts_enc_done - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm15 movdqa %xmm6,0(%rsp) @@ -1694,7 +1694,7 @@ bsaes_xts_encrypt: pand %xmm12,%xmm13 pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm0 movdqa %xmm6,16(%rsp) @@ -1705,7 +1705,7 @@ bsaes_xts_encrypt: movdqu 0(%r12),%xmm7 cmpq $16,%r14 je .Lxts_enc_1 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm1 movdqa %xmm6,32(%rsp) @@ -1717,7 +1717,7 @@ bsaes_xts_encrypt: cmpq $32,%r14 je .Lxts_enc_2 pxor %xmm7,%xmm15 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm2 movdqa %xmm6,48(%rsp) @@ -1729,7 +1729,7 @@ bsaes_xts_encrypt: cmpq $48,%r14 je .Lxts_enc_3 pxor %xmm8,%xmm0 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm3 movdqa %xmm6,64(%rsp) @@ -1741,7 +1741,7 @@ bsaes_xts_encrypt: cmpq $64,%r14 je 
.Lxts_enc_4 pxor %xmm9,%xmm1 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm4 movdqa %xmm6,80(%rsp) @@ -1753,7 +1753,7 @@ bsaes_xts_encrypt: cmpq $80,%r14 je .Lxts_enc_5 pxor %xmm10,%xmm2 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm5 movdqa %xmm6,96(%rsp) @@ -2019,20 +2019,20 @@ bsaes_xts_decrypt: shlq $4,%rax subq %rax,%r14 - subq $128,%rsp + subq $0x80,%rsp movdqa 32(%rbp),%xmm6 pxor %xmm14,%xmm14 movdqa .Lxts_magic(%rip),%xmm12 pcmpgtd %xmm6,%xmm14 - subq $128,%r14 + subq $0x80,%r14 jc .Lxts_dec_short jmp .Lxts_dec_loop .align 16 .Lxts_dec_loop: - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm15 movdqa %xmm6,0(%rsp) @@ -2040,7 +2040,7 @@ bsaes_xts_decrypt: pand %xmm12,%xmm13 pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm0 movdqa %xmm6,16(%rsp) @@ -2049,7 +2049,7 @@ bsaes_xts_decrypt: pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 movdqu 0(%r12),%xmm7 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm1 movdqa %xmm6,32(%rsp) @@ -2059,7 +2059,7 @@ bsaes_xts_decrypt: pxor %xmm13,%xmm6 movdqu 16(%r12),%xmm8 pxor %xmm7,%xmm15 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm2 movdqa %xmm6,48(%rsp) @@ -2069,7 +2069,7 @@ bsaes_xts_decrypt: pxor %xmm13,%xmm6 movdqu 32(%r12),%xmm9 pxor %xmm8,%xmm0 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm3 movdqa %xmm6,64(%rsp) @@ -2079,7 +2079,7 @@ bsaes_xts_decrypt: pxor %xmm13,%xmm6 movdqu 48(%r12),%xmm10 pxor %xmm9,%xmm1 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm4 movdqa %xmm6,80(%rsp) @@ -2089,7 +2089,7 @@ bsaes_xts_decrypt: pxor %xmm13,%xmm6 movdqu 64(%r12),%xmm11 pxor %xmm10,%xmm2 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm5 movdqa %xmm6,96(%rsp) @@ -2133,20 +2133,20 @@ bsaes_xts_decrypt: pxor %xmm14,%xmm14 movdqa .Lxts_magic(%rip),%xmm12 pcmpgtd %xmm6,%xmm14 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 paddq %xmm6,%xmm6 pand %xmm12,%xmm13 pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 - subq $128,%r14 + subq $0x80,%r14 jnc .Lxts_dec_loop .Lxts_dec_short: - addq $128,%r14 + addq $0x80,%r14 jz .Lxts_dec_done - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm15 movdqa %xmm6,0(%rsp) @@ -2154,7 +2154,7 @@ bsaes_xts_decrypt: pand %xmm12,%xmm13 pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm0 movdqa %xmm6,16(%rsp) @@ -2165,7 +2165,7 @@ bsaes_xts_decrypt: movdqu 0(%r12),%xmm7 cmpq $16,%r14 je .Lxts_dec_1 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm1 movdqa %xmm6,32(%rsp) @@ -2177,7 +2177,7 @@ bsaes_xts_decrypt: cmpq $32,%r14 je .Lxts_dec_2 pxor %xmm7,%xmm15 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm2 movdqa %xmm6,48(%rsp) @@ -2189,7 +2189,7 @@ bsaes_xts_decrypt: cmpq $48,%r14 je .Lxts_dec_3 pxor %xmm8,%xmm0 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm3 movdqa %xmm6,64(%rsp) @@ -2201,7 +2201,7 @@ bsaes_xts_decrypt: cmpq $64,%r14 je .Lxts_dec_4 pxor %xmm9,%xmm1 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 
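# bsaes XTS tweak schedule: pcmpgtd captures the tweak's sign words, pshufd $0x13 positions
# the carry mask, and pand .Lxts_magic / paddq / pxor double the tweak modulo
# x^128 + x^7 + x^2 + x + 1. The hunks in this region only rewrite decimal immediates as hex.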
movdqa %xmm6,%xmm4 movdqa %xmm6,80(%rsp) @@ -2213,7 +2213,7 @@ bsaes_xts_decrypt: cmpq $80,%r14 je .Lxts_dec_5 pxor %xmm10,%xmm2 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm5 movdqa %xmm6,96(%rsp) @@ -2390,7 +2390,7 @@ bsaes_xts_decrypt: pxor %xmm14,%xmm14 movdqa .Lxts_magic(%rip),%xmm12 pcmpgtd %xmm6,%xmm14 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 movdqa %xmm6,%xmm5 paddq %xmm6,%xmm6 pand %xmm12,%xmm13 diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/aes/vpaes-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/aes/vpaes-x86_64.S index 1d124246af..4dfafa97ea 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/aes/vpaes-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/aes/vpaes-x86_64.S @@ -61,7 +61,7 @@ _vpaes_encrypt_core: addq $16,%r11 pxor %xmm0,%xmm3 .byte 102,15,56,0,193 - andq $48,%r11 + andq $0x30,%r11 subq $1,%rax pxor %xmm3,%xmm0 @@ -121,10 +121,10 @@ _vpaes_decrypt_core: pand %xmm9,%xmm0 .byte 102,15,56,0,208 movdqa .Lk_dipt+16(%rip),%xmm0 - xorq $48,%r11 + xorq $0x30,%r11 leaq .Lk_dsbd(%rip),%r10 .byte 102,15,56,0,193 - andq $48,%r11 + andq $0x30,%r11 pxor %xmm5,%xmm2 movdqa .Lk_mc_forward+48(%rip),%xmm5 pxor %xmm2,%xmm0 @@ -243,7 +243,7 @@ _vpaes_schedule_core: movdqa (%r8,%r10,1),%xmm1 .byte 102,15,56,0,217 movdqu %xmm3,(%rdx) - xorq $48,%r8 + xorq $0x30,%r8 .Lschedule_go: cmpl $192,%esi @@ -333,7 +333,7 @@ _vpaes_schedule_core: call _vpaes_schedule_mangle - pshufd $255,%xmm0,%xmm0 + pshufd $0xFF,%xmm0,%xmm0 movdqa %xmm7,%xmm5 movdqa %xmm6,%xmm7 call _vpaes_schedule_low_round @@ -400,8 +400,8 @@ _vpaes_schedule_core: .type _vpaes_schedule_192_smear,@function .align 16 _vpaes_schedule_192_smear: - pshufd $128,%xmm6,%xmm1 - pshufd $254,%xmm7,%xmm0 + pshufd $0x80,%xmm6,%xmm1 + pshufd $0xFE,%xmm7,%xmm0 pxor %xmm1,%xmm6 pxor %xmm1,%xmm1 pxor %xmm0,%xmm6 @@ -438,7 +438,7 @@ _vpaes_schedule_round: pxor %xmm1,%xmm7 - pshufd $255,%xmm0,%xmm0 + pshufd $0xFF,%xmm0,%xmm0 .byte 102,15,58,15,192,1 @@ -597,7 +597,7 @@ _vpaes_schedule_mangle: movdqa (%r8,%r10,1),%xmm1 .byte 102,15,56,0,217 addq $-16,%r8 - andq $48,%r8 + andq $0x30,%r8 movdqu %xmm3,(%rdx) .byte 0xf3,0xc3 .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle @@ -616,7 +616,7 @@ vpaes_set_encrypt_key: movl %eax,240(%rdx) movl $0,%ecx - movl $48,%r8d + movl $0x30,%r8d call _vpaes_schedule_core xorl %eax,%eax .byte 0xf3,0xc3 diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/bn/rsaz-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/bn/rsaz-x86_64.S index dd3d3106d1..21531d1c65 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/bn/rsaz-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/bn/rsaz-x86_64.S @@ -466,48 +466,94 @@ rsaz_512_mul_gather4: pushq %r14 pushq %r15 - movl %r9d,%r9d - subq $128+24,%rsp + subq $152,%rsp .Lmul_gather4_body: - movl 64(%rdx,%r9,4),%eax -.byte 102,72,15,110,199 - movl (%rdx,%r9,4),%ebx -.byte 102,72,15,110,201 - movq %r8,128(%rsp) + movd %r9d,%xmm8 + movdqa .Linc+16(%rip),%xmm1 + movdqa .Linc(%rip),%xmm0 + + pshufd $0,%xmm8,%xmm8 + movdqa %xmm1,%xmm7 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm8,%xmm0 + movdqa %xmm7,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm8,%xmm1 + movdqa %xmm7,%xmm4 + paddd %xmm2,%xmm3 + pcmpeqd %xmm8,%xmm2 + movdqa %xmm7,%xmm5 + paddd %xmm3,%xmm4 + pcmpeqd %xmm8,%xmm3 + movdqa %xmm7,%xmm6 + paddd %xmm4,%xmm5 + pcmpeqd %xmm8,%xmm4 + paddd %xmm5,%xmm6 + pcmpeqd %xmm8,%xmm5 + paddd %xmm6,%xmm7 + pcmpeqd 
%xmm8,%xmm6 + pcmpeqd %xmm8,%xmm7 + + movdqa 0(%rdx),%xmm8 + movdqa 16(%rdx),%xmm9 + movdqa 32(%rdx),%xmm10 + movdqa 48(%rdx),%xmm11 + pand %xmm0,%xmm8 + movdqa 64(%rdx),%xmm12 + pand %xmm1,%xmm9 + movdqa 80(%rdx),%xmm13 + pand %xmm2,%xmm10 + movdqa 96(%rdx),%xmm14 + pand %xmm3,%xmm11 + movdqa 112(%rdx),%xmm15 + leaq 128(%rdx),%rbp + pand %xmm4,%xmm12 + pand %xmm5,%xmm13 + pand %xmm6,%xmm14 + pand %xmm7,%xmm15 + por %xmm10,%xmm8 + por %xmm11,%xmm9 + por %xmm12,%xmm8 + por %xmm13,%xmm9 + por %xmm14,%xmm8 + por %xmm15,%xmm9 + + por %xmm9,%xmm8 + pshufd $0x4e,%xmm8,%xmm9 + por %xmm9,%xmm8 +.byte 102,76,15,126,195 + + movq %r8,128(%rsp) + movq %rdi,128+8(%rsp) + movq %rcx,128+16(%rsp) - shlq $32,%rax - orq %rax,%rbx movq (%rsi),%rax movq 8(%rsi),%rcx - leaq 128(%rdx,%r9,4),%rbp mulq %rbx movq %rax,(%rsp) movq %rcx,%rax movq %rdx,%r8 mulq %rbx - movd (%rbp),%xmm4 addq %rax,%r8 movq 16(%rsi),%rax movq %rdx,%r9 adcq $0,%r9 mulq %rbx - movd 64(%rbp),%xmm5 addq %rax,%r9 movq 24(%rsi),%rax movq %rdx,%r10 adcq $0,%r10 mulq %rbx - pslldq $4,%xmm5 addq %rax,%r10 movq 32(%rsi),%rax movq %rdx,%r11 adcq $0,%r11 mulq %rbx - por %xmm5,%xmm4 addq %rax,%r11 movq 40(%rsi),%rax movq %rdx,%r12 @@ -520,14 +566,12 @@ rsaz_512_mul_gather4: adcq $0,%r13 mulq %rbx - leaq 128(%rbp),%rbp addq %rax,%r13 movq 56(%rsi),%rax movq %rdx,%r14 adcq $0,%r14 mulq %rbx -.byte 102,72,15,126,227 addq %rax,%r14 movq (%rsi),%rax movq %rdx,%r15 @@ -539,6 +583,35 @@ rsaz_512_mul_gather4: .align 32 .Loop_mul_gather: + movdqa 0(%rbp),%xmm8 + movdqa 16(%rbp),%xmm9 + movdqa 32(%rbp),%xmm10 + movdqa 48(%rbp),%xmm11 + pand %xmm0,%xmm8 + movdqa 64(%rbp),%xmm12 + pand %xmm1,%xmm9 + movdqa 80(%rbp),%xmm13 + pand %xmm2,%xmm10 + movdqa 96(%rbp),%xmm14 + pand %xmm3,%xmm11 + movdqa 112(%rbp),%xmm15 + leaq 128(%rbp),%rbp + pand %xmm4,%xmm12 + pand %xmm5,%xmm13 + pand %xmm6,%xmm14 + pand %xmm7,%xmm15 + por %xmm10,%xmm8 + por %xmm11,%xmm9 + por %xmm12,%xmm8 + por %xmm13,%xmm9 + por %xmm14,%xmm8 + por %xmm15,%xmm9 + + por %xmm9,%xmm8 + pshufd $0x4e,%xmm8,%xmm9 + por %xmm9,%xmm8 +.byte 102,76,15,126,195 + mulq %rbx addq %rax,%r8 movq 8(%rsi),%rax @@ -547,7 +620,6 @@ rsaz_512_mul_gather4: adcq $0,%r8 mulq %rbx - movd (%rbp),%xmm4 addq %rax,%r9 movq 16(%rsi),%rax adcq $0,%rdx @@ -556,7 +628,6 @@ rsaz_512_mul_gather4: adcq $0,%r9 mulq %rbx - movd 64(%rbp),%xmm5 addq %rax,%r10 movq 24(%rsi),%rax adcq $0,%rdx @@ -565,7 +636,6 @@ rsaz_512_mul_gather4: adcq $0,%r10 mulq %rbx - pslldq $4,%xmm5 addq %rax,%r11 movq 32(%rsi),%rax adcq $0,%rdx @@ -574,7 +644,6 @@ rsaz_512_mul_gather4: adcq $0,%r11 mulq %rbx - por %xmm5,%xmm4 addq %rax,%r12 movq 40(%rsi),%rax adcq $0,%rdx @@ -599,7 +668,6 @@ rsaz_512_mul_gather4: adcq $0,%r14 mulq %rbx -.byte 102,72,15,126,227 addq %rax,%r15 movq (%rsi),%rax adcq $0,%rdx @@ -607,7 +675,6 @@ rsaz_512_mul_gather4: movq %rdx,%r15 adcq $0,%r15 - leaq 128(%rbp),%rbp leaq 8(%rdi),%rdi decl %ecx @@ -622,8 +689,8 @@ rsaz_512_mul_gather4: movq %r14,48(%rdi) movq %r15,56(%rdi) -.byte 102,72,15,126,199 -.byte 102,72,15,126,205 + movq 128+8(%rsp),%rdi + movq 128+16(%rsp),%rbp movq (%rsp),%r8 movq 8(%rsp),%r9 @@ -673,7 +740,7 @@ rsaz_512_mul_scatter4: movl %r9d,%r9d subq $128+24,%rsp .Lmul_scatter4_body: - leaq (%r8,%r9,4),%r8 + leaq (%r8,%r9,8),%r8 .byte 102,72,15,110,199 .byte 102,72,15,110,202 .byte 102,73,15,110,208 @@ -709,30 +776,14 @@ rsaz_512_mul_scatter4: call __rsaz_512_subtract - movl %r8d,0(%rsi) - shrq $32,%r8 - movl %r9d,128(%rsi) - shrq $32,%r9 - movl %r10d,256(%rsi) - shrq $32,%r10 - movl %r11d,384(%rsi) - shrq $32,%r11 - movl 
%r12d,512(%rsi) - shrq $32,%r12 - movl %r13d,640(%rsi) - shrq $32,%r13 - movl %r14d,768(%rsi) - shrq $32,%r14 - movl %r15d,896(%rsi) - shrq $32,%r15 - movl %r8d,64(%rsi) - movl %r9d,192(%rsi) - movl %r10d,320(%rsi) - movl %r11d,448(%rsi) - movl %r12d,576(%rsi) - movl %r13d,704(%rsi) - movl %r14d,832(%rsi) - movl %r15d,960(%rsi) + movq %r8,0(%rsi) + movq %r9,128(%rsi) + movq %r10,256(%rsi) + movq %r11,384(%rsi) + movq %r12,512(%rsi) + movq %r13,640(%rsi) + movq %r14,768(%rsi) + movq %r15,896(%rsi) leaq 128+24+48(%rsp),%rax movq -48(%rax),%r15 @@ -1087,16 +1138,14 @@ __rsaz_512_mul: .type rsaz_512_scatter4,@function .align 16 rsaz_512_scatter4: - leaq (%rdi,%rdx,4),%rdi + leaq (%rdi,%rdx,8),%rdi movl $8,%r9d jmp .Loop_scatter .align 16 .Loop_scatter: movq (%rsi),%rax leaq 8(%rsi),%rsi - movl %eax,(%rdi) - shrq $32,%rax - movl %eax,64(%rdi) + movq %rax,(%rdi) leaq 128(%rdi),%rdi decl %r9d jnz .Loop_scatter @@ -1108,20 +1157,73 @@ rsaz_512_scatter4: .type rsaz_512_gather4,@function .align 16 rsaz_512_gather4: - leaq (%rsi,%rdx,4),%rsi + movd %edx,%xmm8 + movdqa .Linc+16(%rip),%xmm1 + movdqa .Linc(%rip),%xmm0 + + pshufd $0,%xmm8,%xmm8 + movdqa %xmm1,%xmm7 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm8,%xmm0 + movdqa %xmm7,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm8,%xmm1 + movdqa %xmm7,%xmm4 + paddd %xmm2,%xmm3 + pcmpeqd %xmm8,%xmm2 + movdqa %xmm7,%xmm5 + paddd %xmm3,%xmm4 + pcmpeqd %xmm8,%xmm3 + movdqa %xmm7,%xmm6 + paddd %xmm4,%xmm5 + pcmpeqd %xmm8,%xmm4 + paddd %xmm5,%xmm6 + pcmpeqd %xmm8,%xmm5 + paddd %xmm6,%xmm7 + pcmpeqd %xmm8,%xmm6 + pcmpeqd %xmm8,%xmm7 movl $8,%r9d jmp .Loop_gather .align 16 .Loop_gather: - movl (%rsi),%eax - movl 64(%rsi),%r8d + movdqa 0(%rsi),%xmm8 + movdqa 16(%rsi),%xmm9 + movdqa 32(%rsi),%xmm10 + movdqa 48(%rsi),%xmm11 + pand %xmm0,%xmm8 + movdqa 64(%rsi),%xmm12 + pand %xmm1,%xmm9 + movdqa 80(%rsi),%xmm13 + pand %xmm2,%xmm10 + movdqa 96(%rsi),%xmm14 + pand %xmm3,%xmm11 + movdqa 112(%rsi),%xmm15 leaq 128(%rsi),%rsi - shlq $32,%r8 - orq %r8,%rax - movq %rax,(%rdi) + pand %xmm4,%xmm12 + pand %xmm5,%xmm13 + pand %xmm6,%xmm14 + pand %xmm7,%xmm15 + por %xmm10,%xmm8 + por %xmm11,%xmm9 + por %xmm12,%xmm8 + por %xmm13,%xmm9 + por %xmm14,%xmm8 + por %xmm15,%xmm9 + + por %xmm9,%xmm8 + pshufd $0x4e,%xmm8,%xmm9 + por %xmm9,%xmm8 + movq %xmm8,(%rdi) leaq 8(%rdi),%rdi decl %r9d jnz .Loop_gather .byte 0xf3,0xc3 +.LSEH_end_rsaz_512_gather4: .size rsaz_512_gather4,.-rsaz_512_gather4 + +.align 64 +.Linc: +.long 0,0, 1,1 +.long 2,2, 2,2 #endif diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/bn/x86_64-mont.S b/packager/third_party/boringssl/linux-x86_64/crypto/bn/x86_64-mont.S index 4d401c6743..83926ad789 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/bn/x86_64-mont.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/bn/x86_64-mont.S @@ -636,20 +636,20 @@ bn_sqr8x_mont: - leaq -64(%rsp,%r9,4),%r11 + leaq -64(%rsp,%r9,2),%r11 movq (%r8),%r8 subq %rsi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb .Lsqr8x_sp_alt subq %r11,%rsp - leaq -64(%rsp,%r9,4),%rsp + leaq -64(%rsp,%r9,2),%rsp jmp .Lsqr8x_sp_done .align 32 .Lsqr8x_sp_alt: - leaq 4096-64(,%r9,4),%r10 - leaq -64(%rsp,%r9,4),%rsp + leaq 4096-64(,%r9,2),%r10 + leaq -64(%rsp,%r9,2),%rsp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 @@ -659,58 +659,80 @@ bn_sqr8x_mont: movq %r9,%r10 negq %r9 - leaq 64(%rsp,%r9,2),%r11 movq %r8,32(%rsp) movq %rax,40(%rsp) .Lsqr8x_body: - movq %r9,%rbp -.byte 102,73,15,110,211 - shrq $3+2,%rbp - movl OPENSSL_ia32cap_P+8(%rip),%eax - jmp .Lsqr8x_copy_n - -.align 32 
-.Lsqr8x_copy_n: - movq 0(%rcx),%xmm0 - movq 8(%rcx),%xmm1 - movq 16(%rcx),%xmm3 - movq 24(%rcx),%xmm4 - leaq 32(%rcx),%rcx - movdqa %xmm0,0(%r11) - movdqa %xmm1,16(%r11) - movdqa %xmm3,32(%r11) - movdqa %xmm4,48(%r11) - leaq 64(%r11),%r11 - decq %rbp - jnz .Lsqr8x_copy_n - +.byte 102,72,15,110,209 pxor %xmm0,%xmm0 .byte 102,72,15,110,207 .byte 102,73,15,110,218 call bn_sqr8x_internal - pxor %xmm0,%xmm0 - leaq 48(%rsp),%rax - leaq 64(%rsp,%r9,2),%rdx - shrq $3+2,%r9 - movq 40(%rsp),%rsi - jmp .Lsqr8x_zero + + + + leaq (%rdi,%r9,1),%rbx + movq %r9,%rcx + movq %r9,%rdx +.byte 102,72,15,126,207 + sarq $3+2,%rcx + jmp .Lsqr8x_sub .align 32 -.Lsqr8x_zero: - movdqa %xmm0,0(%rax) - movdqa %xmm0,16(%rax) - movdqa %xmm0,32(%rax) - movdqa %xmm0,48(%rax) - leaq 64(%rax),%rax - movdqa %xmm0,0(%rdx) - movdqa %xmm0,16(%rdx) - movdqa %xmm0,32(%rdx) - movdqa %xmm0,48(%rdx) - leaq 64(%rdx),%rdx - decq %r9 - jnz .Lsqr8x_zero +.Lsqr8x_sub: + movq 0(%rbx),%r12 + movq 8(%rbx),%r13 + movq 16(%rbx),%r14 + movq 24(%rbx),%r15 + leaq 32(%rbx),%rbx + sbbq 0(%rbp),%r12 + sbbq 8(%rbp),%r13 + sbbq 16(%rbp),%r14 + sbbq 24(%rbp),%r15 + leaq 32(%rbp),%rbp + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r14,16(%rdi) + movq %r15,24(%rdi) + leaq 32(%rdi),%rdi + incq %rcx + jnz .Lsqr8x_sub + + sbbq $0,%rax + leaq (%rbx,%r9,1),%rbx + leaq (%rdi,%r9,1),%rdi + +.byte 102,72,15,110,200 + pxor %xmm0,%xmm0 + pshufd $0,%xmm1,%xmm1 + movq 40(%rsp),%rsi + jmp .Lsqr8x_cond_copy + +.align 32 +.Lsqr8x_cond_copy: + movdqa 0(%rbx),%xmm2 + movdqa 16(%rbx),%xmm3 + leaq 32(%rbx),%rbx + movdqu 0(%rdi),%xmm4 + movdqu 16(%rdi),%xmm5 + leaq 32(%rdi),%rdi + movdqa %xmm0,-32(%rbx) + movdqa %xmm0,-16(%rbx) + movdqa %xmm0,-32(%rbx,%rdx,1) + movdqa %xmm0,-16(%rbx,%rdx,1) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-32(%rdi) + movdqu %xmm5,-16(%rdi) + addq $32,%r9 + jnz .Lsqr8x_cond_copy movq $1,%rax movq -48(%rsi),%r15 diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/bn/x86_64-mont5.S b/packager/third_party/boringssl/linux-x86_64/crypto/bn/x86_64-mont5.S index 02edc69b36..554df1ffac 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/bn/x86_64-mont5.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/bn/x86_64-mont5.S @@ -17,46 +17,151 @@ bn_mul_mont_gather5: .Lmul_enter: movl %r9d,%r9d movq %rsp,%rax - movl 8(%rsp),%r10d + movd 8(%rsp),%xmm5 + leaq .Linc(%rip),%r10 pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 + leaq 2(%r9),%r11 negq %r11 - leaq (%rsp,%r11,8),%rsp + leaq -264(%rsp,%r11,8),%rsp andq $-1024,%rsp movq %rax,8(%rsp,%r9,8) .Lmul_body: - movq %rdx,%r12 - movq %r10,%r11 - shrq $3,%r10 - andq $7,%r11 - notq %r10 - leaq .Lmagic_masks(%rip),%rax - andq $3,%r10 - leaq 96(%r12,%r11,8),%r12 - movq 0(%rax,%r10,8),%xmm4 - movq 8(%rax,%r10,8),%xmm5 - movq 16(%rax,%r10,8),%xmm6 - movq 24(%rax,%r10,8),%xmm7 + leaq 128(%rdx),%r12 + movdqa 0(%r10),%xmm0 + movdqa 16(%r10),%xmm1 + leaq 24-112(%rsp,%r9,8),%r10 + andq $-16,%r10 - movq -96(%r12),%xmm0 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - movq 96(%r12),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 +.byte 0x67 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + 
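# bn_mul_mont_gather5 mask setup: the requested power index (splatted into %xmm5 by
# pshufd $0) is compared with running counters built from .Linc, leaving one pcmpeqd
# equality mask per window-table entry at 112-352(%r10).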
movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,256(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 +.byte 0x67 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,320(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + pand 64(%r12),%xmm0 + + pand 80(%r12),%xmm1 + pand 96(%r12),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%r12),%xmm3 por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -128(%r12),%xmm4 + movdqa -112(%r12),%xmm5 + movdqa -96(%r12),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%r12),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%r12),%xmm4 + movdqa -48(%r12),%xmm5 + movdqa -32(%r12),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%r12),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa 0(%r12),%xmm4 + movdqa 16(%r12),%xmm5 + movdqa 32(%r12),%xmm2 + pand 240(%r10),%xmm4 + movdqa 48(%r12),%xmm3 + pand 256(%r10),%xmm5 + por %xmm4,%xmm0 + pand 272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + por %xmm1,%xmm0 + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 leaq 256(%r12),%r12 - por %xmm3,%xmm0 - .byte 102,72,15,126,195 movq (%r8),%r8 @@ -65,29 +170,14 @@ bn_mul_mont_gather5: xorq %r14,%r14 xorq %r15,%r15 - movq -96(%r12),%xmm0 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - movq %r8,%rbp mulq %rbx movq %rax,%r10 movq (%rcx),%rax - movq 96(%r12),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - imulq %r10,%rbp movq %rdx,%r11 - por %xmm2,%xmm0 - leaq 256(%r12),%r12 - por %xmm3,%xmm0 - mulq %rbp addq %rax,%r10 movq 8(%rsi),%rax @@ -120,14 +210,12 @@ bn_mul_mont_gather5: cmpq %r9,%r15 jne .L1st -.byte 102,72,15,126,195 addq %rax,%r13 - movq (%rsi),%rax adcq $0,%rdx addq %r11,%r13 adcq $0,%rdx - movq %r13,-16(%rsp,%r15,8) + movq %r13,-16(%rsp,%r9,8) movq %rdx,%r13 movq %r10,%r11 @@ -141,33 +229,78 @@ bn_mul_mont_gather5: jmp .Louter .align 16 .Louter: + leaq 24+128(%rsp,%r9,8),%rdx + andq $-16,%rdx + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r12),%xmm0 + movdqa -112(%r12),%xmm1 + movdqa -96(%r12),%xmm2 + movdqa -80(%r12),%xmm3 + pand -128(%rdx),%xmm0 + pand -112(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -80(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r12),%xmm0 + movdqa -48(%r12),%xmm1 + movdqa -32(%r12),%xmm2 + movdqa -16(%r12),%xmm3 + pand -64(%rdx),%xmm0 + pand -48(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -32(%rdx),%xmm2 + por %xmm1,%xmm5 
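# Constant-time gather in .Louter: every 16-byte line of the window table is pand-ed with
# its mask and por-ed into an accumulator pair, which is folded with pshufd $0x4e/por into
# %xmm0; the load pattern no longer depends on the secret window value (CacheBleed-class
# side channel), replacing the old four-load %xmm4-%xmm7 magic-mask scheme.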
+ pand -16(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r12),%xmm0 + movdqa 16(%r12),%xmm1 + movdqa 32(%r12),%xmm2 + movdqa 48(%r12),%xmm3 + pand 0(%rdx),%xmm0 + pand 16(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 48(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r12),%xmm0 + movdqa 80(%r12),%xmm1 + movdqa 96(%r12),%xmm2 + movdqa 112(%r12),%xmm3 + pand 64(%rdx),%xmm0 + pand 80(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 112(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%r12),%r12 + + movq (%rsi),%rax +.byte 102,72,15,126,195 + xorq %r15,%r15 movq %r8,%rbp movq (%rsp),%r10 - movq -96(%r12),%xmm0 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - mulq %rbx addq %rax,%r10 movq (%rcx),%rax adcq $0,%rdx - movq 96(%r12),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - imulq %r10,%rbp movq %rdx,%r11 - por %xmm2,%xmm0 - leaq 256(%r12),%r12 - por %xmm3,%xmm0 - mulq %rbp addq %rax,%r10 movq 8(%rsi),%rax @@ -203,15 +336,12 @@ bn_mul_mont_gather5: cmpq %r9,%r15 jne .Linner -.byte 102,72,15,126,195 - addq %rax,%r13 - movq (%rsi),%rax adcq $0,%rdx addq %r10,%r13 - movq (%rsp,%r15,8),%r10 + movq (%rsp,%r9,8),%r10 adcq $0,%rdx - movq %r13,-16(%rsp,%r15,8) + movq %r13,-16(%rsp,%r9,8) movq %rdx,%r13 xorq %rdx,%rdx @@ -257,6 +387,7 @@ bn_mul_mont_gather5: movq 8(%rsp,%r9,8),%rsi movq $1,%rax + movq -48(%rsi),%r15 movq -40(%rsi),%r14 movq -32(%rsi),%r13 @@ -279,10 +410,10 @@ bn_mul4x_mont_gather5: pushq %r13 pushq %r14 pushq %r15 + .byte 0x67 - movl %r9d,%r10d shll $3,%r9d - shll $3+2,%r10d + leaq (%r9,%r9,2),%r10 negq %r9 @@ -292,19 +423,21 @@ bn_mul4x_mont_gather5: - leaq -64(%rsp,%r9,2),%r11 - subq %rsi,%r11 + + + leaq -320(%rsp,%r9,2),%r11 + subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb .Lmul4xsp_alt subq %r11,%rsp - leaq -64(%rsp,%r9,2),%rsp + leaq -320(%rsp,%r9,2),%rsp jmp .Lmul4xsp_done .align 32 .Lmul4xsp_alt: - leaq 4096-64(,%r9,2),%r10 - leaq -64(%rsp,%r9,2),%rsp + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rsp,%r9,2),%rsp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 @@ -320,6 +453,7 @@ bn_mul4x_mont_gather5: movq 40(%rsp),%rsi movq $1,%rax + movq -48(%rsi),%r15 movq -40(%rsi),%r14 movq -32(%rsi),%r13 @@ -335,47 +469,141 @@ bn_mul4x_mont_gather5: .align 32 mul4x_internal: shlq $5,%r9 - movl 8(%rax),%r10d - leaq 256(%rdx,%r9,1),%r13 + movd 8(%rax),%xmm5 + leaq .Linc(%rip),%rax + leaq 128(%rdx,%r9,1),%r13 shrq $5,%r9 - movq %r10,%r11 - shrq $3,%r10 - andq $7,%r11 - notq %r10 - leaq .Lmagic_masks(%rip),%rax - andq $3,%r10 - leaq 96(%rdx,%r11,8),%r12 - movq 0(%rax,%r10,8),%xmm4 - movq 8(%rax,%r10,8),%xmm5 - addq $7,%r11 - movq 16(%rax,%r10,8),%xmm6 - movq 24(%rax,%r10,8),%xmm7 - andq $7,%r11 + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 88-112(%rsp,%r9,1),%r10 + leaq 128(%rdx),%r12 - movq -96(%r12),%xmm0 - leaq 256(%r12),%r14 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - movq 96(%r12),%xmm3 - pand %xmm6,%xmm2 + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 +.byte 0x67,0x67 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 .byte 0x67 - por %xmm1,%xmm0 - movq -96(%r14),%xmm1 -.byte 0x67 - pand %xmm7,%xmm3 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd 
%xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,256(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 .byte 0x67 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,320(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + pand 64(%r12),%xmm0 + + pand 80(%r12),%xmm1 + pand 96(%r12),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%r12),%xmm3 por %xmm2,%xmm0 - movq -32(%r14),%xmm2 -.byte 0x67 - pand %xmm4,%xmm1 -.byte 0x67 - por %xmm3,%xmm0 - movq 32(%r14),%xmm3 - + por %xmm3,%xmm1 + movdqa -128(%r12),%xmm4 + movdqa -112(%r12),%xmm5 + movdqa -96(%r12),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%r12),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%r12),%xmm4 + movdqa -48(%r12),%xmm5 + movdqa -32(%r12),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%r12),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa 0(%r12),%xmm4 + movdqa 16(%r12),%xmm5 + movdqa 32(%r12),%xmm2 + pand 240(%r10),%xmm4 + movdqa 48(%r12),%xmm3 + pand 256(%r10),%xmm5 + por %xmm4,%xmm0 + pand 272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + por %xmm1,%xmm0 + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + leaq 256(%r12),%r12 .byte 102,72,15,126,195 - movq 96(%r14),%xmm0 + movq %r13,16+8(%rsp) movq %rdi,56+8(%rsp) @@ -389,26 +617,10 @@ mul4x_internal: movq %rax,%r10 movq (%rcx),%rax - pand %xmm5,%xmm2 - pand %xmm6,%xmm3 - por %xmm2,%xmm1 - imulq %r10,%rbp - - - - - - - - leaq 64+8(%rsp,%r11,8),%r14 + leaq 64+8(%rsp),%r14 movq %rdx,%r11 - pand %xmm7,%xmm0 - por %xmm3,%xmm1 - leaq 512(%r12),%r12 - por %xmm1,%xmm0 - mulq %rbp addq %rax,%r10 movq 8(%rsi,%r9,1),%rax @@ -417,7 +629,7 @@ mul4x_internal: mulq %rbx addq %rax,%r11 - movq 16(%rcx),%rax + movq 8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 @@ -427,7 +639,7 @@ mul4x_internal: adcq $0,%rdx addq %r11,%rdi leaq 32(%r9),%r15 - leaq 64(%rcx),%rcx + leaq 32(%rcx),%rcx adcq $0,%rdx movq %rdi,(%r14) movq %rdx,%r13 @@ -437,7 +649,7 @@ mul4x_internal: .L1st4x: mulq %rbx addq %rax,%r10 - movq -32(%rcx),%rax + movq -16(%rcx),%rax leaq 32(%r14),%r14 adcq $0,%rdx movq %rdx,%r11 @@ -453,7 +665,7 @@ mul4x_internal: mulq %rbx addq %rax,%r11 - movq -16(%rcx),%rax + movq -8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 @@ -483,7 +695,7 @@ mul4x_internal: mulq %rbx addq %rax,%r11 - movq 16(%rcx),%rax + movq 8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 @@ -492,7 +704,7 @@ mul4x_internal: movq 16(%rsi,%r15,1),%rax adcq $0,%rdx addq %r11,%rdi - leaq 64(%rcx),%rcx + leaq 32(%rcx),%rcx adcq $0,%rdx movq %rdi,(%r14) movq %rdx,%r13 @@ -502,7 +714,7 @@ 
mul4x_internal: mulq %rbx addq %rax,%r10 - movq -32(%rcx),%rax + movq -16(%rcx),%rax leaq 32(%r14),%r14 adcq $0,%rdx movq %rdx,%r11 @@ -518,7 +730,7 @@ mul4x_internal: mulq %rbx addq %rax,%r11 - movq -16(%rcx),%rax + movq -8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 @@ -531,8 +743,7 @@ mul4x_internal: movq %rdi,-16(%r14) movq %rdx,%r13 -.byte 102,72,15,126,195 - leaq (%rcx,%r9,2),%rcx + leaq (%rcx,%r9,1),%rcx xorq %rdi,%rdi addq %r10,%r13 @@ -543,6 +754,63 @@ mul4x_internal: .align 32 .Louter4x: + leaq 16+128(%r14),%rdx + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r12),%xmm0 + movdqa -112(%r12),%xmm1 + movdqa -96(%r12),%xmm2 + movdqa -80(%r12),%xmm3 + pand -128(%rdx),%xmm0 + pand -112(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -80(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r12),%xmm0 + movdqa -48(%r12),%xmm1 + movdqa -32(%r12),%xmm2 + movdqa -16(%r12),%xmm3 + pand -64(%rdx),%xmm0 + pand -48(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -16(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r12),%xmm0 + movdqa 16(%r12),%xmm1 + movdqa 32(%r12),%xmm2 + movdqa 48(%r12),%xmm3 + pand 0(%rdx),%xmm0 + pand 16(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 48(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r12),%xmm0 + movdqa 80(%r12),%xmm1 + movdqa 96(%r12),%xmm2 + movdqa 112(%r12),%xmm3 + pand 64(%rdx),%xmm0 + pand 80(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 112(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%r12),%r12 +.byte 102,72,15,126,195 + movq (%r14,%r9,1),%r10 movq %r8,%rbp mulq %rbx @@ -550,25 +818,11 @@ mul4x_internal: movq (%rcx),%rax adcq $0,%rdx - movq -96(%r12),%xmm0 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - movq 96(%r12),%xmm3 - imulq %r10,%rbp -.byte 0x67 movq %rdx,%r11 movq %rdi,(%r14) - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - por %xmm2,%xmm0 leaq (%r14,%r9,1),%r14 - leaq 256(%r12),%r12 - por %xmm3,%xmm0 mulq %rbp addq %rax,%r10 @@ -578,7 +832,7 @@ mul4x_internal: mulq %rbx addq %rax,%r11 - movq 16(%rcx),%rax + movq 8(%rcx),%rax adcq $0,%rdx addq 8(%r14),%r11 adcq $0,%rdx @@ -590,7 +844,7 @@ mul4x_internal: adcq $0,%rdx addq %r11,%rdi leaq 32(%r9),%r15 - leaq 64(%rcx),%rcx + leaq 32(%rcx),%rcx adcq $0,%rdx movq %rdx,%r13 jmp .Linner4x @@ -599,7 +853,7 @@ mul4x_internal: .Linner4x: mulq %rbx addq %rax,%r10 - movq -32(%rcx),%rax + movq -16(%rcx),%rax adcq $0,%rdx addq 16(%r14),%r10 leaq 32(%r14),%r14 @@ -617,7 +871,7 @@ mul4x_internal: mulq %rbx addq %rax,%r11 - movq -16(%rcx),%rax + movq -8(%rcx),%rax adcq $0,%rdx addq -8(%r14),%r11 adcq $0,%rdx @@ -651,7 +905,7 @@ mul4x_internal: mulq %rbx addq %rax,%r11 - movq 16(%rcx),%rax + movq 8(%rcx),%rax adcq $0,%rdx addq 8(%r14),%r11 adcq $0,%rdx @@ -662,7 +916,7 @@ mul4x_internal: movq 16(%rsi,%r15,1),%rax adcq $0,%rdx addq %r11,%rdi - leaq 64(%rcx),%rcx + leaq 32(%rcx),%rcx adcq $0,%rdx movq %r13,-8(%r14) movq %rdx,%r13 @@ -672,7 +926,7 @@ mul4x_internal: mulq %rbx addq %rax,%r10 - movq -32(%rcx),%rax + movq -16(%rcx),%rax adcq $0,%rdx addq 16(%r14),%r10 leaq 32(%r14),%r14 @@ -691,7 +945,7 @@ mul4x_internal: mulq %rbx addq %rax,%r11 movq %rbp,%rax - movq -16(%rcx),%rbp + movq -8(%rcx),%rbp adcq $0,%rdx addq -8(%r14),%r11 adcq $0,%rdx @@ -706,9 +960,8 @@ mul4x_internal: movq %r13,-24(%r14) movq %rdx,%r13 -.byte 102,72,15,126,195 
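# The deleted .byte 102,72,15,126,195 decodes to movq %xmm0,%rbx: the next multiplier limb
# is no longer staged in %xmm0 mid-loop but produced by the masked gather at the top of
# .Louter4x. Modulus offsets are halved throughout (16(%rcx) -> 8(%rcx), 64 -> 32), which
# appears to reflect reading n[] in place at its natural 8-byte stride instead of walking
# the old zero-padded, 16-byte-stride copy.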
movq %rdi,-16(%r14) - leaq (%rcx,%r9,2),%rcx + leaq (%rcx,%r9,1),%rcx xorq %rdi,%rdi addq %r10,%r13 @@ -719,16 +972,23 @@ mul4x_internal: cmpq 16+8(%rsp),%r12 jb .Louter4x + xorq %rax,%rax subq %r13,%rbp adcq %r15,%r15 orq %r15,%rdi - xorq $1,%rdi + subq %rdi,%rax leaq (%r14,%r9,1),%rbx - leaq (%rcx,%rdi,8),%rbp + movq (%rcx),%r12 + leaq (%rcx),%rbp movq %r9,%rcx sarq $3+2,%rcx movq 56+8(%rsp),%rdi - jmp .Lsqr4x_sub + decq %r12 + xorq %r10,%r10 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp .Lsqr4x_sub_entry .size mul4x_internal,.-mul4x_internal .globl bn_power5 .hidden bn_power5 @@ -742,9 +1002,9 @@ bn_power5: pushq %r13 pushq %r14 pushq %r15 - movl %r9d,%r10d + shll $3,%r9d - shll $3+2,%r10d + leal (%r9,%r9,2),%r10d negq %r9 movq (%r8),%r8 @@ -754,19 +1014,20 @@ bn_power5: - leaq -64(%rsp,%r9,2),%r11 - subq %rsi,%r11 + + leaq -320(%rsp,%r9,2),%r11 + subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb .Lpwr_sp_alt subq %r11,%rsp - leaq -64(%rsp,%r9,2),%rsp + leaq -320(%rsp,%r9,2),%rsp jmp .Lpwr_sp_done .align 32 .Lpwr_sp_alt: - leaq 4096-64(,%r9,2),%r10 - leaq -64(%rsp,%r9,2),%rsp + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rsp,%r9,2),%rsp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 @@ -794,10 +1055,15 @@ bn_power5: .byte 102,72,15,110,226 call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal .byte 102,72,15,126,209 .byte 102,72,15,126,226 @@ -1342,9 +1608,9 @@ __bn_sqr8x_internal: movq %rbx,-16(%rdi) movq %r8,-8(%rdi) .byte 102,72,15,126,213 -sqr8x_reduction: +__bn_sqr8x_reduction: xorq %rax,%rax - leaq (%rbp,%r9,2),%rcx + leaq (%r9,%rbp,1),%rcx leaq 48+8(%rsp,%r9,2),%rdx movq %rcx,0+8(%rsp) leaq 48+8(%rsp,%r9,1),%rdi @@ -1377,14 +1643,14 @@ sqr8x_reduction: .align 32 .L8x_reduce: mulq %rbx - movq 16(%rbp),%rax + movq 8(%rbp),%rax negq %r8 movq %rdx,%r8 adcq $0,%r8 mulq %rbx addq %rax,%r9 - movq 32(%rbp),%rax + movq 16(%rbp),%rax adcq $0,%rdx addq %r9,%r8 movq %rbx,48-8+8(%rsp,%rcx,8) @@ -1393,7 +1659,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r10 - movq 48(%rbp),%rax + movq 24(%rbp),%rax adcq $0,%rdx addq %r10,%r9 movq 32+8(%rsp),%rsi @@ -1402,7 +1668,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r11 - movq 64(%rbp),%rax + movq 32(%rbp),%rax adcq $0,%rdx imulq %r8,%rsi addq %r11,%r10 @@ -1411,7 +1677,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r12 - movq 80(%rbp),%rax + movq 40(%rbp),%rax adcq $0,%rdx addq %r12,%r11 movq %rdx,%r12 @@ -1419,7 +1685,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r13 - movq 96(%rbp),%rax + movq 48(%rbp),%rax adcq $0,%rdx addq %r13,%r12 movq %rdx,%r13 @@ -1427,7 +1693,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r14 - movq 112(%rbp),%rax + movq 56(%rbp),%rax adcq $0,%rdx addq %r14,%r13 movq %rdx,%r14 @@ -1445,7 +1711,7 @@ sqr8x_reduction: decl %ecx jnz .L8x_reduce - leaq 128(%rbp),%rbp + leaq 64(%rbp),%rbp xorq %rax,%rax movq 8+8(%rsp),%rdx cmpq 0+8(%rsp),%rbp @@ -1471,14 +1737,14 @@ sqr8x_reduction: .L8x_tail: mulq %rbx addq %rax,%r8 - movq 16(%rbp),%rax + movq 8(%rbp),%rax movq %r8,(%rdi) movq %rdx,%r8 adcq $0,%r8 mulq %rbx addq %rax,%r9 - movq 32(%rbp),%rax + movq 16(%rbp),%rax adcq $0,%rdx addq %r9,%r8 leaq 8(%rdi),%rdi @@ -1487,7 +1753,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r10 - movq 48(%rbp),%rax + movq 24(%rbp),%rax adcq $0,%rdx addq %r10,%r9 movq %rdx,%r10 @@ -1495,7 +1761,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r11 - movq 
64(%rbp),%rax + movq 32(%rbp),%rax adcq $0,%rdx addq %r11,%r10 movq %rdx,%r11 @@ -1503,7 +1769,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r12 - movq 80(%rbp),%rax + movq 40(%rbp),%rax adcq $0,%rdx addq %r12,%r11 movq %rdx,%r12 @@ -1511,7 +1777,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r13 - movq 96(%rbp),%rax + movq 48(%rbp),%rax adcq $0,%rdx addq %r13,%r12 movq %rdx,%r13 @@ -1519,7 +1785,7 @@ sqr8x_reduction: mulq %rbx addq %rax,%r14 - movq 112(%rbp),%rax + movq 56(%rbp),%rax adcq $0,%rdx addq %r14,%r13 movq %rdx,%r14 @@ -1537,7 +1803,7 @@ sqr8x_reduction: decl %ecx jnz .L8x_tail - leaq 128(%rbp),%rbp + leaq 64(%rbp),%rbp movq 8+8(%rsp),%rdx cmpq 0+8(%rsp),%rbp jae .L8x_tail_done @@ -1561,6 +1827,15 @@ sqr8x_reduction: .align 32 .L8x_tail_done: addq (%rdx),%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + + xorq %rax,%rax negq %rsi @@ -1574,7 +1849,7 @@ sqr8x_reduction: adcq 48(%rdi),%r14 adcq 56(%rdi),%r15 adcq $0,%rax - movq -16(%rbp),%rcx + movq -8(%rbp),%rcx xorq %rsi,%rsi .byte 102,72,15,126,213 @@ -1592,44 +1867,62 @@ sqr8x_reduction: cmpq %rdx,%rdi jb .L8x_reduction_loop - - subq %r15,%rcx - leaq (%rdi,%r9,1),%rbx - adcq %rsi,%rsi - movq %r9,%rcx - orq %rsi,%rax -.byte 102,72,15,126,207 - xorq $1,%rax -.byte 102,72,15,126,206 - leaq (%rbp,%rax,8),%rbp - sarq $3+2,%rcx - jmp .Lsqr4x_sub - + .byte 0xf3,0xc3 +.size bn_sqr8x_internal,.-bn_sqr8x_internal +.type __bn_post4x_internal,@function .align 32 +__bn_post4x_internal: + movq 0(%rbp),%r12 + leaq (%rdi,%r9,1),%rbx + movq %r9,%rcx +.byte 102,72,15,126,207 + negq %rax +.byte 102,72,15,126,206 + sarq $3+2,%rcx + decq %r12 + xorq %r10,%r10 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp .Lsqr4x_sub_entry + +.align 16 .Lsqr4x_sub: -.byte 0x66 - movq 0(%rbx),%r12 - movq 8(%rbx),%r13 - sbbq 0(%rbp),%r12 - movq 16(%rbx),%r14 - sbbq 16(%rbp),%r13 - movq 24(%rbx),%r15 - leaq 32(%rbx),%rbx - sbbq 32(%rbp),%r14 + movq 0(%rbp),%r12 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 +.Lsqr4x_sub_entry: + leaq 32(%rbp),%rbp + notq %r12 + notq %r13 + notq %r14 + notq %r15 + andq %rax,%r12 + andq %rax,%r13 + andq %rax,%r14 + andq %rax,%r15 + + negq %r10 + adcq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + adcq 16(%rbx),%r14 + adcq 24(%rbx),%r15 movq %r12,0(%rdi) - sbbq 48(%rbp),%r15 - leaq 64(%rbp),%rbp + leaq 32(%rbx),%rbx movq %r13,8(%rdi) + sbbq %r10,%r10 movq %r14,16(%rdi) movq %r15,24(%rdi) leaq 32(%rdi),%rdi incq %rcx jnz .Lsqr4x_sub + movq %r9,%r10 negq %r9 .byte 0xf3,0xc3 -.size bn_sqr8x_internal,.-bn_sqr8x_internal +.size __bn_post4x_internal,.-__bn_post4x_internal .globl bn_from_montgomery .hidden bn_from_montgomery .type bn_from_montgomery,@function @@ -1652,10 +1945,9 @@ bn_from_mont8x: pushq %r13 pushq %r14 pushq %r15 -.byte 0x67 - movl %r9d,%r10d + shll $3,%r9d - shll $3+2,%r10d + leaq (%r9,%r9,2),%r10 negq %r9 movq (%r8),%r8 @@ -1665,19 +1957,20 @@ bn_from_mont8x: - leaq -64(%rsp,%r9,2),%r11 - subq %rsi,%r11 + + leaq -320(%rsp,%r9,2),%r11 + subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb .Lfrom_sp_alt subq %r11,%rsp - leaq -64(%rsp,%r9,2),%rsp + leaq -320(%rsp,%r9,2),%rsp jmp .Lfrom_sp_done .align 32 .Lfrom_sp_alt: - leaq 4096-64(,%r9,2),%r10 - leaq -64(%rsp,%r9,2),%rsp + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rsp,%r9,2),%rsp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 @@ -1728,7 +2021,8 @@ bn_from_mont8x: .byte 0x67 movq %rcx,%rbp .byte 102,73,15,110,218 - call sqr8x_reduction + call __bn_sqr8x_reduction + call __bn_post4x_internal pxor %xmm0,%xmm0 
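# bn_from_mont8x (like bn_power5 above) now calls __bn_sqr8x_reduction followed by
# __bn_post4x_internal: the final subtract of the modulus is branchless, using notq/andq
# with a borrow mask and an adcq chain in .Lsqr4x_sub rather than a data-dependent path.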
leaq 48(%rsp),%rax @@ -1778,46 +2072,170 @@ bn_scatter5: .globl bn_gather5 .hidden bn_gather5 .type bn_gather5,@function -.align 16 +.align 32 bn_gather5: - movl %ecx,%r11d - shrl $3,%ecx - andq $7,%r11 - notl %ecx - leaq .Lmagic_masks(%rip),%rax - andl $3,%ecx - leaq 128(%rdx,%r11,8),%rdx - movq 0(%rax,%rcx,8),%xmm4 - movq 8(%rax,%rcx,8),%xmm5 - movq 16(%rax,%rcx,8),%xmm6 - movq 24(%rax,%rcx,8),%xmm7 - jmp .Lgather -.align 16 -.Lgather: - movq -128(%rdx),%xmm0 - movq -64(%rdx),%xmm1 - pand %xmm4,%xmm0 - movq 0(%rdx),%xmm2 - pand %xmm5,%xmm1 - movq 64(%rdx),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 -.byte 0x67,0x67 - por %xmm2,%xmm0 - leaq 256(%rdx),%rdx - por %xmm3,%xmm0 +.LSEH_begin_bn_gather5: +.byte 0x4c,0x8d,0x14,0x24 +.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 + leaq .Linc(%rip),%rax + andq $-16,%rsp + + movd %ecx,%xmm5 + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 128(%rdx),%r11 + leaq 128(%rsp),%rax + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,-128(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,-112(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,-96(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,-80(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,-64(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,-48(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,-32(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,-16(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,0(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,16(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,32(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,48(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,64(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,80(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,96(%rax) + movdqa %xmm4,%xmm2 + movdqa %xmm3,112(%rax) + jmp .Lgather + +.align 32 +.Lgather: + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r11),%xmm0 + movdqa -112(%r11),%xmm1 + movdqa -96(%r11),%xmm2 + pand -128(%rax),%xmm0 + movdqa -80(%r11),%xmm3 + pand -112(%rax),%xmm1 + por %xmm0,%xmm4 + pand -96(%rax),%xmm2 + por %xmm1,%xmm5 + pand -80(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r11),%xmm0 + movdqa -48(%r11),%xmm1 + movdqa -32(%r11),%xmm2 + pand -64(%rax),%xmm0 + movdqa -16(%r11),%xmm3 + pand -48(%rax),%xmm1 + por %xmm0,%xmm4 + pand -32(%rax),%xmm2 + por %xmm1,%xmm5 + pand -16(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r11),%xmm0 + movdqa 16(%r11),%xmm1 + movdqa 32(%r11),%xmm2 + pand 0(%rax),%xmm0 + movdqa 48(%r11),%xmm3 + pand 16(%rax),%xmm1 + por %xmm0,%xmm4 + pand 32(%rax),%xmm2 + por %xmm1,%xmm5 + pand 48(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r11),%xmm0 + movdqa 80(%r11),%xmm1 + movdqa 96(%r11),%xmm2 + pand 64(%rax),%xmm0 + movdqa 112(%r11),%xmm3 + pand 80(%rax),%xmm1 + por %xmm0,%xmm4 + pand 96(%rax),%xmm2 + por %xmm1,%xmm5 + pand 112(%rax),%xmm3 + por %xmm2,%xmm4 + por 
%xmm3,%xmm5 + por %xmm5,%xmm4 + leaq 256(%r11),%r11 + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 movq %xmm0,(%rdi) leaq 8(%rdi),%rdi subl $1,%esi jnz .Lgather + + leaq (%r10),%rsp .byte 0xf3,0xc3 .LSEH_end_bn_gather5: .size bn_gather5,.-bn_gather5 .align 64 -.Lmagic_masks: -.long 0,0, 0,0, 0,0, -1,-1 -.long 0,0, 0,0, 0,0, 0,0 +.Linc: +.long 0,0, 1,1 +.long 2,2, 2,2 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 #endif diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/chacha/chacha-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/chacha/chacha-x86_64.S new file mode 100644 index 0000000000..e994940a3f --- /dev/null +++ b/packager/third_party/boringssl/linux-x86_64/crypto/chacha/chacha-x86_64.S @@ -0,0 +1,1585 @@ +#if defined(__x86_64__) +.text + +.extern OPENSSL_ia32cap_P +.hidden OPENSSL_ia32cap_P + +.align 64 +.Lzero: +.long 0,0,0,0 +.Lone: +.long 1,0,0,0 +.Linc: +.long 0,1,2,3 +.Lfour: +.long 4,4,4,4 +.Lincy: +.long 0,2,4,6,1,3,5,7 +.Leight: +.long 8,8,8,8,8,8,8,8 +.Lrot16: +.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd +.Lrot24: +.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe +.Lsigma: +.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0 +.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.globl ChaCha20_ctr32 +.hidden ChaCha20_ctr32 +.type ChaCha20_ctr32,@function +.align 64 +ChaCha20_ctr32: + cmpq $0,%rdx + je .Lno_data + movq OPENSSL_ia32cap_P+4(%rip),%r10 + testl $512,%r10d + jnz .LChaCha20_ssse3 + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $64+24,%rsp + + + movdqu (%rcx),%xmm1 + movdqu 16(%rcx),%xmm2 + movdqu (%r8),%xmm3 + movdqa .Lone(%rip),%xmm4 + + + movdqa %xmm1,16(%rsp) + movdqa %xmm2,32(%rsp) + movdqa %xmm3,48(%rsp) + movq %rdx,%rbp + jmp .Loop_outer + +.align 32 +.Loop_outer: + movl $0x61707865,%eax + movl $0x3320646e,%ebx + movl $0x79622d32,%ecx + movl $0x6b206574,%edx + movl 16(%rsp),%r8d + movl 20(%rsp),%r9d + movl 24(%rsp),%r10d + movl 28(%rsp),%r11d + movd %xmm3,%r12d + movl 52(%rsp),%r13d + movl 56(%rsp),%r14d + movl 60(%rsp),%r15d + + movq %rbp,64+0(%rsp) + movl $10,%ebp + movq %rsi,64+8(%rsp) +.byte 102,72,15,126,214 + movq %rdi,64+16(%rsp) + movq %rsi,%rdi + shrq $32,%rdi + jmp .Loop + +.align 32 +.Loop: + addl %r8d,%eax + xorl %eax,%r12d + roll $16,%r12d + addl %r9d,%ebx + xorl %ebx,%r13d + roll $16,%r13d + addl %r12d,%esi + xorl %esi,%r8d + roll $12,%r8d + addl %r13d,%edi + xorl %edi,%r9d + roll $12,%r9d + addl %r8d,%eax + xorl %eax,%r12d + roll $8,%r12d + addl %r9d,%ebx + xorl %ebx,%r13d + roll $8,%r13d + addl %r12d,%esi + xorl %esi,%r8d + roll $7,%r8d + addl %r13d,%edi + xorl %edi,%r9d + roll $7,%r9d + movl %esi,32(%rsp) + movl %edi,36(%rsp) + movl 40(%rsp),%esi + movl 44(%rsp),%edi + addl %r10d,%ecx + xorl %ecx,%r14d + roll $16,%r14d + addl %r11d,%edx + xorl %edx,%r15d + roll $16,%r15d + addl %r14d,%esi + xorl %esi,%r10d + roll $12,%r10d + addl %r15d,%edi + xorl %edi,%r11d + roll $12,%r11d + addl %r10d,%ecx + xorl %ecx,%r14d + roll $8,%r14d + addl %r11d,%edx + xorl %edx,%r15d + roll $8,%r15d + addl %r14d,%esi 
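+# ChaCha20 quarter-round: add, xor, then rotate left by 16/12/8/7. The scalar
+# path unrolls all four column rounds and four diagonal rounds per iteration,
+# spilling two state words to the stack because only 14 GPRs are free.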
+ xorl %esi,%r10d + roll $7,%r10d + addl %r15d,%edi + xorl %edi,%r11d + roll $7,%r11d + addl %r9d,%eax + xorl %eax,%r15d + roll $16,%r15d + addl %r10d,%ebx + xorl %ebx,%r12d + roll $16,%r12d + addl %r15d,%esi + xorl %esi,%r9d + roll $12,%r9d + addl %r12d,%edi + xorl %edi,%r10d + roll $12,%r10d + addl %r9d,%eax + xorl %eax,%r15d + roll $8,%r15d + addl %r10d,%ebx + xorl %ebx,%r12d + roll $8,%r12d + addl %r15d,%esi + xorl %esi,%r9d + roll $7,%r9d + addl %r12d,%edi + xorl %edi,%r10d + roll $7,%r10d + movl %esi,40(%rsp) + movl %edi,44(%rsp) + movl 32(%rsp),%esi + movl 36(%rsp),%edi + addl %r11d,%ecx + xorl %ecx,%r13d + roll $16,%r13d + addl %r8d,%edx + xorl %edx,%r14d + roll $16,%r14d + addl %r13d,%esi + xorl %esi,%r11d + roll $12,%r11d + addl %r14d,%edi + xorl %edi,%r8d + roll $12,%r8d + addl %r11d,%ecx + xorl %ecx,%r13d + roll $8,%r13d + addl %r8d,%edx + xorl %edx,%r14d + roll $8,%r14d + addl %r13d,%esi + xorl %esi,%r11d + roll $7,%r11d + addl %r14d,%edi + xorl %edi,%r8d + roll $7,%r8d + decl %ebp + jnz .Loop + movl %edi,36(%rsp) + movl %esi,32(%rsp) + movq 64(%rsp),%rbp + movdqa %xmm2,%xmm1 + movq 64+8(%rsp),%rsi + paddd %xmm4,%xmm3 + movq 64+16(%rsp),%rdi + + addl $0x61707865,%eax + addl $0x3320646e,%ebx + addl $0x79622d32,%ecx + addl $0x6b206574,%edx + addl 16(%rsp),%r8d + addl 20(%rsp),%r9d + addl 24(%rsp),%r10d + addl 28(%rsp),%r11d + addl 48(%rsp),%r12d + addl 52(%rsp),%r13d + addl 56(%rsp),%r14d + addl 60(%rsp),%r15d + paddd 32(%rsp),%xmm1 + + cmpq $64,%rbp + jb .Ltail + + xorl 0(%rsi),%eax + xorl 4(%rsi),%ebx + xorl 8(%rsi),%ecx + xorl 12(%rsi),%edx + xorl 16(%rsi),%r8d + xorl 20(%rsi),%r9d + xorl 24(%rsi),%r10d + xorl 28(%rsi),%r11d + movdqu 32(%rsi),%xmm0 + xorl 48(%rsi),%r12d + xorl 52(%rsi),%r13d + xorl 56(%rsi),%r14d + xorl 60(%rsi),%r15d + leaq 64(%rsi),%rsi + pxor %xmm1,%xmm0 + + movdqa %xmm2,32(%rsp) + movd %xmm3,48(%rsp) + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + movdqu %xmm0,32(%rdi) + movl %r12d,48(%rdi) + movl %r13d,52(%rdi) + movl %r14d,56(%rdi) + movl %r15d,60(%rdi) + leaq 64(%rdi),%rdi + + subq $64,%rbp + jnz .Loop_outer + + jmp .Ldone + +.align 16 +.Ltail: + movl %eax,0(%rsp) + movl %ebx,4(%rsp) + xorq %rbx,%rbx + movl %ecx,8(%rsp) + movl %edx,12(%rsp) + movl %r8d,16(%rsp) + movl %r9d,20(%rsp) + movl %r10d,24(%rsp) + movl %r11d,28(%rsp) + movdqa %xmm1,32(%rsp) + movl %r12d,48(%rsp) + movl %r13d,52(%rsp) + movl %r14d,56(%rsp) + movl %r15d,60(%rsp) + +.Loop_tail: + movzbl (%rsi,%rbx,1),%eax + movzbl (%rsp,%rbx,1),%edx + leaq 1(%rbx),%rbx + xorl %edx,%eax + movb %al,-1(%rdi,%rbx,1) + decq %rbp + jnz .Loop_tail + +.Ldone: + addq $64+24,%rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx +.Lno_data: + .byte 0xf3,0xc3 +.size ChaCha20_ctr32,.-ChaCha20_ctr32 +.type ChaCha20_ssse3,@function +.align 32 +ChaCha20_ssse3: +.LChaCha20_ssse3: + cmpq $128,%rdx + ja .LChaCha20_4x + +.Ldo_sse3_after_all: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $64+24,%rsp + movdqa .Lsigma(%rip),%xmm0 + movdqu (%rcx),%xmm1 + movdqu 16(%rcx),%xmm2 + movdqu (%r8),%xmm3 + movdqa .Lrot16(%rip),%xmm6 + movdqa .Lrot24(%rip),%xmm7 + + movdqa %xmm0,0(%rsp) + movdqa %xmm1,16(%rsp) + movdqa %xmm2,32(%rsp) + movdqa %xmm3,48(%rsp) + movl $10,%ebp + jmp .Loop_ssse3 + +.align 32 +.Loop_outer_ssse3: + movdqa .Lone(%rip),%xmm3 + movdqa 0(%rsp),%xmm0 + movdqa 16(%rsp),%xmm1 + movdqa 32(%rsp),%xmm2 + paddd 48(%rsp),%xmm3 + 
movl $10,%ebp + movdqa %xmm3,48(%rsp) + jmp .Loop_ssse3 + +.align 32 +.Loop_ssse3: + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,222 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $20,%xmm1 + pslld $12,%xmm4 + por %xmm4,%xmm1 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,223 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $25,%xmm1 + pslld $7,%xmm4 + por %xmm4,%xmm1 + pshufd $78,%xmm2,%xmm2 + pshufd $57,%xmm1,%xmm1 + pshufd $147,%xmm3,%xmm3 + nop + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,222 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $20,%xmm1 + pslld $12,%xmm4 + por %xmm4,%xmm1 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,223 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $25,%xmm1 + pslld $7,%xmm4 + por %xmm4,%xmm1 + pshufd $78,%xmm2,%xmm2 + pshufd $147,%xmm1,%xmm1 + pshufd $57,%xmm3,%xmm3 + decl %ebp + jnz .Loop_ssse3 + paddd 0(%rsp),%xmm0 + paddd 16(%rsp),%xmm1 + paddd 32(%rsp),%xmm2 + paddd 48(%rsp),%xmm3 + + cmpq $64,%rdx + jb .Ltail_ssse3 + + movdqu 0(%rsi),%xmm4 + movdqu 16(%rsi),%xmm5 + pxor %xmm4,%xmm0 + movdqu 32(%rsi),%xmm4 + pxor %xmm5,%xmm1 + movdqu 48(%rsi),%xmm5 + leaq 64(%rsi),%rsi + pxor %xmm4,%xmm2 + pxor %xmm5,%xmm3 + + movdqu %xmm0,0(%rdi) + movdqu %xmm1,16(%rdi) + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + leaq 64(%rdi),%rdi + + subq $64,%rdx + jnz .Loop_outer_ssse3 + + jmp .Ldone_ssse3 + +.align 16 +.Ltail_ssse3: + movdqa %xmm0,0(%rsp) + movdqa %xmm1,16(%rsp) + movdqa %xmm2,32(%rsp) + movdqa %xmm3,48(%rsp) + xorq %rbx,%rbx + +.Loop_tail_ssse3: + movzbl (%rsi,%rbx,1),%eax + movzbl (%rsp,%rbx,1),%ecx + leaq 1(%rbx),%rbx + xorl %ecx,%eax + movb %al,-1(%rdi,%rbx,1) + decq %rdx + jnz .Loop_tail_ssse3 + +.Ldone_ssse3: + addq $64+24,%rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + .byte 0xf3,0xc3 +.size ChaCha20_ssse3,.-ChaCha20_ssse3 +.type ChaCha20_4x,@function +.align 32 +ChaCha20_4x: +.LChaCha20_4x: + movq %r10,%r11 + shrq $32,%r10 + testq $32,%r10 + jnz .LChaCha20_8x + cmpq $192,%rdx + ja .Lproceed4x + + andq $71303168,%r11 + cmpq $4194304,%r11 + je .Ldo_sse3_after_all + +.Lproceed4x: + leaq -120(%rsp),%r11 + subq $0x148+0,%rsp + movdqa .Lsigma(%rip),%xmm11 + movdqu (%rcx),%xmm15 + movdqu 16(%rcx),%xmm7 + movdqu (%r8),%xmm3 + leaq 256(%rsp),%rcx + leaq .Lrot16(%rip),%r10 + leaq .Lrot24(%rip),%r11 + + pshufd $0x00,%xmm11,%xmm8 + pshufd $0x55,%xmm11,%xmm9 + movdqa %xmm8,64(%rsp) + pshufd $0xaa,%xmm11,%xmm10 + movdqa %xmm9,80(%rsp) + pshufd $0xff,%xmm11,%xmm11 + movdqa %xmm10,96(%rsp) + movdqa %xmm11,112(%rsp) + + pshufd $0x00,%xmm15,%xmm12 + pshufd $0x55,%xmm15,%xmm13 + movdqa %xmm12,128-256(%rcx) + pshufd $0xaa,%xmm15,%xmm14 + movdqa %xmm13,144-256(%rcx) + pshufd $0xff,%xmm15,%xmm15 + movdqa %xmm14,160-256(%rcx) + movdqa %xmm15,176-256(%rcx) + + pshufd $0x00,%xmm7,%xmm4 + pshufd $0x55,%xmm7,%xmm5 + movdqa %xmm4,192-256(%rcx) + pshufd $0xaa,%xmm7,%xmm6 + movdqa %xmm5,208-256(%rcx) + pshufd $0xff,%xmm7,%xmm7 + movdqa %xmm6,224-256(%rcx) + movdqa %xmm7,240-256(%rcx) + + pshufd $0x00,%xmm3,%xmm0 + pshufd $0x55,%xmm3,%xmm1 + paddd .Linc(%rip),%xmm0 + pshufd $0xaa,%xmm3,%xmm2 + movdqa %xmm1,272-256(%rcx) + pshufd $0xff,%xmm3,%xmm3 + movdqa %xmm2,288-256(%rcx) + movdqa %xmm3,304-256(%rcx) + + jmp .Loop_enter4x + +.align 32 +.Loop_outer4x: + movdqa 64(%rsp),%xmm8 + movdqa 80(%rsp),%xmm9 + movdqa 96(%rsp),%xmm10 + movdqa 112(%rsp),%xmm11 + movdqa 128-256(%rcx),%xmm12 + movdqa 144-256(%rcx),%xmm13 + movdqa 
160-256(%rcx),%xmm14 + movdqa 176-256(%rcx),%xmm15 + movdqa 192-256(%rcx),%xmm4 + movdqa 208-256(%rcx),%xmm5 + movdqa 224-256(%rcx),%xmm6 + movdqa 240-256(%rcx),%xmm7 + movdqa 256-256(%rcx),%xmm0 + movdqa 272-256(%rcx),%xmm1 + movdqa 288-256(%rcx),%xmm2 + movdqa 304-256(%rcx),%xmm3 + paddd .Lfour(%rip),%xmm0 + +.Loop_enter4x: + movdqa %xmm6,32(%rsp) + movdqa %xmm7,48(%rsp) + movdqa (%r10),%xmm7 + movl $10,%eax + movdqa %xmm0,256-256(%rcx) + jmp .Loop4x + +.align 32 +.Loop4x: + paddd %xmm12,%xmm8 + paddd %xmm13,%xmm9 + pxor %xmm8,%xmm0 + pxor %xmm9,%xmm1 +.byte 102,15,56,0,199 +.byte 102,15,56,0,207 + paddd %xmm0,%xmm4 + paddd %xmm1,%xmm5 + pxor %xmm4,%xmm12 + pxor %xmm5,%xmm13 + movdqa %xmm12,%xmm6 + pslld $12,%xmm12 + psrld $20,%xmm6 + movdqa %xmm13,%xmm7 + pslld $12,%xmm13 + por %xmm6,%xmm12 + psrld $20,%xmm7 + movdqa (%r11),%xmm6 + por %xmm7,%xmm13 + paddd %xmm12,%xmm8 + paddd %xmm13,%xmm9 + pxor %xmm8,%xmm0 + pxor %xmm9,%xmm1 +.byte 102,15,56,0,198 +.byte 102,15,56,0,206 + paddd %xmm0,%xmm4 + paddd %xmm1,%xmm5 + pxor %xmm4,%xmm12 + pxor %xmm5,%xmm13 + movdqa %xmm12,%xmm7 + pslld $7,%xmm12 + psrld $25,%xmm7 + movdqa %xmm13,%xmm6 + pslld $7,%xmm13 + por %xmm7,%xmm12 + psrld $25,%xmm6 + movdqa (%r10),%xmm7 + por %xmm6,%xmm13 + movdqa %xmm4,0(%rsp) + movdqa %xmm5,16(%rsp) + movdqa 32(%rsp),%xmm4 + movdqa 48(%rsp),%xmm5 + paddd %xmm14,%xmm10 + paddd %xmm15,%xmm11 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm3 +.byte 102,15,56,0,215 +.byte 102,15,56,0,223 + paddd %xmm2,%xmm4 + paddd %xmm3,%xmm5 + pxor %xmm4,%xmm14 + pxor %xmm5,%xmm15 + movdqa %xmm14,%xmm6 + pslld $12,%xmm14 + psrld $20,%xmm6 + movdqa %xmm15,%xmm7 + pslld $12,%xmm15 + por %xmm6,%xmm14 + psrld $20,%xmm7 + movdqa (%r11),%xmm6 + por %xmm7,%xmm15 + paddd %xmm14,%xmm10 + paddd %xmm15,%xmm11 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm3 +.byte 102,15,56,0,214 +.byte 102,15,56,0,222 + paddd %xmm2,%xmm4 + paddd %xmm3,%xmm5 + pxor %xmm4,%xmm14 + pxor %xmm5,%xmm15 + movdqa %xmm14,%xmm7 + pslld $7,%xmm14 + psrld $25,%xmm7 + movdqa %xmm15,%xmm6 + pslld $7,%xmm15 + por %xmm7,%xmm14 + psrld $25,%xmm6 + movdqa (%r10),%xmm7 + por %xmm6,%xmm15 + paddd %xmm13,%xmm8 + paddd %xmm14,%xmm9 + pxor %xmm8,%xmm3 + pxor %xmm9,%xmm0 +.byte 102,15,56,0,223 +.byte 102,15,56,0,199 + paddd %xmm3,%xmm4 + paddd %xmm0,%xmm5 + pxor %xmm4,%xmm13 + pxor %xmm5,%xmm14 + movdqa %xmm13,%xmm6 + pslld $12,%xmm13 + psrld $20,%xmm6 + movdqa %xmm14,%xmm7 + pslld $12,%xmm14 + por %xmm6,%xmm13 + psrld $20,%xmm7 + movdqa (%r11),%xmm6 + por %xmm7,%xmm14 + paddd %xmm13,%xmm8 + paddd %xmm14,%xmm9 + pxor %xmm8,%xmm3 + pxor %xmm9,%xmm0 +.byte 102,15,56,0,222 +.byte 102,15,56,0,198 + paddd %xmm3,%xmm4 + paddd %xmm0,%xmm5 + pxor %xmm4,%xmm13 + pxor %xmm5,%xmm14 + movdqa %xmm13,%xmm7 + pslld $7,%xmm13 + psrld $25,%xmm7 + movdqa %xmm14,%xmm6 + pslld $7,%xmm14 + por %xmm7,%xmm13 + psrld $25,%xmm6 + movdqa (%r10),%xmm7 + por %xmm6,%xmm14 + movdqa %xmm4,32(%rsp) + movdqa %xmm5,48(%rsp) + movdqa 0(%rsp),%xmm4 + movdqa 16(%rsp),%xmm5 + paddd %xmm15,%xmm10 + paddd %xmm12,%xmm11 + pxor %xmm10,%xmm1 + pxor %xmm11,%xmm2 +.byte 102,15,56,0,207 +.byte 102,15,56,0,215 + paddd %xmm1,%xmm4 + paddd %xmm2,%xmm5 + pxor %xmm4,%xmm15 + pxor %xmm5,%xmm12 + movdqa %xmm15,%xmm6 + pslld $12,%xmm15 + psrld $20,%xmm6 + movdqa %xmm12,%xmm7 + pslld $12,%xmm12 + por %xmm6,%xmm15 + psrld $20,%xmm7 + movdqa (%r11),%xmm6 + por %xmm7,%xmm12 + paddd %xmm15,%xmm10 + paddd %xmm12,%xmm11 + pxor %xmm10,%xmm1 + pxor %xmm11,%xmm2 +.byte 102,15,56,0,206 +.byte 102,15,56,0,214 + paddd %xmm1,%xmm4 + paddd %xmm2,%xmm5 + pxor %xmm4,%xmm15 
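+# SSE2 has no vector rotate, so the 12- and 7-bit rotates are a pslld/psrld
+# pair merged with por; the 16- and 8-bit rotates use pshufb (the .byte
+# 102,15,56,0,... sequences) against the .Lrot16/.Lrot24 shuffle tables.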
+ pxor %xmm5,%xmm12 + movdqa %xmm15,%xmm7 + pslld $7,%xmm15 + psrld $25,%xmm7 + movdqa %xmm12,%xmm6 + pslld $7,%xmm12 + por %xmm7,%xmm15 + psrld $25,%xmm6 + movdqa (%r10),%xmm7 + por %xmm6,%xmm12 + decl %eax + jnz .Loop4x + + paddd 64(%rsp),%xmm8 + paddd 80(%rsp),%xmm9 + paddd 96(%rsp),%xmm10 + paddd 112(%rsp),%xmm11 + + movdqa %xmm8,%xmm6 + punpckldq %xmm9,%xmm8 + movdqa %xmm10,%xmm7 + punpckldq %xmm11,%xmm10 + punpckhdq %xmm9,%xmm6 + punpckhdq %xmm11,%xmm7 + movdqa %xmm8,%xmm9 + punpcklqdq %xmm10,%xmm8 + movdqa %xmm6,%xmm11 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm10,%xmm9 + punpckhqdq %xmm7,%xmm11 + paddd 128-256(%rcx),%xmm12 + paddd 144-256(%rcx),%xmm13 + paddd 160-256(%rcx),%xmm14 + paddd 176-256(%rcx),%xmm15 + + movdqa %xmm8,0(%rsp) + movdqa %xmm9,16(%rsp) + movdqa 32(%rsp),%xmm8 + movdqa 48(%rsp),%xmm9 + + movdqa %xmm12,%xmm10 + punpckldq %xmm13,%xmm12 + movdqa %xmm14,%xmm7 + punpckldq %xmm15,%xmm14 + punpckhdq %xmm13,%xmm10 + punpckhdq %xmm15,%xmm7 + movdqa %xmm12,%xmm13 + punpcklqdq %xmm14,%xmm12 + movdqa %xmm10,%xmm15 + punpcklqdq %xmm7,%xmm10 + punpckhqdq %xmm14,%xmm13 + punpckhqdq %xmm7,%xmm15 + paddd 192-256(%rcx),%xmm4 + paddd 208-256(%rcx),%xmm5 + paddd 224-256(%rcx),%xmm8 + paddd 240-256(%rcx),%xmm9 + + movdqa %xmm6,32(%rsp) + movdqa %xmm11,48(%rsp) + + movdqa %xmm4,%xmm14 + punpckldq %xmm5,%xmm4 + movdqa %xmm8,%xmm7 + punpckldq %xmm9,%xmm8 + punpckhdq %xmm5,%xmm14 + punpckhdq %xmm9,%xmm7 + movdqa %xmm4,%xmm5 + punpcklqdq %xmm8,%xmm4 + movdqa %xmm14,%xmm9 + punpcklqdq %xmm7,%xmm14 + punpckhqdq %xmm8,%xmm5 + punpckhqdq %xmm7,%xmm9 + paddd 256-256(%rcx),%xmm0 + paddd 272-256(%rcx),%xmm1 + paddd 288-256(%rcx),%xmm2 + paddd 304-256(%rcx),%xmm3 + + movdqa %xmm0,%xmm8 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm8 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm8,%xmm3 + punpcklqdq %xmm7,%xmm8 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + cmpq $256,%rdx + jb .Ltail4x + + movdqu 0(%rsi),%xmm6 + movdqu 16(%rsi),%xmm11 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm7 + pxor 0(%rsp),%xmm6 + pxor %xmm12,%xmm11 + pxor %xmm4,%xmm2 + pxor %xmm0,%xmm7 + + movdqu %xmm6,0(%rdi) + movdqu 64(%rsi),%xmm6 + movdqu %xmm11,16(%rdi) + movdqu 80(%rsi),%xmm11 + movdqu %xmm2,32(%rdi) + movdqu 96(%rsi),%xmm2 + movdqu %xmm7,48(%rdi) + movdqu 112(%rsi),%xmm7 + leaq 128(%rsi),%rsi + pxor 16(%rsp),%xmm6 + pxor %xmm13,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm1,%xmm7 + + movdqu %xmm6,64(%rdi) + movdqu 0(%rsi),%xmm6 + movdqu %xmm11,80(%rdi) + movdqu 16(%rsi),%xmm11 + movdqu %xmm2,96(%rdi) + movdqu 32(%rsi),%xmm2 + movdqu %xmm7,112(%rdi) + leaq 128(%rdi),%rdi + movdqu 48(%rsi),%xmm7 + pxor 32(%rsp),%xmm6 + pxor %xmm10,%xmm11 + pxor %xmm14,%xmm2 + pxor %xmm8,%xmm7 + + movdqu %xmm6,0(%rdi) + movdqu 64(%rsi),%xmm6 + movdqu %xmm11,16(%rdi) + movdqu 80(%rsi),%xmm11 + movdqu %xmm2,32(%rdi) + movdqu 96(%rsi),%xmm2 + movdqu %xmm7,48(%rdi) + movdqu 112(%rsi),%xmm7 + leaq 128(%rsi),%rsi + pxor 48(%rsp),%xmm6 + pxor %xmm15,%xmm11 + pxor %xmm9,%xmm2 + pxor %xmm3,%xmm7 + movdqu %xmm6,64(%rdi) + movdqu %xmm11,80(%rdi) + movdqu %xmm2,96(%rdi) + movdqu %xmm7,112(%rdi) + leaq 128(%rdi),%rdi + + subq $256,%rdx + jnz .Loop_outer4x + + jmp .Ldone4x + +.Ltail4x: + cmpq $192,%rdx + jae .L192_or_more4x + cmpq $128,%rdx + jae .L128_or_more4x + cmpq $64,%rdx + jae .L64_or_more4x + + + xorq %r10,%r10 + + movdqa %xmm12,16(%rsp) + movdqa %xmm4,32(%rsp) + movdqa %xmm0,48(%rsp) + jmp .Loop_tail4x + +.align 32 +.L64_or_more4x: + movdqu 0(%rsi),%xmm6 + movdqu 
16(%rsi),%xmm11 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm7 + pxor 0(%rsp),%xmm6 + pxor %xmm12,%xmm11 + pxor %xmm4,%xmm2 + pxor %xmm0,%xmm7 + movdqu %xmm6,0(%rdi) + movdqu %xmm11,16(%rdi) + movdqu %xmm2,32(%rdi) + movdqu %xmm7,48(%rdi) + je .Ldone4x + + movdqa 16(%rsp),%xmm6 + leaq 64(%rsi),%rsi + xorq %r10,%r10 + movdqa %xmm6,0(%rsp) + movdqa %xmm13,16(%rsp) + leaq 64(%rdi),%rdi + movdqa %xmm5,32(%rsp) + subq $64,%rdx + movdqa %xmm1,48(%rsp) + jmp .Loop_tail4x + +.align 32 +.L128_or_more4x: + movdqu 0(%rsi),%xmm6 + movdqu 16(%rsi),%xmm11 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm7 + pxor 0(%rsp),%xmm6 + pxor %xmm12,%xmm11 + pxor %xmm4,%xmm2 + pxor %xmm0,%xmm7 + + movdqu %xmm6,0(%rdi) + movdqu 64(%rsi),%xmm6 + movdqu %xmm11,16(%rdi) + movdqu 80(%rsi),%xmm11 + movdqu %xmm2,32(%rdi) + movdqu 96(%rsi),%xmm2 + movdqu %xmm7,48(%rdi) + movdqu 112(%rsi),%xmm7 + pxor 16(%rsp),%xmm6 + pxor %xmm13,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm1,%xmm7 + movdqu %xmm6,64(%rdi) + movdqu %xmm11,80(%rdi) + movdqu %xmm2,96(%rdi) + movdqu %xmm7,112(%rdi) + je .Ldone4x + + movdqa 32(%rsp),%xmm6 + leaq 128(%rsi),%rsi + xorq %r10,%r10 + movdqa %xmm6,0(%rsp) + movdqa %xmm10,16(%rsp) + leaq 128(%rdi),%rdi + movdqa %xmm14,32(%rsp) + subq $128,%rdx + movdqa %xmm8,48(%rsp) + jmp .Loop_tail4x + +.align 32 +.L192_or_more4x: + movdqu 0(%rsi),%xmm6 + movdqu 16(%rsi),%xmm11 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm7 + pxor 0(%rsp),%xmm6 + pxor %xmm12,%xmm11 + pxor %xmm4,%xmm2 + pxor %xmm0,%xmm7 + + movdqu %xmm6,0(%rdi) + movdqu 64(%rsi),%xmm6 + movdqu %xmm11,16(%rdi) + movdqu 80(%rsi),%xmm11 + movdqu %xmm2,32(%rdi) + movdqu 96(%rsi),%xmm2 + movdqu %xmm7,48(%rdi) + movdqu 112(%rsi),%xmm7 + leaq 128(%rsi),%rsi + pxor 16(%rsp),%xmm6 + pxor %xmm13,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm1,%xmm7 + + movdqu %xmm6,64(%rdi) + movdqu 0(%rsi),%xmm6 + movdqu %xmm11,80(%rdi) + movdqu 16(%rsi),%xmm11 + movdqu %xmm2,96(%rdi) + movdqu 32(%rsi),%xmm2 + movdqu %xmm7,112(%rdi) + leaq 128(%rdi),%rdi + movdqu 48(%rsi),%xmm7 + pxor 32(%rsp),%xmm6 + pxor %xmm10,%xmm11 + pxor %xmm14,%xmm2 + pxor %xmm8,%xmm7 + movdqu %xmm6,0(%rdi) + movdqu %xmm11,16(%rdi) + movdqu %xmm2,32(%rdi) + movdqu %xmm7,48(%rdi) + je .Ldone4x + + movdqa 48(%rsp),%xmm6 + leaq 64(%rsi),%rsi + xorq %r10,%r10 + movdqa %xmm6,0(%rsp) + movdqa %xmm15,16(%rsp) + leaq 64(%rdi),%rdi + movdqa %xmm9,32(%rsp) + subq $192,%rdx + movdqa %xmm3,48(%rsp) + +.Loop_tail4x: + movzbl (%rsi,%r10,1),%eax + movzbl (%rsp,%r10,1),%ecx + leaq 1(%r10),%r10 + xorl %ecx,%eax + movb %al,-1(%rdi,%r10,1) + decq %rdx + jnz .Loop_tail4x + +.Ldone4x: + addq $0x148+0,%rsp + .byte 0xf3,0xc3 +.size ChaCha20_4x,.-ChaCha20_4x +.type ChaCha20_8x,@function +.align 32 +ChaCha20_8x: +.LChaCha20_8x: + movq %rsp,%r10 + subq $0x280+8,%rsp + andq $-32,%rsp + vzeroupper + movq %r10,640(%rsp) + + + + + + + + + + + vbroadcasti128 .Lsigma(%rip),%ymm11 + vbroadcasti128 (%rcx),%ymm3 + vbroadcasti128 16(%rcx),%ymm15 + vbroadcasti128 (%r8),%ymm7 + leaq 256(%rsp),%rcx + leaq 512(%rsp),%rax + leaq .Lrot16(%rip),%r10 + leaq .Lrot24(%rip),%r11 + + vpshufd $0x00,%ymm11,%ymm8 + vpshufd $0x55,%ymm11,%ymm9 + vmovdqa %ymm8,128-256(%rcx) + vpshufd $0xaa,%ymm11,%ymm10 + vmovdqa %ymm9,160-256(%rcx) + vpshufd $0xff,%ymm11,%ymm11 + vmovdqa %ymm10,192-256(%rcx) + vmovdqa %ymm11,224-256(%rcx) + + vpshufd $0x00,%ymm3,%ymm0 + vpshufd $0x55,%ymm3,%ymm1 + vmovdqa %ymm0,256-256(%rcx) + vpshufd $0xaa,%ymm3,%ymm2 + vmovdqa %ymm1,288-256(%rcx) + vpshufd $0xff,%ymm3,%ymm3 + vmovdqa %ymm2,320-256(%rcx) + vmovdqa %ymm3,352-256(%rcx) + + vpshufd 
$0x00,%ymm15,%ymm12 + vpshufd $0x55,%ymm15,%ymm13 + vmovdqa %ymm12,384-512(%rax) + vpshufd $0xaa,%ymm15,%ymm14 + vmovdqa %ymm13,416-512(%rax) + vpshufd $0xff,%ymm15,%ymm15 + vmovdqa %ymm14,448-512(%rax) + vmovdqa %ymm15,480-512(%rax) + + vpshufd $0x00,%ymm7,%ymm4 + vpshufd $0x55,%ymm7,%ymm5 + vpaddd .Lincy(%rip),%ymm4,%ymm4 + vpshufd $0xaa,%ymm7,%ymm6 + vmovdqa %ymm5,544-512(%rax) + vpshufd $0xff,%ymm7,%ymm7 + vmovdqa %ymm6,576-512(%rax) + vmovdqa %ymm7,608-512(%rax) + + jmp .Loop_enter8x + +.align 32 +.Loop_outer8x: + vmovdqa 128-256(%rcx),%ymm8 + vmovdqa 160-256(%rcx),%ymm9 + vmovdqa 192-256(%rcx),%ymm10 + vmovdqa 224-256(%rcx),%ymm11 + vmovdqa 256-256(%rcx),%ymm0 + vmovdqa 288-256(%rcx),%ymm1 + vmovdqa 320-256(%rcx),%ymm2 + vmovdqa 352-256(%rcx),%ymm3 + vmovdqa 384-512(%rax),%ymm12 + vmovdqa 416-512(%rax),%ymm13 + vmovdqa 448-512(%rax),%ymm14 + vmovdqa 480-512(%rax),%ymm15 + vmovdqa 512-512(%rax),%ymm4 + vmovdqa 544-512(%rax),%ymm5 + vmovdqa 576-512(%rax),%ymm6 + vmovdqa 608-512(%rax),%ymm7 + vpaddd .Leight(%rip),%ymm4,%ymm4 + +.Loop_enter8x: + vmovdqa %ymm14,64(%rsp) + vmovdqa %ymm15,96(%rsp) + vbroadcasti128 (%r10),%ymm15 + vmovdqa %ymm4,512-512(%rax) + movl $10,%eax + jmp .Loop8x + +.align 32 +.Loop8x: + vpaddd %ymm0,%ymm8,%ymm8 + vpxor %ymm4,%ymm8,%ymm4 + vpshufb %ymm15,%ymm4,%ymm4 + vpaddd %ymm1,%ymm9,%ymm9 + vpxor %ymm5,%ymm9,%ymm5 + vpshufb %ymm15,%ymm5,%ymm5 + vpaddd %ymm4,%ymm12,%ymm12 + vpxor %ymm0,%ymm12,%ymm0 + vpslld $12,%ymm0,%ymm14 + vpsrld $20,%ymm0,%ymm0 + vpor %ymm0,%ymm14,%ymm0 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm5,%ymm13,%ymm13 + vpxor %ymm1,%ymm13,%ymm1 + vpslld $12,%ymm1,%ymm15 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm1,%ymm15,%ymm1 + vpaddd %ymm0,%ymm8,%ymm8 + vpxor %ymm4,%ymm8,%ymm4 + vpshufb %ymm14,%ymm4,%ymm4 + vpaddd %ymm1,%ymm9,%ymm9 + vpxor %ymm5,%ymm9,%ymm5 + vpshufb %ymm14,%ymm5,%ymm5 + vpaddd %ymm4,%ymm12,%ymm12 + vpxor %ymm0,%ymm12,%ymm0 + vpslld $7,%ymm0,%ymm15 + vpsrld $25,%ymm0,%ymm0 + vpor %ymm0,%ymm15,%ymm0 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm5,%ymm13,%ymm13 + vpxor %ymm1,%ymm13,%ymm1 + vpslld $7,%ymm1,%ymm14 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm1,%ymm14,%ymm1 + vmovdqa %ymm12,0(%rsp) + vmovdqa %ymm13,32(%rsp) + vmovdqa 64(%rsp),%ymm12 + vmovdqa 96(%rsp),%ymm13 + vpaddd %ymm2,%ymm10,%ymm10 + vpxor %ymm6,%ymm10,%ymm6 + vpshufb %ymm15,%ymm6,%ymm6 + vpaddd %ymm3,%ymm11,%ymm11 + vpxor %ymm7,%ymm11,%ymm7 + vpshufb %ymm15,%ymm7,%ymm7 + vpaddd %ymm6,%ymm12,%ymm12 + vpxor %ymm2,%ymm12,%ymm2 + vpslld $12,%ymm2,%ymm14 + vpsrld $20,%ymm2,%ymm2 + vpor %ymm2,%ymm14,%ymm2 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm7,%ymm13,%ymm13 + vpxor %ymm3,%ymm13,%ymm3 + vpslld $12,%ymm3,%ymm15 + vpsrld $20,%ymm3,%ymm3 + vpor %ymm3,%ymm15,%ymm3 + vpaddd %ymm2,%ymm10,%ymm10 + vpxor %ymm6,%ymm10,%ymm6 + vpshufb %ymm14,%ymm6,%ymm6 + vpaddd %ymm3,%ymm11,%ymm11 + vpxor %ymm7,%ymm11,%ymm7 + vpshufb %ymm14,%ymm7,%ymm7 + vpaddd %ymm6,%ymm12,%ymm12 + vpxor %ymm2,%ymm12,%ymm2 + vpslld $7,%ymm2,%ymm15 + vpsrld $25,%ymm2,%ymm2 + vpor %ymm2,%ymm15,%ymm2 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm7,%ymm13,%ymm13 + vpxor %ymm3,%ymm13,%ymm3 + vpslld $7,%ymm3,%ymm14 + vpsrld $25,%ymm3,%ymm3 + vpor %ymm3,%ymm14,%ymm3 + vpaddd %ymm1,%ymm8,%ymm8 + vpxor %ymm7,%ymm8,%ymm7 + vpshufb %ymm15,%ymm7,%ymm7 + vpaddd %ymm2,%ymm9,%ymm9 + vpxor %ymm4,%ymm9,%ymm4 + vpshufb %ymm15,%ymm4,%ymm4 + vpaddd %ymm7,%ymm12,%ymm12 + vpxor %ymm1,%ymm12,%ymm1 + vpslld $12,%ymm1,%ymm14 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm1,%ymm14,%ymm1 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm4,%ymm13,%ymm13 + vpxor 
%ymm2,%ymm13,%ymm2 + vpslld $12,%ymm2,%ymm15 + vpsrld $20,%ymm2,%ymm2 + vpor %ymm2,%ymm15,%ymm2 + vpaddd %ymm1,%ymm8,%ymm8 + vpxor %ymm7,%ymm8,%ymm7 + vpshufb %ymm14,%ymm7,%ymm7 + vpaddd %ymm2,%ymm9,%ymm9 + vpxor %ymm4,%ymm9,%ymm4 + vpshufb %ymm14,%ymm4,%ymm4 + vpaddd %ymm7,%ymm12,%ymm12 + vpxor %ymm1,%ymm12,%ymm1 + vpslld $7,%ymm1,%ymm15 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm1,%ymm15,%ymm1 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm4,%ymm13,%ymm13 + vpxor %ymm2,%ymm13,%ymm2 + vpslld $7,%ymm2,%ymm14 + vpsrld $25,%ymm2,%ymm2 + vpor %ymm2,%ymm14,%ymm2 + vmovdqa %ymm12,64(%rsp) + vmovdqa %ymm13,96(%rsp) + vmovdqa 0(%rsp),%ymm12 + vmovdqa 32(%rsp),%ymm13 + vpaddd %ymm3,%ymm10,%ymm10 + vpxor %ymm5,%ymm10,%ymm5 + vpshufb %ymm15,%ymm5,%ymm5 + vpaddd %ymm0,%ymm11,%ymm11 + vpxor %ymm6,%ymm11,%ymm6 + vpshufb %ymm15,%ymm6,%ymm6 + vpaddd %ymm5,%ymm12,%ymm12 + vpxor %ymm3,%ymm12,%ymm3 + vpslld $12,%ymm3,%ymm14 + vpsrld $20,%ymm3,%ymm3 + vpor %ymm3,%ymm14,%ymm3 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm6,%ymm13,%ymm13 + vpxor %ymm0,%ymm13,%ymm0 + vpslld $12,%ymm0,%ymm15 + vpsrld $20,%ymm0,%ymm0 + vpor %ymm0,%ymm15,%ymm0 + vpaddd %ymm3,%ymm10,%ymm10 + vpxor %ymm5,%ymm10,%ymm5 + vpshufb %ymm14,%ymm5,%ymm5 + vpaddd %ymm0,%ymm11,%ymm11 + vpxor %ymm6,%ymm11,%ymm6 + vpshufb %ymm14,%ymm6,%ymm6 + vpaddd %ymm5,%ymm12,%ymm12 + vpxor %ymm3,%ymm12,%ymm3 + vpslld $7,%ymm3,%ymm15 + vpsrld $25,%ymm3,%ymm3 + vpor %ymm3,%ymm15,%ymm3 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm6,%ymm13,%ymm13 + vpxor %ymm0,%ymm13,%ymm0 + vpslld $7,%ymm0,%ymm14 + vpsrld $25,%ymm0,%ymm0 + vpor %ymm0,%ymm14,%ymm0 + decl %eax + jnz .Loop8x + + leaq 512(%rsp),%rax + vpaddd 128-256(%rcx),%ymm8,%ymm8 + vpaddd 160-256(%rcx),%ymm9,%ymm9 + vpaddd 192-256(%rcx),%ymm10,%ymm10 + vpaddd 224-256(%rcx),%ymm11,%ymm11 + + vpunpckldq %ymm9,%ymm8,%ymm14 + vpunpckldq %ymm11,%ymm10,%ymm15 + vpunpckhdq %ymm9,%ymm8,%ymm8 + vpunpckhdq %ymm11,%ymm10,%ymm10 + vpunpcklqdq %ymm15,%ymm14,%ymm9 + vpunpckhqdq %ymm15,%ymm14,%ymm14 + vpunpcklqdq %ymm10,%ymm8,%ymm11 + vpunpckhqdq %ymm10,%ymm8,%ymm8 + vpaddd 256-256(%rcx),%ymm0,%ymm0 + vpaddd 288-256(%rcx),%ymm1,%ymm1 + vpaddd 320-256(%rcx),%ymm2,%ymm2 + vpaddd 352-256(%rcx),%ymm3,%ymm3 + + vpunpckldq %ymm1,%ymm0,%ymm10 + vpunpckldq %ymm3,%ymm2,%ymm15 + vpunpckhdq %ymm1,%ymm0,%ymm0 + vpunpckhdq %ymm3,%ymm2,%ymm2 + vpunpcklqdq %ymm15,%ymm10,%ymm1 + vpunpckhqdq %ymm15,%ymm10,%ymm10 + vpunpcklqdq %ymm2,%ymm0,%ymm3 + vpunpckhqdq %ymm2,%ymm0,%ymm0 + vperm2i128 $0x20,%ymm1,%ymm9,%ymm15 + vperm2i128 $0x31,%ymm1,%ymm9,%ymm1 + vperm2i128 $0x20,%ymm10,%ymm14,%ymm9 + vperm2i128 $0x31,%ymm10,%ymm14,%ymm10 + vperm2i128 $0x20,%ymm3,%ymm11,%ymm14 + vperm2i128 $0x31,%ymm3,%ymm11,%ymm3 + vperm2i128 $0x20,%ymm0,%ymm8,%ymm11 + vperm2i128 $0x31,%ymm0,%ymm8,%ymm0 + vmovdqa %ymm15,0(%rsp) + vmovdqa %ymm9,32(%rsp) + vmovdqa 64(%rsp),%ymm15 + vmovdqa 96(%rsp),%ymm9 + + vpaddd 384-512(%rax),%ymm12,%ymm12 + vpaddd 416-512(%rax),%ymm13,%ymm13 + vpaddd 448-512(%rax),%ymm15,%ymm15 + vpaddd 480-512(%rax),%ymm9,%ymm9 + + vpunpckldq %ymm13,%ymm12,%ymm2 + vpunpckldq %ymm9,%ymm15,%ymm8 + vpunpckhdq %ymm13,%ymm12,%ymm12 + vpunpckhdq %ymm9,%ymm15,%ymm15 + vpunpcklqdq %ymm8,%ymm2,%ymm13 + vpunpckhqdq %ymm8,%ymm2,%ymm2 + vpunpcklqdq %ymm15,%ymm12,%ymm9 + vpunpckhqdq %ymm15,%ymm12,%ymm12 + vpaddd 512-512(%rax),%ymm4,%ymm4 + vpaddd 544-512(%rax),%ymm5,%ymm5 + vpaddd 576-512(%rax),%ymm6,%ymm6 + vpaddd 608-512(%rax),%ymm7,%ymm7 + + vpunpckldq %ymm5,%ymm4,%ymm15 + vpunpckldq %ymm7,%ymm6,%ymm8 + vpunpckhdq %ymm5,%ymm4,%ymm4 + vpunpckhdq %ymm7,%ymm6,%ymm6 + 
vpunpcklqdq %ymm8,%ymm15,%ymm5 + vpunpckhqdq %ymm8,%ymm15,%ymm15 + vpunpcklqdq %ymm6,%ymm4,%ymm7 + vpunpckhqdq %ymm6,%ymm4,%ymm4 + vperm2i128 $0x20,%ymm5,%ymm13,%ymm8 + vperm2i128 $0x31,%ymm5,%ymm13,%ymm5 + vperm2i128 $0x20,%ymm15,%ymm2,%ymm13 + vperm2i128 $0x31,%ymm15,%ymm2,%ymm15 + vperm2i128 $0x20,%ymm7,%ymm9,%ymm2 + vperm2i128 $0x31,%ymm7,%ymm9,%ymm7 + vperm2i128 $0x20,%ymm4,%ymm12,%ymm9 + vperm2i128 $0x31,%ymm4,%ymm12,%ymm4 + vmovdqa 0(%rsp),%ymm6 + vmovdqa 32(%rsp),%ymm12 + + cmpq $512,%rdx + jb .Ltail8x + + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + leaq 128(%rsi),%rsi + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + leaq 128(%rdi),%rdi + + vpxor 0(%rsi),%ymm12,%ymm12 + vpxor 32(%rsi),%ymm13,%ymm13 + vpxor 64(%rsi),%ymm10,%ymm10 + vpxor 96(%rsi),%ymm15,%ymm15 + leaq 128(%rsi),%rsi + vmovdqu %ymm12,0(%rdi) + vmovdqu %ymm13,32(%rdi) + vmovdqu %ymm10,64(%rdi) + vmovdqu %ymm15,96(%rdi) + leaq 128(%rdi),%rdi + + vpxor 0(%rsi),%ymm14,%ymm14 + vpxor 32(%rsi),%ymm2,%ymm2 + vpxor 64(%rsi),%ymm3,%ymm3 + vpxor 96(%rsi),%ymm7,%ymm7 + leaq 128(%rsi),%rsi + vmovdqu %ymm14,0(%rdi) + vmovdqu %ymm2,32(%rdi) + vmovdqu %ymm3,64(%rdi) + vmovdqu %ymm7,96(%rdi) + leaq 128(%rdi),%rdi + + vpxor 0(%rsi),%ymm11,%ymm11 + vpxor 32(%rsi),%ymm9,%ymm9 + vpxor 64(%rsi),%ymm0,%ymm0 + vpxor 96(%rsi),%ymm4,%ymm4 + leaq 128(%rsi),%rsi + vmovdqu %ymm11,0(%rdi) + vmovdqu %ymm9,32(%rdi) + vmovdqu %ymm0,64(%rdi) + vmovdqu %ymm4,96(%rdi) + leaq 128(%rdi),%rdi + + subq $512,%rdx + jnz .Loop_outer8x + + jmp .Ldone8x + +.Ltail8x: + cmpq $448,%rdx + jae .L448_or_more8x + cmpq $384,%rdx + jae .L384_or_more8x + cmpq $320,%rdx + jae .L320_or_more8x + cmpq $256,%rdx + jae .L256_or_more8x + cmpq $192,%rdx + jae .L192_or_more8x + cmpq $128,%rdx + jae .L128_or_more8x + cmpq $64,%rdx + jae .L64_or_more8x + + xorq %r10,%r10 + vmovdqa %ymm6,0(%rsp) + vmovdqa %ymm8,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L64_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + je .Ldone8x + + leaq 64(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm1,0(%rsp) + leaq 64(%rdi),%rdi + subq $64,%rdx + vmovdqa %ymm5,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L128_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + je .Ldone8x + + leaq 128(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm12,0(%rsp) + leaq 128(%rdi),%rdi + subq $128,%rdx + vmovdqa %ymm13,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L192_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + je .Ldone8x + + leaq 192(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm10,0(%rsp) + leaq 192(%rdi),%rdi + subq $192,%rdx + vmovdqa %ymm15,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L256_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) 
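+# tail handling: each .LNNN_or_more8x branch xors and stores the whole
+# 32-byte lanes that fit, then stashes the next two lanes on the stack and
+# falls through to the byte-at-a-time .Loop_tail8x for the remainder.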
+ vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + je .Ldone8x + + leaq 256(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm14,0(%rsp) + leaq 256(%rdi),%rdi + subq $256,%rdx + vmovdqa %ymm2,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L320_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vpxor 256(%rsi),%ymm14,%ymm14 + vpxor 288(%rsi),%ymm2,%ymm2 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + vmovdqu %ymm14,256(%rdi) + vmovdqu %ymm2,288(%rdi) + je .Ldone8x + + leaq 320(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm3,0(%rsp) + leaq 320(%rdi),%rdi + subq $320,%rdx + vmovdqa %ymm7,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L384_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vpxor 256(%rsi),%ymm14,%ymm14 + vpxor 288(%rsi),%ymm2,%ymm2 + vpxor 320(%rsi),%ymm3,%ymm3 + vpxor 352(%rsi),%ymm7,%ymm7 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + vmovdqu %ymm14,256(%rdi) + vmovdqu %ymm2,288(%rdi) + vmovdqu %ymm3,320(%rdi) + vmovdqu %ymm7,352(%rdi) + je .Ldone8x + + leaq 384(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm11,0(%rsp) + leaq 384(%rdi),%rdi + subq $384,%rdx + vmovdqa %ymm9,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L448_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vpxor 256(%rsi),%ymm14,%ymm14 + vpxor 288(%rsi),%ymm2,%ymm2 + vpxor 320(%rsi),%ymm3,%ymm3 + vpxor 352(%rsi),%ymm7,%ymm7 + vpxor 384(%rsi),%ymm11,%ymm11 + vpxor 416(%rsi),%ymm9,%ymm9 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + vmovdqu %ymm14,256(%rdi) + vmovdqu %ymm2,288(%rdi) + vmovdqu %ymm3,320(%rdi) + vmovdqu %ymm7,352(%rdi) + vmovdqu %ymm11,384(%rdi) + vmovdqu %ymm9,416(%rdi) + je .Ldone8x + + leaq 448(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm0,0(%rsp) + leaq 448(%rdi),%rdi + subq $448,%rdx + vmovdqa %ymm4,32(%rsp) + +.Loop_tail8x: + movzbl (%rsi,%r10,1),%eax + movzbl (%rsp,%r10,1),%ecx + leaq 1(%r10),%r10 + xorl %ecx,%eax + movb %al,-1(%rdi,%r10,1) + decq %rdx + jnz .Loop_tail8x + +.Ldone8x: + vzeroall + movq 640(%rsp),%rsp + .byte 0xf3,0xc3 +.size ChaCha20_8x,.-ChaCha20_8x +#endif diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/ec/p256-x86_64-asm.S b/packager/third_party/boringssl/linux-x86_64/crypto/ec/p256-x86_64-asm.S new file mode 100644 index 0000000000..4abce6f91e --- /dev/null +++ b/packager/third_party/boringssl/linux-x86_64/crypto/ec/p256-x86_64-asm.S @@ -0,0 +1,1789 @@ +#if defined(__x86_64__) +.text +.extern 
OPENSSL_ia32cap_P +.hidden OPENSSL_ia32cap_P + + +.align 64 +.Lpoly: +.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001 + +.LOne: +.long 1,1,1,1,1,1,1,1 +.LTwo: +.long 2,2,2,2,2,2,2,2 +.LThree: +.long 3,3,3,3,3,3,3,3 +.LONE_mont: +.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe + +.type ecp_nistz256_mul_by_2,@function +.align 64 +ecp_nistz256_mul_by_2: + pushq %r12 + pushq %r13 + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + addq %r8,%r8 + movq 16(%rsi),%r10 + adcq %r9,%r9 + movq 24(%rsi),%r11 + leaq .Lpoly(%rip),%rsi + movq %r8,%rax + adcq %r10,%r10 + adcq %r11,%r11 + movq %r9,%rdx + sbbq %r13,%r13 + + subq 0(%rsi),%r8 + movq %r10,%rcx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r11,%r12 + sbbq 24(%rsi),%r11 + testq %r13,%r13 + + cmovzq %rax,%r8 + cmovzq %rdx,%r9 + movq %r8,0(%rdi) + cmovzq %rcx,%r10 + movq %r9,8(%rdi) + cmovzq %r12,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + popq %r13 + popq %r12 + .byte 0xf3,0xc3 +.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 + + + +.globl ecp_nistz256_neg +.hidden ecp_nistz256_neg +.type ecp_nistz256_neg,@function +.align 32 +ecp_nistz256_neg: + pushq %r12 + pushq %r13 + + xorq %r8,%r8 + xorq %r9,%r9 + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r13,%r13 + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r8,%rax + sbbq 24(%rsi),%r11 + leaq .Lpoly(%rip),%rsi + movq %r9,%rdx + sbbq $0,%r13 + + addq 0(%rsi),%r8 + movq %r10,%rcx + adcq 8(%rsi),%r9 + adcq 16(%rsi),%r10 + movq %r11,%r12 + adcq 24(%rsi),%r11 + testq %r13,%r13 + + cmovzq %rax,%r8 + cmovzq %rdx,%r9 + movq %r8,0(%rdi) + cmovzq %rcx,%r10 + movq %r9,8(%rdi) + cmovzq %r12,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + popq %r13 + popq %r12 + .byte 0xf3,0xc3 +.size ecp_nistz256_neg,.-ecp_nistz256_neg + + + + + + +.globl ecp_nistz256_mul_mont +.hidden ecp_nistz256_mul_mont +.type ecp_nistz256_mul_mont,@function +.align 32 +ecp_nistz256_mul_mont: +.Lmul_mont: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rdx,%rbx + movq 0(%rdx),%rax + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + + call __ecp_nistz256_mul_montq +.Lmul_mont_done: + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + .byte 0xf3,0xc3 +.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont + +.type __ecp_nistz256_mul_montq,@function +.align 32 +__ecp_nistz256_mul_montq: + + + movq %rax,%rbp + mulq %r9 + movq .Lpoly+8(%rip),%r14 + movq %rax,%r8 + movq %rbp,%rax + movq %rdx,%r9 + + mulq %r10 + movq .Lpoly+24(%rip),%r15 + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r11 + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r12 + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + xorq %r13,%r13 + movq %rdx,%r12 + + + + + + + + + + + movq %r8,%rbp + shlq $32,%r8 + mulq %r15 + shrq $32,%rbp + addq %r8,%r9 + adcq %rbp,%r10 + adcq %rax,%r11 + movq 8(%rbx),%rax + adcq %rdx,%r12 + adcq $0,%r13 + xorq %r8,%r8 + + + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq %rdx,%r13 + adcq $0,%r8 + + + + movq %r9,%rbp + shlq $32,%r9 + mulq %r15 + shrq $32,%rbp + 
addq %r9,%r10 + adcq %rbp,%r11 + adcq %rax,%r12 + movq 16(%rbx),%rax + adcq %rdx,%r13 + adcq $0,%r8 + xorq %r9,%r9 + + + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + + + movq %r10,%rbp + shlq $32,%r10 + mulq %r15 + shrq $32,%rbp + addq %r10,%r11 + adcq %rbp,%r12 + adcq %rax,%r13 + movq 24(%rbx),%rax + adcq %rdx,%r8 + adcq $0,%r9 + xorq %r10,%r10 + + + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + + + movq %r11,%rbp + shlq $32,%r11 + mulq %r15 + shrq $32,%rbp + addq %r11,%r12 + adcq %rbp,%r13 + movq %r12,%rcx + adcq %rax,%r8 + adcq %rdx,%r9 + movq %r13,%rbp + adcq $0,%r10 + + + + subq $-1,%r12 + movq %r8,%rbx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%rdx + sbbq %r15,%r9 + sbbq $0,%r10 + + cmovcq %rcx,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rbx,%r8 + movq %r13,8(%rdi) + cmovcq %rdx,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq + + + + + + + + +.globl ecp_nistz256_sqr_mont +.hidden ecp_nistz256_sqr_mont +.type ecp_nistz256_sqr_mont,@function +.align 32 +ecp_nistz256_sqr_mont: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq 0(%rsi),%rax + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + + call __ecp_nistz256_sqr_montq +.Lsqr_mont_done: + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + .byte 0xf3,0xc3 +.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont + +.type __ecp_nistz256_sqr_montq,@function +.align 32 +__ecp_nistz256_sqr_montq: + movq %rax,%r13 + mulq %r14 + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + + mulq %r13 + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r13 + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r12 + + + mulq %r14 + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq %r14 + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + + mulq %r15 + xorq %r15,%r15 + addq %rax,%r13 + movq 0(%rsi),%rax + movq %rdx,%r14 + adcq $0,%r14 + + addq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + adcq %r14,%r14 + adcq $0,%r15 + + mulq %rax + movq %rax,%r8 + movq 8(%rsi),%rax + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r9 + adcq %rax,%r10 + movq 16(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r11 + adcq %rax,%r12 + movq 24(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r13 + adcq %rax,%r14 + movq %r8,%rax + adcq %rdx,%r15 + + movq .Lpoly+8(%rip),%rsi + movq .Lpoly+24(%rip),%rbp + + + + + movq %r8,%rcx + shlq $32,%r8 + mulq %rbp + shrq $32,%rcx + addq %r8,%r9 + adcq %rcx,%r10 + adcq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + + + + movq %r9,%rcx + shlq 
$32,%r9 + movq %rdx,%r8 + mulq %rbp + shrq $32,%rcx + addq %r9,%r10 + adcq %rcx,%r11 + adcq %rax,%r8 + movq %r10,%rax + adcq $0,%rdx + + + + movq %r10,%rcx + shlq $32,%r10 + movq %rdx,%r9 + mulq %rbp + shrq $32,%rcx + addq %r10,%r11 + adcq %rcx,%r8 + adcq %rax,%r9 + movq %r11,%rax + adcq $0,%rdx + + + + movq %r11,%rcx + shlq $32,%r11 + movq %rdx,%r10 + mulq %rbp + shrq $32,%rcx + addq %r11,%r8 + adcq %rcx,%r9 + adcq %rax,%r10 + adcq $0,%rdx + xorq %r11,%r11 + + + + addq %r8,%r12 + adcq %r9,%r13 + movq %r12,%r8 + adcq %r10,%r14 + adcq %rdx,%r15 + movq %r13,%r9 + adcq $0,%r11 + + subq $-1,%r12 + movq %r14,%r10 + sbbq %rsi,%r13 + sbbq $0,%r14 + movq %r15,%rcx + sbbq %rbp,%r15 + sbbq $0,%r11 + + cmovcq %r8,%r12 + cmovcq %r9,%r13 + movq %r12,0(%rdi) + cmovcq %r10,%r14 + movq %r13,8(%rdi) + cmovcq %rcx,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + + .byte 0xf3,0xc3 +.size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq + + + + + + +.globl ecp_nistz256_from_mont +.hidden ecp_nistz256_from_mont +.type ecp_nistz256_from_mont,@function +.align 32 +ecp_nistz256_from_mont: + pushq %r12 + pushq %r13 + + movq 0(%rsi),%rax + movq .Lpoly+24(%rip),%r13 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq %rax,%r8 + movq .Lpoly+8(%rip),%r12 + + + + movq %rax,%rcx + shlq $32,%r8 + mulq %r13 + shrq $32,%rcx + addq %r8,%r9 + adcq %rcx,%r10 + adcq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + + + + movq %r9,%rcx + shlq $32,%r9 + movq %rdx,%r8 + mulq %r13 + shrq $32,%rcx + addq %r9,%r10 + adcq %rcx,%r11 + adcq %rax,%r8 + movq %r10,%rax + adcq $0,%rdx + + + + movq %r10,%rcx + shlq $32,%r10 + movq %rdx,%r9 + mulq %r13 + shrq $32,%rcx + addq %r10,%r11 + adcq %rcx,%r8 + adcq %rax,%r9 + movq %r11,%rax + adcq $0,%rdx + + + + movq %r11,%rcx + shlq $32,%r11 + movq %rdx,%r10 + mulq %r13 + shrq $32,%rcx + addq %r11,%r8 + adcq %rcx,%r9 + movq %r8,%rcx + adcq %rax,%r10 + movq %r9,%rsi + adcq $0,%rdx + + subq $-1,%r8 + movq %r10,%rax + sbbq %r12,%r9 + sbbq $0,%r10 + movq %rdx,%r11 + sbbq %r13,%rdx + sbbq %r13,%r13 + + cmovnzq %rcx,%r8 + cmovnzq %rsi,%r9 + movq %r8,0(%rdi) + cmovnzq %rax,%r10 + movq %r9,8(%rdi) + cmovzq %rdx,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + popq %r13 + popq %r12 + .byte 0xf3,0xc3 +.size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont + + +.globl ecp_nistz256_select_w5 +.hidden ecp_nistz256_select_w5 +.type ecp_nistz256_select_w5,@function +.align 32 +ecp_nistz256_select_w5: + movdqa .LOne(%rip),%xmm0 + movd %edx,%xmm1 + + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + + movdqa %xmm0,%xmm8 + pshufd $0,%xmm1,%xmm1 + + movq $16,%rax +.Lselect_loop_sse_w5: + + movdqa %xmm8,%xmm15 + paddd %xmm0,%xmm8 + pcmpeqd %xmm1,%xmm15 + + movdqa 0(%rsi),%xmm9 + movdqa 16(%rsi),%xmm10 + movdqa 32(%rsi),%xmm11 + movdqa 48(%rsi),%xmm12 + movdqa 64(%rsi),%xmm13 + movdqa 80(%rsi),%xmm14 + leaq 96(%rsi),%rsi + + pand %xmm15,%xmm9 + pand %xmm15,%xmm10 + por %xmm9,%xmm2 + pand %xmm15,%xmm11 + por %xmm10,%xmm3 + pand %xmm15,%xmm12 + por %xmm11,%xmm4 + pand %xmm15,%xmm13 + por %xmm12,%xmm5 + pand %xmm15,%xmm14 + por %xmm13,%xmm6 + por %xmm14,%xmm7 + + decq %rax + jnz .Lselect_loop_sse_w5 + + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + movdqu %xmm4,32(%rdi) + movdqu %xmm5,48(%rdi) + movdqu %xmm6,64(%rdi) + movdqu %xmm7,80(%rdi) + .byte 0xf3,0xc3 +.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5 + + + +.globl ecp_nistz256_select_w7 +.hidden ecp_nistz256_select_w7 +.type ecp_nistz256_select_w7,@function +.align 32 
+ecp_nistz256_select_w7: + movdqa .LOne(%rip),%xmm8 + movd %edx,%xmm1 + + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + + movdqa %xmm8,%xmm0 + pshufd $0,%xmm1,%xmm1 + movq $64,%rax + +.Lselect_loop_sse_w7: + movdqa %xmm8,%xmm15 + paddd %xmm0,%xmm8 + movdqa 0(%rsi),%xmm9 + movdqa 16(%rsi),%xmm10 + pcmpeqd %xmm1,%xmm15 + movdqa 32(%rsi),%xmm11 + movdqa 48(%rsi),%xmm12 + leaq 64(%rsi),%rsi + + pand %xmm15,%xmm9 + pand %xmm15,%xmm10 + por %xmm9,%xmm2 + pand %xmm15,%xmm11 + por %xmm10,%xmm3 + pand %xmm15,%xmm12 + por %xmm11,%xmm4 + prefetcht0 255(%rsi) + por %xmm12,%xmm5 + + decq %rax + jnz .Lselect_loop_sse_w7 + + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + movdqu %xmm4,32(%rdi) + movdqu %xmm5,48(%rdi) + .byte 0xf3,0xc3 +.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 +.globl ecp_nistz256_avx2_select_w7 +.hidden ecp_nistz256_avx2_select_w7 +.type ecp_nistz256_avx2_select_w7,@function +.align 32 +ecp_nistz256_avx2_select_w7: +.byte 0x0f,0x0b + .byte 0xf3,0xc3 +.size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7 +.type __ecp_nistz256_add_toq,@function +.align 32 +__ecp_nistz256_add_toq: + addq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + movq %r12,%rax + adcq 16(%rbx),%r8 + adcq 24(%rbx),%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + cmovzq %rbp,%r13 + movq %r12,0(%rdi) + cmovzq %rcx,%r8 + movq %r13,8(%rdi) + cmovzq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq + +.type __ecp_nistz256_sub_fromq,@function +.align 32 +__ecp_nistz256_sub_fromq: + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r13 + movq %r12,%rax + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + addq $-1,%r12 + movq %r8,%rcx + adcq %r14,%r13 + adcq $0,%r8 + movq %r9,%r10 + adcq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + cmovzq %rbp,%r13 + movq %r12,0(%rdi) + cmovzq %rcx,%r8 + movq %r13,8(%rdi) + cmovzq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq + +.type __ecp_nistz256_subq,@function +.align 32 +__ecp_nistz256_subq: + subq %r12,%rax + sbbq %r13,%rbp + movq %rax,%r12 + sbbq %r8,%rcx + sbbq %r9,%r10 + movq %rbp,%r13 + sbbq %r11,%r11 + + addq $-1,%rax + movq %rcx,%r8 + adcq %r14,%rbp + adcq $0,%rcx + movq %r10,%r9 + adcq %r15,%r10 + testq %r11,%r11 + + cmovnzq %rax,%r12 + cmovnzq %rbp,%r13 + cmovnzq %rcx,%r8 + cmovnzq %r10,%r9 + + .byte 0xf3,0xc3 +.size __ecp_nistz256_subq,.-__ecp_nistz256_subq + +.type __ecp_nistz256_mul_by_2q,@function +.align 32 +__ecp_nistz256_mul_by_2q: + addq %r12,%r12 + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + cmovzq %rbp,%r13 + movq %r12,0(%rdi) + cmovzq %rcx,%r8 + movq %r13,8(%rdi) + cmovzq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q +.globl ecp_nistz256_point_double +.hidden ecp_nistz256_point_double +.type ecp_nistz256_point_double,@function +.align 32 +ecp_nistz256_point_double: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $160+8,%rsp + +.Lpoint_double_shortcutq: + movdqu 0(%rsi),%xmm0 + movq %rsi,%rbx + movdqu 16(%rsi),%xmm1 + movq 32+0(%rsi),%r12 
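+# a nistz256 point is three 32-byte field elements laid out contiguously:
+# X at offset 0, Y at 32 (being loaded into %r12..%r9 here) and Z at 64.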
+ movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r8 + movq 32+24(%rsi),%r9 + movq .Lpoly+8(%rip),%r14 + movq .Lpoly+24(%rip),%r15 + movdqa %xmm0,96(%rsp) + movdqa %xmm1,96+16(%rsp) + leaq 32(%rdi),%r10 + leaq 64(%rdi),%r11 +.byte 102,72,15,110,199 +.byte 102,73,15,110,202 +.byte 102,73,15,110,211 + + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + leaq 64-0(%rsi),%rsi + leaq 64(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 0(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 32(%rbx),%rax + movq 64+0(%rbx),%r9 + movq 64+8(%rbx),%r10 + movq 64+16(%rbx),%r11 + movq 64+24(%rbx),%r12 + leaq 64-0(%rbx),%rsi + leaq 32(%rbx),%rbx +.byte 102,72,15,126,215 + call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_by_2q + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_toq + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 +.byte 102,72,15,126,207 + call __ecp_nistz256_sqr_montq + xorq %r9,%r9 + movq %r12,%rax + addq $-1,%r12 + movq %r13,%r10 + adcq %rsi,%r13 + movq %r14,%rcx + adcq $0,%r14 + movq %r15,%r8 + adcq %rbp,%r15 + adcq $0,%r9 + xorq %rsi,%rsi + testq $1,%rax + + cmovzq %rax,%r12 + cmovzq %r10,%r13 + cmovzq %rcx,%r14 + cmovzq %r8,%r15 + cmovzq %rsi,%r9 + + movq %r13,%rax + shrq $1,%r12 + shlq $63,%rax + movq %r14,%r10 + shrq $1,%r13 + orq %rax,%r12 + shlq $63,%r10 + movq %r15,%rcx + shrq $1,%r14 + orq %r10,%r13 + shlq $63,%rcx + movq %r12,0(%rdi) + shrq $1,%r15 + movq %r13,8(%rdi) + shlq $63,%r9 + orq %rcx,%r14 + orq %r9,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + movq 64(%rsp),%rax + leaq 64(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + leaq 32(%rsp),%rbx + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_toq + + movq 96(%rsp),%rax + leaq 96(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + movq 0+32(%rsp),%rax + movq 8+32(%rsp),%r14 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r15 + movq 24+32(%rsp),%r8 +.byte 102,72,15,126,199 + call __ecp_nistz256_sqr_montq + + leaq 128(%rsp),%rbx + movq %r14,%r8 + movq %r15,%r9 + movq %rsi,%r14 + movq %rbp,%r15 + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 0(%rsp),%rdi + call __ecp_nistz256_subq + + movq 32(%rsp),%rax + leaq 32(%rsp),%rbx + movq %r12,%r14 + xorl %ecx,%ecx + movq %r12,0+0(%rsp) + movq %r13,%r10 + movq %r13,0+8(%rsp) + cmovzq %r8,%r11 + movq %r8,0+16(%rsp) + leaq 0-0(%rsp),%rsi + cmovzq %r9,%r12 + movq %r9,0+24(%rsp) + movq %r14,%r9 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + +.byte 102,72,15,126,203 +.byte 102,72,15,126,207 + call __ecp_nistz256_sub_fromq + + addq $160+8,%rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq 
%rbx + popq %rbp + .byte 0xf3,0xc3 +.size ecp_nistz256_point_double,.-ecp_nistz256_point_double +.globl ecp_nistz256_point_add +.hidden ecp_nistz256_point_add +.type ecp_nistz256_point_add,@function +.align 32 +ecp_nistz256_point_add: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $576+8,%rsp + + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq %rsi,%rbx + movq %rdx,%rsi + movdqa %xmm0,384(%rsp) + movdqa %xmm1,384+16(%rsp) + por %xmm0,%xmm1 + movdqa %xmm2,416(%rsp) + movdqa %xmm3,416+16(%rsp) + por %xmm2,%xmm3 + movdqa %xmm4,448(%rsp) + movdqa %xmm5,448+16(%rsp) + por %xmm1,%xmm3 + + movdqu 0(%rsi),%xmm0 + pshufd $0xb1,%xmm3,%xmm5 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rsi),%xmm3 + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,480(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,480+16(%rsp) + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + movdqa %xmm2,512(%rsp) + movdqa %xmm3,512+16(%rsp) + por %xmm2,%xmm3 + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm1,%xmm3 + + leaq 64-0(%rsi),%rsi + movq %rax,544+0(%rsp) + movq %r14,544+8(%rsp) + movq %r15,544+16(%rsp) + movq %r8,544+24(%rsp) + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm3,%xmm4 + por %xmm3,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + movq 64+0(%rbx),%rax + movq 64+8(%rbx),%r14 + movq 64+16(%rbx),%r15 + movq 64+24(%rbx),%r8 +.byte 102,72,15,110,203 + + leaq 64-0(%rbx),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 544(%rsp),%rax + leaq 544(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 416(%rsp),%rax + leaq 416(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq 0+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 512(%rsp),%rax + leaq 512(%rsp),%rbx + movq 0+256(%rsp),%r9 + movq 8+256(%rsp),%r10 + leaq 0+256(%rsp),%rsi + movq 16+256(%rsp),%r11 + movq 24+256(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 224(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + orq %r13,%r12 + movdqa %xmm4,%xmm2 + orq %r8,%r12 + orq %r9,%r12 + por %xmm5,%xmm2 +.byte 102,73,15,110,220 + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 480(%rsp),%rax + leaq 480(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 160(%rsp),%rbx + leaq 0(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + orq %r13,%r12 + orq %r8,%r12 + orq %r9,%r12 + +.byte 0x3e + jnz .Ladd_proceedq +.byte 102,73,15,126,208 +.byte 102,73,15,126,217 + testq %r8,%r8 + jnz .Ladd_proceedq + testq %r9,%r9 + jz 
.Ladd_doubleq + +.byte 102,72,15,126,199 + pxor %xmm0,%xmm0 + movdqu %xmm0,0(%rdi) + movdqu %xmm0,16(%rdi) + movdqu %xmm0,32(%rdi) + movdqu %xmm0,48(%rdi) + movdqu %xmm0,64(%rdi) + movdqu %xmm0,80(%rdi) + jmp .Ladd_doneq + +.align 32 +.Ladd_doubleq: +.byte 102,72,15,126,206 +.byte 102,72,15,126,199 + addq $416,%rsp + jmp .Lpoint_double_shortcutq + +.align 32 +.Ladd_proceedq: + movq 0+64(%rsp),%rax + movq 8+64(%rsp),%r14 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 544(%rsp),%rax + leaq 544(%rsp),%rbx + movq 0+352(%rsp),%r9 + movq 8+352(%rsp),%r10 + leaq 0+352(%rsp),%rsi + movq 16+352(%rsp),%r11 + movq 24+352(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 0(%rsp),%rax + leaq 0(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 160(%rsp),%rax + leaq 160(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montq + + + + + addq %r12,%r12 + leaq 96(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + movq 0(%rsi),%rax + cmovzq %rbp,%r13 + movq 8(%rsi),%rbp + cmovzq %rcx,%r8 + movq 16(%rsi),%rcx + cmovzq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subq + + leaq 128(%rsp),%rbx + leaq 288(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 192+0(%rsp),%rax + movq 192+8(%rsp),%rbp + movq 192+16(%rsp),%rcx + movq 192+24(%rsp),%r10 + leaq 320(%rsp),%rdi + + call __ecp_nistz256_subq + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 128(%rsp),%rax + leaq 128(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq 0+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 320(%rsp),%rax + leaq 320(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 320(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 256(%rsp),%rbx + leaq 320(%rsp),%rdi + call __ecp_nistz256_sub_fromq + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 352(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 352+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 544(%rsp),%xmm2 + pand 544+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 480(%rsp),%xmm2 + pand 480+16(%rsp),%xmm3 + por %xmm0,%xmm2 
+ por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 320(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 320+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 512(%rsp),%xmm2 + pand 512+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + +.Ladd_doneq: + addq $576+8,%rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + .byte 0xf3,0xc3 +.size ecp_nistz256_point_add,.-ecp_nistz256_point_add +.globl ecp_nistz256_point_add_affine +.hidden ecp_nistz256_point_add_affine +.type ecp_nistz256_point_add_affine,@function +.align 32 +ecp_nistz256_point_add_affine: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $480+8,%rsp + + movdqu 0(%rsi),%xmm0 + movq %rdx,%rbx + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,320(%rsp) + movdqa %xmm1,320+16(%rsp) + por %xmm0,%xmm1 + movdqa %xmm2,352(%rsp) + movdqa %xmm3,352+16(%rsp) + por %xmm2,%xmm3 + movdqa %xmm4,384(%rsp) + movdqa %xmm5,384+16(%rsp) + por %xmm1,%xmm3 + + movdqu 0(%rbx),%xmm0 + pshufd $0xb1,%xmm3,%xmm5 + movdqu 16(%rbx),%xmm1 + movdqu 32(%rbx),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rbx),%xmm3 + movdqa %xmm0,416(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,416+16(%rsp) + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + movdqa %xmm2,448(%rsp) + movdqa %xmm3,448+16(%rsp) + por %xmm2,%xmm3 + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm1,%xmm3 + + leaq 64-0(%rsi),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm3,%xmm4 + movq 0(%rbx),%rax + + movq %r12,%r9 + por %xmm3,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + movq %r13,%r10 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + movq %r14,%r11 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + + leaq 32-0(%rsp),%rsi + movq %r15,%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 320(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 288(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 352(%rsp),%rbx + leaq 96(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+64(%rsp),%rax + movq 8+64(%rsp),%r14 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 128(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 0+96(%rsp),%rax + movq 8+96(%rsp),%r14 + leaq 0+96(%rsp),%rsi + 
movq 16+96(%rsp),%r15 + movq 24+96(%rsp),%r8 + leaq 192(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 128(%rsp),%rax + leaq 128(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 320(%rsp),%rax + leaq 320(%rsp),%rbx + movq 0+128(%rsp),%r9 + movq 8+128(%rsp),%r10 + leaq 0+128(%rsp),%rsi + movq 16+128(%rsp),%r11 + movq 24+128(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + + + + addq %r12,%r12 + leaq 192(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + movq 0(%rsi),%rax + cmovzq %rbp,%r13 + movq 8(%rsi),%rbp + cmovzq %rcx,%r8 + movq 16(%rsi),%rcx + cmovzq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subq + + leaq 160(%rsp),%rbx + leaq 224(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 64(%rsp),%rdi + + call __ecp_nistz256_subq + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 352(%rsp),%rax + leaq 352(%rsp),%rbx + movq 0+160(%rsp),%r9 + movq 8+160(%rsp),%r10 + leaq 0+160(%rsp),%rsi + movq 16+160(%rsp),%r11 + movq 24+160(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 96(%rsp),%rax + leaq 96(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 64(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 32(%rsp),%rbx + leaq 256(%rsp),%rdi + call __ecp_nistz256_sub_fromq + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand .LONE_mont(%rip),%xmm2 + pand .LONE_mont+16(%rip),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 224(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 224+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 320(%rsp),%xmm2 + pand 320+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 256(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 256+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 352(%rsp),%xmm2 + pand 352+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + + addq $480+8,%rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + .byte 0xf3,0xc3 +.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine +#endif diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/md5/md5-x86_64.S 
b/packager/third_party/boringssl/linux-x86_64/crypto/md5/md5-x86_64.S index 7644689644..05369e2a77 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/md5/md5-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/md5/md5-x86_64.S @@ -495,14 +495,14 @@ md5_block_asm_data_order: movl %ecx,%r11d addl %ecx,%ebx movl 0(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d xorl %edx,%r11d leal -198630844(%rax,%r10,1),%eax orl %ebx,%r11d xorl %ecx,%r11d addl %r11d,%eax movl 28(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $6,%eax xorl %ecx,%r11d addl %ebx,%eax @@ -511,7 +511,7 @@ md5_block_asm_data_order: xorl %ebx,%r11d addl %r11d,%edx movl 56(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $10,%edx xorl %ebx,%r11d addl %eax,%edx @@ -520,7 +520,7 @@ md5_block_asm_data_order: xorl %eax,%r11d addl %r11d,%ecx movl 20(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $15,%ecx xorl %eax,%r11d addl %edx,%ecx @@ -529,7 +529,7 @@ md5_block_asm_data_order: xorl %edx,%r11d addl %r11d,%ebx movl 48(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $21,%ebx xorl %edx,%r11d addl %ecx,%ebx @@ -538,7 +538,7 @@ md5_block_asm_data_order: xorl %ecx,%r11d addl %r11d,%eax movl 12(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $6,%eax xorl %ecx,%r11d addl %ebx,%eax @@ -547,7 +547,7 @@ md5_block_asm_data_order: xorl %ebx,%r11d addl %r11d,%edx movl 40(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $10,%edx xorl %ebx,%r11d addl %eax,%edx @@ -556,7 +556,7 @@ md5_block_asm_data_order: xorl %eax,%r11d addl %r11d,%ecx movl 4(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $15,%ecx xorl %eax,%r11d addl %edx,%ecx @@ -565,7 +565,7 @@ md5_block_asm_data_order: xorl %edx,%r11d addl %r11d,%ebx movl 32(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $21,%ebx xorl %edx,%r11d addl %ecx,%ebx @@ -574,7 +574,7 @@ md5_block_asm_data_order: xorl %ecx,%r11d addl %r11d,%eax movl 60(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $6,%eax xorl %ecx,%r11d addl %ebx,%eax @@ -583,7 +583,7 @@ md5_block_asm_data_order: xorl %ebx,%r11d addl %r11d,%edx movl 24(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $10,%edx xorl %ebx,%r11d addl %eax,%edx @@ -592,7 +592,7 @@ md5_block_asm_data_order: xorl %eax,%r11d addl %r11d,%ecx movl 52(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $15,%ecx xorl %eax,%r11d addl %edx,%ecx @@ -601,7 +601,7 @@ md5_block_asm_data_order: xorl %edx,%r11d addl %r11d,%ebx movl 16(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $21,%ebx xorl %edx,%r11d addl %ecx,%ebx @@ -610,7 +610,7 @@ md5_block_asm_data_order: xorl %ecx,%r11d addl %r11d,%eax movl 44(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $6,%eax xorl %ecx,%r11d addl %ebx,%eax @@ -619,7 +619,7 @@ md5_block_asm_data_order: xorl %ebx,%r11d addl %r11d,%edx movl 8(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $10,%edx xorl %ebx,%r11d addl %eax,%edx @@ -628,7 +628,7 @@ md5_block_asm_data_order: xorl %eax,%r11d addl %r11d,%ecx movl 36(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $15,%ecx xorl %eax,%r11d addl %edx,%ecx @@ -637,7 +637,7 @@ md5_block_asm_data_order: xorl %edx,%r11d addl %r11d,%ebx movl 0(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $21,%ebx xorl %edx,%r11d addl %ecx,%ebx diff --git 
a/packager/third_party/boringssl/linux-x86_64/crypto/modes/ghash-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/modes/ghash-x86_64.S index 1db7d69d4b..b47bdc9bd9 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/modes/ghash-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/modes/ghash-x86_64.S @@ -23,14 +23,14 @@ gcm_gmult_4bit: movq $14,%rcx movq 8(%rsi,%rax,1),%r8 movq (%rsi,%rax,1),%r9 - andb $240,%bl + andb $0xf0,%bl movq %r8,%rdx jmp .Loop1 .align 16 .Loop1: shrq $4,%r8 - andq $15,%rdx + andq $0xf,%rdx movq %r9,%r10 movb (%rdi,%rcx,1),%al shrq $4,%r9 @@ -46,13 +46,13 @@ gcm_gmult_4bit: js .Lbreak1 shrq $4,%r8 - andq $15,%rdx + andq $0xf,%rdx movq %r9,%r10 shrq $4,%r9 xorq 8(%rsi,%rax,1),%r8 shlq $60,%r10 xorq (%rsi,%rax,1),%r9 - andb $240,%bl + andb $0xf0,%bl xorq (%r11,%rdx,8),%r9 movq %r8,%rdx xorq %r10,%r8 @@ -61,19 +61,19 @@ gcm_gmult_4bit: .align 16 .Lbreak1: shrq $4,%r8 - andq $15,%rdx + andq $0xf,%rdx movq %r9,%r10 shrq $4,%r9 xorq 8(%rsi,%rax,1),%r8 shlq $60,%r10 xorq (%rsi,%rax,1),%r9 - andb $240,%bl + andb $0xf0,%bl xorq (%r11,%rdx,8),%r9 movq %r8,%rdx xorq %r10,%r8 shrq $4,%r8 - andq $15,%rdx + andq $0xf,%rdx movq %r9,%r10 shrq $4,%r9 xorq 8(%rsi,%rbx,1),%r8 @@ -881,20 +881,20 @@ gcm_ghash_clmul: movdqu 32(%rsi),%xmm7 .byte 102,65,15,56,0,194 - subq $16,%rcx + subq $0x10,%rcx jz .Lodd_tail movdqu 16(%rsi),%xmm6 movl OPENSSL_ia32cap_P+4(%rip),%eax - cmpq $48,%rcx + cmpq $0x30,%rcx jb .Lskip4x andl $71303168,%eax cmpl $4194304,%eax je .Lskip4x - subq $48,%rcx - movq $11547335547999543296,%rax + subq $0x30,%rcx + movq $0xA040608020C0E000,%rax movdqu 48(%rsi),%xmm14 movdqu 64(%rsi),%xmm15 @@ -941,7 +941,7 @@ gcm_ghash_clmul: xorps %xmm13,%xmm5 leaq 64(%rdx),%rdx - subq $64,%rcx + subq $0x40,%rcx jc .Ltail4x jmp .Lmod4_loop @@ -1024,7 +1024,7 @@ gcm_ghash_clmul: xorps %xmm13,%xmm5 leaq 64(%rdx),%rdx - subq $64,%rcx + subq $0x40,%rcx jnc .Lmod4_loop .Ltail4x: @@ -1068,10 +1068,10 @@ gcm_ghash_clmul: pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 - addq $64,%rcx + addq $0x40,%rcx jz .Ldone movdqu 32(%rsi),%xmm7 - subq $16,%rcx + subq $0x10,%rcx jz .Lodd_tail .Lskip4x: @@ -1094,7 +1094,7 @@ gcm_ghash_clmul: leaq 32(%rdx),%rdx nop - subq $32,%rcx + subq $0x20,%rcx jbe .Leven_tail nop jmp .Lmod_loop @@ -1157,7 +1157,7 @@ gcm_ghash_clmul: .byte 102,15,58,68,231,0 pxor %xmm1,%xmm0 - subq $32,%rcx + subq $0x20,%rcx ja .Lmod_loop .Leven_tail: diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/sha/sha1-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/sha/sha1-x86_64.S index 7668c2b1f5..d830b534de 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/sha/sha1-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/sha/sha1-x86_64.S @@ -13,6 +13,11 @@ sha1_block_data_order: movl OPENSSL_ia32cap_P+8(%rip),%r10d testl $512,%r8d jz .Lialu + andl $268435456,%r8d + andl $1073741824,%r9d + orl %r9d,%r8d + cmpl $1342177280,%r8d + je _avx_shortcut jmp _ssse3_shortcut .align 16 @@ -2408,6 +2413,1122 @@ _ssse3_shortcut: .Lepilogue_ssse3: .byte 0xf3,0xc3 .size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3 +.type sha1_block_data_order_avx,@function +.align 16 +sha1_block_data_order_avx: +_avx_shortcut: + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + leaq -64(%rsp),%rsp + vzeroupper + movq %rax,%r14 + andq $-64,%rsp + movq %rdi,%r8 + movq %rsi,%r9 + movq %rdx,%r10 + + shlq $6,%r10 + addq %r9,%r10 + leaq K_XX_XX+64(%rip),%r11 + + movl 0(%r8),%eax + movl 4(%r8),%ebx 
+ movl 8(%r8),%ecx + movl 12(%r8),%edx + movl %ebx,%esi + movl 16(%r8),%ebp + movl %ecx,%edi + xorl %edx,%edi + andl %edi,%esi + + vmovdqa 64(%r11),%xmm6 + vmovdqa -64(%r11),%xmm11 + vmovdqu 0(%r9),%xmm0 + vmovdqu 16(%r9),%xmm1 + vmovdqu 32(%r9),%xmm2 + vmovdqu 48(%r9),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + addq $64,%r9 + vpshufb %xmm6,%xmm1,%xmm1 + vpshufb %xmm6,%xmm2,%xmm2 + vpshufb %xmm6,%xmm3,%xmm3 + vpaddd %xmm11,%xmm0,%xmm4 + vpaddd %xmm11,%xmm1,%xmm5 + vpaddd %xmm11,%xmm2,%xmm6 + vmovdqa %xmm4,0(%rsp) + vmovdqa %xmm5,16(%rsp) + vmovdqa %xmm6,32(%rsp) + jmp .Loop_avx +.align 16 +.Loop_avx: + shrdl $2,%ebx,%ebx + xorl %edx,%esi + vpalignr $8,%xmm0,%xmm1,%xmm4 + movl %eax,%edi + addl 0(%rsp),%ebp + vpaddd %xmm3,%xmm11,%xmm9 + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrldq $4,%xmm3,%xmm8 + addl %esi,%ebp + andl %ebx,%edi + vpxor %xmm0,%xmm4,%xmm4 + xorl %ecx,%ebx + addl %eax,%ebp + vpxor %xmm2,%xmm8,%xmm8 + shrdl $7,%eax,%eax + xorl %ecx,%edi + movl %ebp,%esi + addl 4(%rsp),%edx + vpxor %xmm8,%xmm4,%xmm4 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vmovdqa %xmm9,48(%rsp) + addl %edi,%edx + andl %eax,%esi + vpsrld $31,%xmm4,%xmm8 + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%esi + vpslldq $12,%xmm4,%xmm10 + vpaddd %xmm4,%xmm4,%xmm4 + movl %edx,%edi + addl 8(%rsp),%ecx + xorl %eax,%ebp + shldl $5,%edx,%edx + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm4,%xmm4 + addl %esi,%ecx + andl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm4,%xmm4 + shrdl $7,%edx,%edx + xorl %eax,%edi + movl %ecx,%esi + addl 12(%rsp),%ebx + vpxor %xmm10,%xmm4,%xmm4 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + addl %edi,%ebx + andl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %ebp,%esi + vpalignr $8,%xmm1,%xmm2,%xmm5 + movl %ebx,%edi + addl 16(%rsp),%eax + vpaddd %xmm4,%xmm11,%xmm9 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrldq $4,%xmm4,%xmm8 + addl %esi,%eax + andl %ecx,%edi + vpxor %xmm1,%xmm5,%xmm5 + xorl %edx,%ecx + addl %ebx,%eax + vpxor %xmm3,%xmm8,%xmm8 + shrdl $7,%ebx,%ebx + xorl %edx,%edi + movl %eax,%esi + addl 20(%rsp),%ebp + vpxor %xmm8,%xmm5,%xmm5 + xorl %ecx,%ebx + shldl $5,%eax,%eax + vmovdqa %xmm9,0(%rsp) + addl %edi,%ebp + andl %ebx,%esi + vpsrld $31,%xmm5,%xmm8 + xorl %ecx,%ebx + addl %eax,%ebp + shrdl $7,%eax,%eax + xorl %ecx,%esi + vpslldq $12,%xmm5,%xmm10 + vpaddd %xmm5,%xmm5,%xmm5 + movl %ebp,%edi + addl 24(%rsp),%edx + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm5,%xmm5 + addl %esi,%edx + andl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm5,%xmm5 + shrdl $7,%ebp,%ebp + xorl %ebx,%edi + movl %edx,%esi + addl 28(%rsp),%ecx + vpxor %xmm10,%xmm5,%xmm5 + xorl %eax,%ebp + shldl $5,%edx,%edx + vmovdqa -32(%r11),%xmm11 + addl %edi,%ecx + andl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + vpalignr $8,%xmm2,%xmm3,%xmm6 + movl %ecx,%edi + addl 32(%rsp),%ebx + vpaddd %xmm5,%xmm11,%xmm9 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + vpsrldq $4,%xmm5,%xmm8 + addl %esi,%ebx + andl %edx,%edi + vpxor %xmm2,%xmm6,%xmm6 + xorl %ebp,%edx + addl %ecx,%ebx + vpxor %xmm4,%xmm8,%xmm8 + shrdl $7,%ecx,%ecx + xorl %ebp,%edi + movl %ebx,%esi + addl 36(%rsp),%eax + vpxor %xmm8,%xmm6,%xmm6 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vmovdqa %xmm9,16(%rsp) + addl %edi,%eax + andl %ecx,%esi + vpsrld $31,%xmm6,%xmm8 + xorl %edx,%ecx + addl %ebx,%eax + shrdl $7,%ebx,%ebx + xorl %edx,%esi + vpslldq $12,%xmm6,%xmm10 + vpaddd %xmm6,%xmm6,%xmm6 + movl 
%eax,%edi + addl 40(%rsp),%ebp + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm6,%xmm6 + addl %esi,%ebp + andl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm6,%xmm6 + shrdl $7,%eax,%eax + xorl %ecx,%edi + movl %ebp,%esi + addl 44(%rsp),%edx + vpxor %xmm10,%xmm6,%xmm6 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + addl %edi,%edx + andl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%esi + vpalignr $8,%xmm3,%xmm4,%xmm7 + movl %edx,%edi + addl 48(%rsp),%ecx + vpaddd %xmm6,%xmm11,%xmm9 + xorl %eax,%ebp + shldl $5,%edx,%edx + vpsrldq $4,%xmm6,%xmm8 + addl %esi,%ecx + andl %ebp,%edi + vpxor %xmm3,%xmm7,%xmm7 + xorl %eax,%ebp + addl %edx,%ecx + vpxor %xmm5,%xmm8,%xmm8 + shrdl $7,%edx,%edx + xorl %eax,%edi + movl %ecx,%esi + addl 52(%rsp),%ebx + vpxor %xmm8,%xmm7,%xmm7 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + vmovdqa %xmm9,32(%rsp) + addl %edi,%ebx + andl %edx,%esi + vpsrld $31,%xmm7,%xmm8 + xorl %ebp,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %ebp,%esi + vpslldq $12,%xmm7,%xmm10 + vpaddd %xmm7,%xmm7,%xmm7 + movl %ebx,%edi + addl 56(%rsp),%eax + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm7,%xmm7 + addl %esi,%eax + andl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm7,%xmm7 + shrdl $7,%ebx,%ebx + xorl %edx,%edi + movl %eax,%esi + addl 60(%rsp),%ebp + vpxor %xmm10,%xmm7,%xmm7 + xorl %ecx,%ebx + shldl $5,%eax,%eax + addl %edi,%ebp + andl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm6,%xmm7,%xmm8 + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + xorl %ecx,%esi + movl %ebp,%edi + addl 0(%rsp),%edx + vpxor %xmm1,%xmm0,%xmm0 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vpaddd %xmm7,%xmm11,%xmm9 + addl %esi,%edx + andl %eax,%edi + vpxor %xmm8,%xmm0,%xmm0 + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%edi + vpsrld $30,%xmm0,%xmm8 + vmovdqa %xmm9,48(%rsp) + movl %edx,%esi + addl 4(%rsp),%ecx + xorl %eax,%ebp + shldl $5,%edx,%edx + vpslld $2,%xmm0,%xmm0 + addl %edi,%ecx + andl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + movl %ecx,%edi + addl 8(%rsp),%ebx + vpor %xmm8,%xmm0,%xmm0 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + addl %esi,%ebx + andl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 12(%rsp),%eax + xorl %ebp,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm7,%xmm0,%xmm8 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + vpxor %xmm2,%xmm1,%xmm1 + addl %esi,%ebp + xorl %ecx,%edi + vpaddd %xmm0,%xmm11,%xmm9 + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpxor %xmm8,%xmm1,%xmm1 + addl 20(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + vpsrld $30,%xmm1,%xmm8 + vmovdqa %xmm9,0(%rsp) + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpslld $2,%xmm1,%xmm1 + addl 24(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpor %xmm8,%xmm1,%xmm1 + addl 28(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpalignr $8,%xmm0,%xmm1,%xmm8 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + vpxor %xmm3,%xmm2,%xmm2 + addl %esi,%eax + xorl %edx,%edi + vpaddd %xmm1,%xmm11,%xmm9 + 
vmovdqa 0(%r11),%xmm11 + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpxor %xmm8,%xmm2,%xmm2 + addl 36(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm8 + vmovdqa %xmm9,16(%rsp) + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpslld $2,%xmm2,%xmm2 + addl 40(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpor %xmm8,%xmm2,%xmm2 + addl 44(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpalignr $8,%xmm1,%xmm2,%xmm8 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + vpxor %xmm4,%xmm3,%xmm3 + addl %esi,%ebx + xorl %ebp,%edi + vpaddd %xmm2,%xmm11,%xmm9 + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpxor %xmm8,%xmm3,%xmm3 + addl 52(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm8 + vmovdqa %xmm9,32(%rsp) + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpslld $2,%xmm3,%xmm3 + addl 56(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpor %xmm8,%xmm3,%xmm3 + addl 60(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpalignr $8,%xmm2,%xmm3,%xmm8 + vpxor %xmm0,%xmm4,%xmm4 + addl 0(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + vpxor %xmm5,%xmm4,%xmm4 + addl %esi,%ecx + xorl %eax,%edi + vpaddd %xmm3,%xmm11,%xmm9 + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpxor %xmm8,%xmm4,%xmm4 + addl 4(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + vpsrld $30,%xmm4,%xmm8 + vmovdqa %xmm9,48(%rsp) + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpslld $2,%xmm4,%xmm4 + addl 8(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpor %xmm8,%xmm4,%xmm4 + addl 12(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm3,%xmm4,%xmm8 + vpxor %xmm1,%xmm5,%xmm5 + addl 16(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + vpxor %xmm6,%xmm5,%xmm5 + addl %esi,%edx + xorl %ebx,%edi + vpaddd %xmm4,%xmm11,%xmm9 + shrdl $7,%eax,%eax + addl %ebp,%edx + vpxor %xmm8,%xmm5,%xmm5 + addl 20(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + vpsrld $30,%xmm5,%xmm8 + vmovdqa %xmm9,0(%rsp) + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpslld $2,%xmm5,%xmm5 + addl 24(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpor %xmm8,%xmm5,%xmm5 + addl 28(%rsp),%eax + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm4,%xmm5,%xmm8 + vpxor %xmm2,%xmm6,%xmm6 + addl 32(%rsp),%ebp + andl %ecx,%esi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vpxor %xmm7,%xmm6,%xmm6 + movl %eax,%edi + xorl %ecx,%esi + vpaddd %xmm5,%xmm11,%xmm9 + shldl $5,%eax,%eax + addl %esi,%ebp + vpxor %xmm8,%xmm6,%xmm6 + xorl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + addl 36(%rsp),%edx + vpsrld $30,%xmm6,%xmm8 + vmovdqa %xmm9,16(%rsp) + andl %ebx,%edi 
+ xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%esi + vpslld $2,%xmm6,%xmm6 + xorl %ebx,%edi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + addl 40(%rsp),%ecx + andl %eax,%esi + vpor %xmm8,%xmm6,%xmm6 + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%edi + xorl %eax,%esi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + addl 44(%rsp),%ebx + andl %ebp,%edi + xorl %eax,%ebp + shrdl $7,%edx,%edx + movl %ecx,%esi + xorl %ebp,%edi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + vpalignr $8,%xmm5,%xmm6,%xmm8 + vpxor %xmm3,%xmm7,%xmm7 + addl 48(%rsp),%eax + andl %edx,%esi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + vpxor %xmm0,%xmm7,%xmm7 + movl %ebx,%edi + xorl %edx,%esi + vpaddd %xmm6,%xmm11,%xmm9 + vmovdqa 32(%r11),%xmm11 + shldl $5,%ebx,%ebx + addl %esi,%eax + vpxor %xmm8,%xmm7,%xmm7 + xorl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + addl 52(%rsp),%ebp + vpsrld $30,%xmm7,%xmm8 + vmovdqa %xmm9,32(%rsp) + andl %ecx,%edi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + vpslld $2,%xmm7,%xmm7 + xorl %ecx,%edi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + addl 56(%rsp),%edx + andl %ebx,%esi + vpor %xmm8,%xmm7,%xmm7 + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%edi + xorl %ebx,%esi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + addl 60(%rsp),%ecx + andl %eax,%edi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%esi + xorl %eax,%edi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + vpalignr $8,%xmm6,%xmm7,%xmm8 + vpxor %xmm4,%xmm0,%xmm0 + addl 0(%rsp),%ebx + andl %ebp,%esi + xorl %eax,%ebp + shrdl $7,%edx,%edx + vpxor %xmm1,%xmm0,%xmm0 + movl %ecx,%edi + xorl %ebp,%esi + vpaddd %xmm7,%xmm11,%xmm9 + shldl $5,%ecx,%ecx + addl %esi,%ebx + vpxor %xmm8,%xmm0,%xmm0 + xorl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 4(%rsp),%eax + vpsrld $30,%xmm0,%xmm8 + vmovdqa %xmm9,48(%rsp) + andl %edx,%edi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + vpslld $2,%xmm0,%xmm0 + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + addl 8(%rsp),%ebp + andl %ecx,%esi + vpor %xmm8,%xmm0,%xmm0 + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%edi + xorl %ecx,%esi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + addl 12(%rsp),%edx + andl %ebx,%edi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%esi + xorl %ebx,%edi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + vpalignr $8,%xmm7,%xmm0,%xmm8 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%rsp),%ecx + andl %eax,%esi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + vpxor %xmm2,%xmm1,%xmm1 + movl %edx,%edi + xorl %eax,%esi + vpaddd %xmm0,%xmm11,%xmm9 + shldl $5,%edx,%edx + addl %esi,%ecx + vpxor %xmm8,%xmm1,%xmm1 + xorl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + addl 20(%rsp),%ebx + vpsrld $30,%xmm1,%xmm8 + vmovdqa %xmm9,0(%rsp) + andl %ebp,%edi + xorl %eax,%ebp + shrdl $7,%edx,%edx + movl %ecx,%esi + vpslld $2,%xmm1,%xmm1 + xorl %ebp,%edi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + addl 24(%rsp),%eax + andl %edx,%esi + vpor %xmm8,%xmm1,%xmm1 + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%edi + xorl %edx,%esi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + addl 28(%rsp),%ebp + 
andl %ecx,%edi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + xorl %ecx,%edi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm0,%xmm1,%xmm8 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%rsp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vpxor %xmm3,%xmm2,%xmm2 + movl %ebp,%edi + xorl %ebx,%esi + vpaddd %xmm1,%xmm11,%xmm9 + shldl $5,%ebp,%ebp + addl %esi,%edx + vpxor %xmm8,%xmm2,%xmm2 + xorl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + addl 36(%rsp),%ecx + vpsrld $30,%xmm2,%xmm8 + vmovdqa %xmm9,16(%rsp) + andl %eax,%edi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%esi + vpslld $2,%xmm2,%xmm2 + xorl %eax,%edi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + addl 40(%rsp),%ebx + andl %ebp,%esi + vpor %xmm8,%xmm2,%xmm2 + xorl %eax,%ebp + shrdl $7,%edx,%edx + movl %ecx,%edi + xorl %ebp,%esi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 44(%rsp),%eax + andl %edx,%edi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + addl %ebx,%eax + vpalignr $8,%xmm1,%xmm2,%xmm8 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + vpxor %xmm4,%xmm3,%xmm3 + addl %esi,%ebp + xorl %ecx,%edi + vpaddd %xmm2,%xmm11,%xmm9 + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpxor %xmm8,%xmm3,%xmm3 + addl 52(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + vpsrld $30,%xmm3,%xmm8 + vmovdqa %xmm9,32(%rsp) + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpslld $2,%xmm3,%xmm3 + addl 56(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpor %xmm8,%xmm3,%xmm3 + addl 60(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 0(%rsp),%eax + vpaddd %xmm3,%xmm11,%xmm9 + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + vmovdqa %xmm9,48(%rsp) + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 4(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 8(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 12(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + cmpq %r10,%r9 + je .Ldone_avx + vmovdqa 64(%r11),%xmm6 + vmovdqa -64(%r11),%xmm11 + vmovdqu 0(%r9),%xmm0 + vmovdqu 16(%r9),%xmm1 + vmovdqu 32(%r9),%xmm2 + vmovdqu 48(%r9),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + addq $64,%r9 + addl 16(%rsp),%ebx + xorl %ebp,%esi + vpshufb %xmm6,%xmm1,%xmm1 + movl %ecx,%edi + shldl $5,%ecx,%ecx + vpaddd %xmm11,%xmm0,%xmm4 + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vmovdqa %xmm4,0(%rsp) + addl 20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl 
%ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi + vpshufb %xmm6,%xmm2,%xmm2 + movl %edx,%edi + shldl $5,%edx,%edx + vpaddd %xmm11,%xmm1,%xmm5 + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vmovdqa %xmm5,16(%rsp) + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + vpshufb %xmm6,%xmm3,%xmm3 + movl %ebp,%edi + shldl $5,%ebp,%ebp + vpaddd %xmm11,%xmm2,%xmm6 + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + vmovdqa %xmm6,32(%rsp) + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 0(%r8),%eax + addl 4(%r8),%esi + addl 8(%r8),%ecx + addl 12(%r8),%edx + movl %eax,0(%r8) + addl 16(%r8),%ebp + movl %esi,4(%r8) + movl %esi,%ebx + movl %ecx,8(%r8) + movl %ecx,%edi + movl %edx,12(%r8) + xorl %edx,%edi + movl %ebp,16(%r8) + andl %edi,%esi + jmp .Loop_avx + +.align 16 +.Ldone_avx: + addl 16(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vzeroupper + + addl 0(%r8),%eax + addl 4(%r8),%esi + addl 8(%r8),%ecx + movl %eax,0(%r8) + addl 12(%r8),%edx + movl %esi,4(%r8) + addl 16(%r8),%ebp + 
movl %ecx,8(%r8) + movl %edx,12(%r8) + movl %ebp,16(%r8) + leaq (%r14),%rsi + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp +.Lepilogue_avx: + .byte 0xf3,0xc3 +.size sha1_block_data_order_avx,.-sha1_block_data_order_avx .align 64 K_XX_XX: .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/sha/sha256-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/sha/sha256-x86_64.S index f526de51ad..445b497e88 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/sha/sha256-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/sha/sha256-x86_64.S @@ -12,6 +12,11 @@ sha256_block_data_order: movl 0(%r11),%r9d movl 4(%r11),%r10d movl 8(%r11),%r11d + andl $1073741824,%r9d + andl $268435968,%r10d + orl %r9d,%r10d + cmpl $1342177792,%r10d + je .Lavx_shortcut testl $512,%r10d jnz .Lssse3_shortcut pushq %rbx @@ -2841,4 +2846,1061 @@ sha256_block_data_order_ssse3: .Lepilogue_ssse3: .byte 0xf3,0xc3 .size sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3 +.type sha256_block_data_order_avx,@function +.align 64 +sha256_block_data_order_avx: +.Lavx_shortcut: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rsp,%r11 + shlq $4,%rdx + subq $96,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %r11,64+24(%rsp) +.Lprologue_avx: + + vzeroupper + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + vmovdqa K256+512+32(%rip),%xmm8 + vmovdqa K256+512+64(%rip),%xmm9 + jmp .Lloop_avx +.align 16 +.Lloop_avx: + vmovdqa K256+512(%rip),%xmm7 + vmovdqu 0(%rsi),%xmm0 + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm7,%xmm0,%xmm0 + leaq K256(%rip),%rbp + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd 0(%rbp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 32(%rbp),%xmm1,%xmm5 + vpaddd 64(%rbp),%xmm2,%xmm6 + vpaddd 96(%rbp),%xmm3,%xmm7 + vmovdqa %xmm4,0(%rsp) + movl %eax,%r14d + vmovdqa %xmm5,16(%rsp) + movl %ebx,%edi + vmovdqa %xmm6,32(%rsp) + xorl %ecx,%edi + vmovdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lavx_00_47 + +.align 16 +.Lavx_00_47: + subq $-128,%rbp + vpalignr $4,%xmm0,%xmm1,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm2,%xmm3,%xmm7 + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm0,%xmm0 + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + vpshufd $250,%xmm3,%xmm7 + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + vpxor %xmm5,%xmm4,%xmm4 
+ shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm0,%xmm0 + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpaddd %xmm6,%xmm0,%xmm0 + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + vpshufd $80,%xmm0,%xmm7 + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + vpaddd %xmm6,%xmm0,%xmm0 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpaddd 0(%rbp),%xmm0,%xmm6 + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,0(%rsp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm3,%xmm0,%xmm7 + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm1,%xmm1 + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + vpshufd $250,%xmm0,%xmm7 + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm1,%xmm1 + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpaddd %xmm6,%xmm1,%xmm1 + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + vpshufd $80,%xmm1,%xmm7 + movl %ecx,%r15d + xorl %eax,%r12d + shrdl 
$11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + vpxor %xmm7,%xmm6,%xmm6 + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + vpaddd %xmm6,%xmm1,%xmm1 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpaddd 32(%rbp),%xmm1,%xmm6 + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,16(%rsp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm0,%xmm1,%xmm7 + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm2,%xmm2 + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + vpshufd $250,%xmm1,%xmm7 + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm2,%xmm2 + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpaddd %xmm6,%xmm2,%xmm2 + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + vpshufd $80,%xmm2,%xmm7 + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + vpaddd %xmm6,%xmm2,%xmm2 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpaddd 64(%rbp),%xmm2,%xmm6 + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl 
$11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,32(%rsp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm1,%xmm2,%xmm7 + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm3,%xmm3 + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + vpshufd $250,%xmm2,%xmm7 + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm3,%xmm3 + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpaddd %xmm6,%xmm3,%xmm3 + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + vpshufd $80,%xmm3,%xmm7 + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + vpxor %xmm7,%xmm6,%xmm6 + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + vpaddd %xmm6,%xmm3,%xmm3 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpaddd 96(%rbp),%xmm3,%xmm6 + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,48(%rsp) + cmpb $0,131(%rbp) + jne .Lavx_00_47 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + shrdl 
$2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl 
%eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + addl 
%edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%rdi + movl %r14d,%eax + + addl 0(%rdi),%eax + leaq 64(%rsi),%rsi + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop_avx + + movq 64+24(%rsp),%rsi + vzeroupper + movq (%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp +.Lepilogue_avx: + .byte 0xf3,0xc3 +.size sha256_block_data_order_avx,.-sha256_block_data_order_avx #endif diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/sha/sha512-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/sha/sha512-x86_64.S index ca3a3a1644..d65743fd52 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/sha/sha512-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/sha/sha512-x86_64.S @@ -8,6 +8,17 @@ .type sha512_block_data_order,@function .align 16 sha512_block_data_order: + leaq OPENSSL_ia32cap_P(%rip),%r11 + movl 0(%r11),%r9d + movl 4(%r11),%r10d + movl 8(%r11),%r11d + testl $2048,%r10d + jnz .Lxop_shortcut + andl $1073741824,%r9d + andl $268435968,%r10d + orl %r9d,%r10d + cmpl $1342177792,%r10d + je .Lavx_shortcut pushq %rbx pushq %rbp pushq %r12 @@ -1784,4 +1795,2234 @@ K512: .quad 0x0001020304050607,0x08090a0b0c0d0e0f .quad 0x0001020304050607,0x08090a0b0c0d0e0f .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.type sha512_block_data_order_xop,@function +.align 64 +sha512_block_data_order_xop: +.Lxop_shortcut: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rsp,%r11 + shlq $4,%rdx + subq $160,%rsp + leaq (%rsi,%rdx,8),%rdx + andq $-64,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %r11,128+24(%rsp) +.Lprologue_xop: + + vzeroupper + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp .Lloop_xop +.align 16 +.Lloop_xop: + vmovdqa K512+1280(%rip),%xmm11 + vmovdqu 0(%rsi),%xmm0 + leaq K512+128(%rip),%rbp + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vpshufb %xmm11,%xmm0,%xmm0 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm11,%xmm1,%xmm1 + vmovdqu 64(%rsi),%xmm4 + vpshufb %xmm11,%xmm2,%xmm2 + vmovdqu 80(%rsi),%xmm5 + vpshufb %xmm11,%xmm3,%xmm3 + vmovdqu 96(%rsi),%xmm6 + vpshufb %xmm11,%xmm4,%xmm4 + vmovdqu 112(%rsi),%xmm7 + vpshufb %xmm11,%xmm5,%xmm5 + vpaddq -128(%rbp),%xmm0,%xmm8 + vpshufb %xmm11,%xmm6,%xmm6 + vpaddq -96(%rbp),%xmm1,%xmm9 + vpshufb %xmm11,%xmm7,%xmm7 + vpaddq 
-64(%rbp),%xmm2,%xmm10 + vpaddq -32(%rbp),%xmm3,%xmm11 + vmovdqa %xmm8,0(%rsp) + vpaddq 0(%rbp),%xmm4,%xmm8 + vmovdqa %xmm9,16(%rsp) + vpaddq 32(%rbp),%xmm5,%xmm9 + vmovdqa %xmm10,32(%rsp) + vpaddq 64(%rbp),%xmm6,%xmm10 + vmovdqa %xmm11,48(%rsp) + vpaddq 96(%rbp),%xmm7,%xmm11 + vmovdqa %xmm8,64(%rsp) + movq %rax,%r14 + vmovdqa %xmm9,80(%rsp) + movq %rbx,%rdi + vmovdqa %xmm10,96(%rsp) + xorq %rcx,%rdi + vmovdqa %xmm11,112(%rsp) + movq %r8,%r13 + jmp .Lxop_00_47 + +.align 16 +.Lxop_00_47: + addq $256,%rbp + vpalignr $8,%xmm0,%xmm1,%xmm8 + rorq $23,%r13 + movq %r14,%rax + vpalignr $8,%xmm4,%xmm5,%xmm11 + movq %r9,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r8,%r13 + xorq %r10,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rax,%r14 + vpaddq %xmm11,%xmm0,%xmm0 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 +.byte 143,72,120,195,209,7 + xorq %r10,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,223,3 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm7,%xmm10 + addq %r11,%rdx + addq %rdi,%r11 + vpaddq %xmm8,%xmm0,%xmm0 + movq %rdx,%r13 + addq %r11,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r11 + vpxor %xmm10,%xmm11,%xmm11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + vpaddq %xmm11,%xmm0,%xmm0 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + vpaddq -128(%rbp),%xmm0,%xmm10 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,0(%rsp) + vpalignr $8,%xmm1,%xmm2,%xmm8 + rorq $23,%r13 + movq %r14,%r10 + vpalignr $8,%xmm5,%xmm6,%xmm11 + movq %rdx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rcx,%r13 + xorq %r8,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r10,%r14 + vpaddq %xmm11,%xmm1,%xmm1 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + movq %r10,%r15 +.byte 143,72,120,195,209,7 + xorq %r8,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,216,3 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm0,%xmm10 + addq %r9,%rbx + addq %rdi,%r9 + vpaddq %xmm8,%xmm1,%xmm1 + movq %rbx,%r13 + addq %r9,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r9 + vpxor %xmm10,%xmm11,%xmm11 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + vpaddq %xmm11,%xmm1,%xmm1 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + vpaddq -96(%rbp),%xmm1,%xmm10 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,16(%rsp) + vpalignr $8,%xmm2,%xmm3,%xmm8 + rorq $23,%r13 + movq %r14,%r8 + vpalignr $8,%xmm6,%xmm7,%xmm11 + movq %rbx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rax,%r13 + xorq %rcx,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r8,%r14 + vpaddq %xmm11,%xmm2,%xmm2 + andq %rax,%r12 + xorq %rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 +.byte 
143,72,120,195,209,7 + xorq %rcx,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,217,3 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm1,%xmm10 + addq %rdx,%r11 + addq %rdi,%rdx + vpaddq %xmm8,%xmm2,%xmm2 + movq %r11,%r13 + addq %rdx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rdx + vpxor %xmm10,%xmm11,%xmm11 + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + vpaddq %xmm11,%xmm2,%xmm2 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + vpaddq -64(%rbp),%xmm2,%xmm10 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,32(%rsp) + vpalignr $8,%xmm3,%xmm4,%xmm8 + rorq $23,%r13 + movq %r14,%rcx + vpalignr $8,%xmm7,%xmm0,%xmm11 + movq %r11,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r10,%r13 + xorq %rax,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rcx,%r14 + vpaddq %xmm11,%xmm3,%xmm3 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 +.byte 143,72,120,195,209,7 + xorq %rax,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rdx,%r15 + addq %r12,%rbx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,218,3 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm2,%xmm10 + addq %rbx,%r9 + addq %rdi,%rbx + vpaddq %xmm8,%xmm3,%xmm3 + movq %r9,%r13 + addq %rbx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rbx + vpxor %xmm10,%xmm11,%xmm11 + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + vpaddq %xmm11,%xmm3,%xmm3 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + vpaddq -32(%rbp),%xmm3,%xmm10 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,48(%rsp) + vpalignr $8,%xmm4,%xmm5,%xmm8 + rorq $23,%r13 + movq %r14,%rax + vpalignr $8,%xmm0,%xmm1,%xmm11 + movq %r9,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r8,%r13 + xorq %r10,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rax,%r14 + vpaddq %xmm11,%xmm4,%xmm4 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 +.byte 143,72,120,195,209,7 + xorq %r10,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,219,3 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm3,%xmm10 + addq %r11,%rdx + addq %rdi,%r11 + vpaddq %xmm8,%xmm4,%xmm4 + movq %rdx,%r13 + addq %r11,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r11 + vpxor %xmm10,%xmm11,%xmm11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + vpaddq %xmm11,%xmm4,%xmm4 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + vpaddq 0(%rbp),%xmm4,%xmm10 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq 
%r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,64(%rsp) + vpalignr $8,%xmm5,%xmm6,%xmm8 + rorq $23,%r13 + movq %r14,%r10 + vpalignr $8,%xmm1,%xmm2,%xmm11 + movq %rdx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rcx,%r13 + xorq %r8,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r10,%r14 + vpaddq %xmm11,%xmm5,%xmm5 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 +.byte 143,72,120,195,209,7 + xorq %r8,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,220,3 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm4,%xmm10 + addq %r9,%rbx + addq %rdi,%r9 + vpaddq %xmm8,%xmm5,%xmm5 + movq %rbx,%r13 + addq %r9,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r9 + vpxor %xmm10,%xmm11,%xmm11 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + vpaddq %xmm11,%xmm5,%xmm5 + addq 88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + vpaddq 32(%rbp),%xmm5,%xmm10 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,80(%rsp) + vpalignr $8,%xmm6,%xmm7,%xmm8 + rorq $23,%r13 + movq %r14,%r8 + vpalignr $8,%xmm2,%xmm3,%xmm11 + movq %rbx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rax,%r13 + xorq %rcx,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r8,%r14 + vpaddq %xmm11,%xmm6,%xmm6 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq %r8,%r15 +.byte 143,72,120,195,209,7 + xorq %rcx,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,221,3 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm5,%xmm10 + addq %rdx,%r11 + addq %rdi,%rdx + vpaddq %xmm8,%xmm6,%xmm6 + movq %r11,%r13 + addq %rdx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rdx + vpxor %xmm10,%xmm11,%xmm11 + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + vpaddq %xmm11,%xmm6,%xmm6 + addq 104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + vpaddq 64(%rbp),%xmm6,%xmm10 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,96(%rsp) + vpalignr $8,%xmm7,%xmm0,%xmm8 + rorq $23,%r13 + movq %r14,%rcx + vpalignr $8,%xmm3,%xmm4,%xmm11 + movq %r11,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r10,%r13 + xorq %rax,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rcx,%r14 + vpaddq %xmm11,%xmm7,%xmm7 + andq %r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 +.byte 143,72,120,195,209,7 + xorq %rax,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rdx,%r15 + addq %r12,%rbx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,222,3 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm6,%xmm10 + addq %rbx,%r9 + addq %rdi,%rbx + vpaddq %xmm8,%xmm7,%xmm7 + movq %r9,%r13 + addq 
%rbx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rbx + vpxor %xmm10,%xmm11,%xmm11 + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + vpaddq %xmm11,%xmm7,%xmm7 + addq 120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + vpaddq 96(%rbp),%xmm7,%xmm10 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,112(%rsp) + cmpb $0,135(%rbp) + jne .Lxop_00_47 + rorq $23,%r13 + movq %r14,%rax + movq %r9,%r12 + rorq $5,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + rorq $4,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + rorq $6,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + rorq $28,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + rorq $23,%r13 + movq %r14,%r11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + rorq $23,%r13 + movq %r14,%r10 + movq %rdx,%r12 + rorq $5,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + rorq $4,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + rorq $6,%r14 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + rorq $28,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + rorq $23,%r13 + movq %r14,%r9 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + rorq $23,%r13 + movq %r14,%r8 + movq %rbx,%r12 + rorq $5,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + rorq $4,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + rorq $6,%r14 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + rorq $28,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + rorq $23,%r13 + movq %r14,%rdx + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + rorq $23,%r13 + movq %r14,%rcx + movq %r11,%r12 + rorq $5,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + rorq $4,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + rorq $6,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + rorq $14,%r13 + andq %r15,%rdi + xorq 
%rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + rorq $28,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + rorq $23,%r13 + movq %r14,%rbx + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + rorq $23,%r13 + movq %r14,%rax + movq %r9,%r12 + rorq $5,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + rorq $4,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + rorq $6,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + rorq $28,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + rorq $23,%r13 + movq %r14,%r11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + rorq $23,%r13 + movq %r14,%r10 + movq %rdx,%r12 + rorq $5,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + rorq $4,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + rorq $6,%r14 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + rorq $28,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + rorq $23,%r13 + movq %r14,%r9 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + rorq $23,%r13 + movq %r14,%r8 + movq %rbx,%r12 + rorq $5,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + rorq $4,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + rorq $6,%r14 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + rorq $28,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + rorq $23,%r13 + movq %r14,%rdx + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + rorq $23,%r13 + movq %r14,%rcx + movq %r11,%r12 + rorq $5,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + rorq $4,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + rorq $6,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + rorq $14,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + rorq $28,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + 
movq %r9,%r13 + addq %rbx,%r14 + rorq $23,%r13 + movq %r14,%rbx + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + movq 128+0(%rsp),%rdi + movq %r14,%rax + + addq 0(%rdi),%rax + leaq 128(%rsi),%rsi + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + jb .Lloop_xop + + movq 128+24(%rsp),%rsi + vzeroupper + movq (%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp +.Lepilogue_xop: + .byte 0xf3,0xc3 +.size sha512_block_data_order_xop,.-sha512_block_data_order_xop +.type sha512_block_data_order_avx,@function +.align 64 +sha512_block_data_order_avx: +.Lavx_shortcut: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rsp,%r11 + shlq $4,%rdx + subq $160,%rsp + leaq (%rsi,%rdx,8),%rdx + andq $-64,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %r11,128+24(%rsp) +.Lprologue_avx: + + vzeroupper + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp .Lloop_avx +.align 16 +.Lloop_avx: + vmovdqa K512+1280(%rip),%xmm11 + vmovdqu 0(%rsi),%xmm0 + leaq K512+128(%rip),%rbp + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vpshufb %xmm11,%xmm0,%xmm0 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm11,%xmm1,%xmm1 + vmovdqu 64(%rsi),%xmm4 + vpshufb %xmm11,%xmm2,%xmm2 + vmovdqu 80(%rsi),%xmm5 + vpshufb %xmm11,%xmm3,%xmm3 + vmovdqu 96(%rsi),%xmm6 + vpshufb %xmm11,%xmm4,%xmm4 + vmovdqu 112(%rsi),%xmm7 + vpshufb %xmm11,%xmm5,%xmm5 + vpaddq -128(%rbp),%xmm0,%xmm8 + vpshufb %xmm11,%xmm6,%xmm6 + vpaddq -96(%rbp),%xmm1,%xmm9 + vpshufb %xmm11,%xmm7,%xmm7 + vpaddq -64(%rbp),%xmm2,%xmm10 + vpaddq -32(%rbp),%xmm3,%xmm11 + vmovdqa %xmm8,0(%rsp) + vpaddq 0(%rbp),%xmm4,%xmm8 + vmovdqa %xmm9,16(%rsp) + vpaddq 32(%rbp),%xmm5,%xmm9 + vmovdqa %xmm10,32(%rsp) + vpaddq 64(%rbp),%xmm6,%xmm10 + vmovdqa %xmm11,48(%rsp) + vpaddq 96(%rbp),%xmm7,%xmm11 + vmovdqa %xmm8,64(%rsp) + movq %rax,%r14 + vmovdqa %xmm9,80(%rsp) + movq %rbx,%rdi + vmovdqa %xmm10,96(%rsp) + xorq %rcx,%rdi + vmovdqa %xmm11,112(%rsp) + movq %r8,%r13 + jmp .Lavx_00_47 + +.align 16 +.Lavx_00_47: + addq $256,%rbp + vpalignr $8,%xmm0,%xmm1,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rax + vpalignr $8,%xmm4,%xmm5,%xmm11 + movq %r9,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r8,%r13 + xorq %r10,%r12 + vpaddq %xmm11,%xmm0,%xmm0 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r8,%r12 + xorq %r8,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 0(%rsp),%r11 + movq %rax,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rbx,%r15 + addq %r12,%r11 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + shrdq 
$28,%r14,%r14 + vpsrlq $6,%xmm7,%xmm11 + addq %r11,%rdx + addq %rdi,%r11 + vpxor %xmm9,%xmm8,%xmm8 + movq %rdx,%r13 + addq %r11,%r14 + vpsllq $3,%xmm7,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r11 + vpaddq %xmm8,%xmm0,%xmm0 + movq %r8,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm7,%xmm9 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rdx,%r12 + xorq %rdx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 8(%rsp),%r10 + movq %r11,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r9,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rax,%rdi + addq %r12,%r10 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm0,%xmm0 + xorq %r11,%r14 + addq %r13,%r10 + vpaddq -128(%rbp),%xmm0,%xmm10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,0(%rsp) + vpalignr $8,%xmm1,%xmm2,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r10 + vpalignr $8,%xmm5,%xmm6,%xmm11 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rcx,%r13 + xorq %r8,%r12 + vpaddq %xmm11,%xmm1,%xmm1 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rcx,%r12 + xorq %rcx,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 16(%rsp),%r9 + movq %r10,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r11,%r15 + addq %r12,%r9 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm0,%xmm11 + addq %r9,%rbx + addq %rdi,%r9 + vpxor %xmm9,%xmm8,%xmm8 + movq %rbx,%r13 + addq %r9,%r14 + vpsllq $3,%xmm0,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r9 + vpaddq %xmm8,%xmm1,%xmm1 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm0,%xmm9 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rbx,%r12 + xorq %rbx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 24(%rsp),%r8 + movq %r9,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r10,%rdi + addq %r12,%r8 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm1,%xmm1 + xorq %r9,%r14 + addq %r13,%r8 + vpaddq -96(%rbp),%xmm1,%xmm10 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,16(%rsp) + vpalignr $8,%xmm2,%xmm3,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r8 + vpalignr $8,%xmm6,%xmm7,%xmm11 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rax,%r13 + xorq %rcx,%r12 + vpaddq %xmm11,%xmm2,%xmm2 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rax,%r12 + xorq %rax,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 32(%rsp),%rdx + movq %r8,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r9,%r15 + addq %r12,%rdx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm1,%xmm11 + addq %rdx,%r11 + addq %rdi,%rdx + vpxor %xmm9,%xmm8,%xmm8 + movq %r11,%r13 + addq %rdx,%r14 + vpsllq $3,%xmm1,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rdx + vpaddq %xmm8,%xmm2,%xmm2 + movq %rax,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm1,%xmm9 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor 
%xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r11,%r12 + xorq %r11,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 40(%rsp),%rcx + movq %rdx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r8,%rdi + addq %r12,%rcx + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm2,%xmm2 + xorq %rdx,%r14 + addq %r13,%rcx + vpaddq -64(%rbp),%xmm2,%xmm10 + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,32(%rsp) + vpalignr $8,%xmm3,%xmm4,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rcx + vpalignr $8,%xmm7,%xmm0,%xmm11 + movq %r11,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r10,%r13 + xorq %rax,%r12 + vpaddq %xmm11,%xmm3,%xmm3 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r10,%r12 + xorq %r10,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 48(%rsp),%rbx + movq %rcx,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rdx,%r15 + addq %r12,%rbx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm2,%xmm11 + addq %rbx,%r9 + addq %rdi,%rbx + vpxor %xmm9,%xmm8,%xmm8 + movq %r9,%r13 + addq %rbx,%r14 + vpsllq $3,%xmm2,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rbx + vpaddq %xmm8,%xmm3,%xmm3 + movq %r10,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm2,%xmm9 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r9,%r12 + xorq %r9,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 56(%rsp),%rax + movq %rbx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r11,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rcx,%rdi + addq %r12,%rax + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm3,%xmm3 + xorq %rbx,%r14 + addq %r13,%rax + vpaddq -32(%rbp),%xmm3,%xmm10 + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,48(%rsp) + vpalignr $8,%xmm4,%xmm5,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rax + vpalignr $8,%xmm0,%xmm1,%xmm11 + movq %r9,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r8,%r13 + xorq %r10,%r12 + vpaddq %xmm11,%xmm4,%xmm4 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r8,%r12 + xorq %r8,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 64(%rsp),%r11 + movq %rax,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rbx,%r15 + addq %r12,%r11 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm3,%xmm11 + addq %r11,%rdx + addq %rdi,%r11 + vpxor %xmm9,%xmm8,%xmm8 + movq %rdx,%r13 + addq %r11,%r14 + vpsllq $3,%xmm3,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r11 + vpaddq %xmm8,%xmm4,%xmm4 + movq %r8,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm3,%xmm9 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rdx,%r12 + xorq %rdx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 72(%rsp),%r10 + movq %r11,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r9,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rax,%rdi + addq %r12,%r10 + vpxor 
%xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm4,%xmm4 + xorq %r11,%r14 + addq %r13,%r10 + vpaddq 0(%rbp),%xmm4,%xmm10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,64(%rsp) + vpalignr $8,%xmm5,%xmm6,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r10 + vpalignr $8,%xmm1,%xmm2,%xmm11 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rcx,%r13 + xorq %r8,%r12 + vpaddq %xmm11,%xmm5,%xmm5 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rcx,%r12 + xorq %rcx,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 80(%rsp),%r9 + movq %r10,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r11,%r15 + addq %r12,%r9 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm4,%xmm11 + addq %r9,%rbx + addq %rdi,%r9 + vpxor %xmm9,%xmm8,%xmm8 + movq %rbx,%r13 + addq %r9,%r14 + vpsllq $3,%xmm4,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r9 + vpaddq %xmm8,%xmm5,%xmm5 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm4,%xmm9 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rbx,%r12 + xorq %rbx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 88(%rsp),%r8 + movq %r9,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r10,%rdi + addq %r12,%r8 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm5,%xmm5 + xorq %r9,%r14 + addq %r13,%r8 + vpaddq 32(%rbp),%xmm5,%xmm10 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,80(%rsp) + vpalignr $8,%xmm6,%xmm7,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r8 + vpalignr $8,%xmm2,%xmm3,%xmm11 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rax,%r13 + xorq %rcx,%r12 + vpaddq %xmm11,%xmm6,%xmm6 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rax,%r12 + xorq %rax,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 96(%rsp),%rdx + movq %r8,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r9,%r15 + addq %r12,%rdx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm5,%xmm11 + addq %rdx,%r11 + addq %rdi,%rdx + vpxor %xmm9,%xmm8,%xmm8 + movq %r11,%r13 + addq %rdx,%r14 + vpsllq $3,%xmm5,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rdx + vpaddq %xmm8,%xmm6,%xmm6 + movq %rax,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm5,%xmm9 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r11,%r12 + xorq %r11,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 104(%rsp),%rcx + movq %rdx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r8,%rdi + addq %r12,%rcx + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm6,%xmm6 + xorq %rdx,%r14 + addq %r13,%rcx + vpaddq 64(%rbp),%xmm6,%xmm10 + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,96(%rsp) + vpalignr $8,%xmm7,%xmm0,%xmm8 + shrdq 
$23,%r13,%r13 + movq %r14,%rcx + vpalignr $8,%xmm3,%xmm4,%xmm11 + movq %r11,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r10,%r13 + xorq %rax,%r12 + vpaddq %xmm11,%xmm7,%xmm7 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r10,%r12 + xorq %r10,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 112(%rsp),%rbx + movq %rcx,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rdx,%r15 + addq %r12,%rbx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm6,%xmm11 + addq %rbx,%r9 + addq %rdi,%rbx + vpxor %xmm9,%xmm8,%xmm8 + movq %r9,%r13 + addq %rbx,%r14 + vpsllq $3,%xmm6,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rbx + vpaddq %xmm8,%xmm7,%xmm7 + movq %r10,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm6,%xmm9 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r9,%r12 + xorq %r9,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 120(%rsp),%rax + movq %rbx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r11,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rcx,%rdi + addq %r12,%rax + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm7,%xmm7 + xorq %rbx,%r14 + addq %r13,%rax + vpaddq 96(%rbp),%xmm7,%xmm10 + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,112(%rsp) + cmpb $0,135(%rbp) + jne .Lavx_00_47 + shrdq $23,%r13,%r13 + movq %r14,%rax + movq %r9,%r12 + shrdq $5,%r14,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r11 + movq %r8,%r12 + shrdq $5,%r14,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + shrdq $6,%r14,%r14 + xorq %rax,%rdi + addq %r12,%r10 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r10 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + xorq %r11,%r15 + addq %r12,%r9 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r9 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + xorq %r10,%rdi + addq %r12,%r8 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r8 + movq %rbx,%r12 + shrdq 
$5,%r14,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + xorq %r9,%r15 + addq %r12,%rdx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + shrdq $28,%r14,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rdx + movq %rax,%r12 + shrdq $5,%r14,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + xorq %r8,%rdi + addq %r12,%rcx + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rcx + movq %r11,%r12 + shrdq $5,%r14,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rbx + movq %r10,%r12 + shrdq $5,%r14,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + shrdq $6,%r14,%r14 + xorq %rcx,%rdi + addq %r12,%rax + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rax + movq %r9,%r12 + shrdq $5,%r14,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r11 + movq %r8,%r12 + shrdq $5,%r14,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + shrdq $6,%r14,%r14 + xorq %rax,%rdi + addq %r12,%r10 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r10 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + xorq %r11,%r15 + addq %r12,%r9 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r9 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + xorq %r10,%rdi + addq %r12,%r8 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq 
%r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r8 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + xorq %r9,%r15 + addq %r12,%rdx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + shrdq $28,%r14,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rdx + movq %rax,%r12 + shrdq $5,%r14,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + xorq %r8,%rdi + addq %r12,%rcx + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rcx + movq %r11,%r12 + shrdq $5,%r14,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rbx + movq %r10,%r12 + shrdq $5,%r14,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + shrdq $6,%r14,%r14 + xorq %rcx,%rdi + addq %r12,%rax + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + movq 128+0(%rsp),%rdi + movq %r14,%rax + + addq 0(%rdi),%rax + leaq 128(%rsi),%rsi + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + jb .Lloop_avx + + movq 128+24(%rsp),%rsi + vzeroupper + movq (%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp +.Lepilogue_avx: + .byte 0xf3,0xc3 +.size sha512_block_data_order_avx,.-sha512_block_data_order_avx #endif diff --git a/packager/third_party/boringssl/mac-x86/crypto/chacha/chacha-x86.S b/packager/third_party/boringssl/mac-x86/crypto/chacha/chacha-x86.S new file mode 100644 index 0000000000..5de98a3d61 --- /dev/null +++ b/packager/third_party/boringssl/mac-x86/crypto/chacha/chacha-x86.S @@ -0,0 +1,969 @@ +#if defined(__i386__) +.file "chacha-x86.S" +.text +.globl _ChaCha20_ctr32 +.private_extern _ChaCha20_ctr32 +.align 4 +_ChaCha20_ctr32: +L_ChaCha20_ctr32_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + xorl %eax,%eax + cmpl 28(%esp),%eax + je L000no_data + call Lpic_point +Lpic_point: + popl %eax + movl L_OPENSSL_ia32cap_P$non_lazy_ptr-Lpic_point(%eax),%ebp + testl $16777216,(%ebp) + jz L001x86 + testl $512,4(%ebp) + jz L001x86 + jmp Lssse3_shortcut +L001x86: + movl 32(%esp),%esi + movl 
36(%esp),%edi + subl $132,%esp + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl %eax,80(%esp) + movl %ebx,84(%esp) + movl %ecx,88(%esp) + movl %edx,92(%esp) + movl 16(%esi),%eax + movl 20(%esi),%ebx + movl 24(%esi),%ecx + movl 28(%esi),%edx + movl %eax,96(%esp) + movl %ebx,100(%esp) + movl %ecx,104(%esp) + movl %edx,108(%esp) + movl (%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx + subl $1,%eax + movl %eax,112(%esp) + movl %ebx,116(%esp) + movl %ecx,120(%esp) + movl %edx,124(%esp) + jmp L002entry +.align 4,0x90 +L003outer_loop: + movl %ebx,156(%esp) + movl %eax,152(%esp) + movl %ecx,160(%esp) +L002entry: + movl $1634760805,%eax + movl $857760878,4(%esp) + movl $2036477234,8(%esp) + movl $1797285236,12(%esp) + movl 84(%esp),%ebx + movl 88(%esp),%ebp + movl 104(%esp),%ecx + movl 108(%esp),%esi + movl 116(%esp),%edx + movl 120(%esp),%edi + movl %ebx,20(%esp) + movl %ebp,24(%esp) + movl %ecx,40(%esp) + movl %esi,44(%esp) + movl %edx,52(%esp) + movl %edi,56(%esp) + movl 92(%esp),%ebx + movl 124(%esp),%edi + movl 112(%esp),%edx + movl 80(%esp),%ebp + movl 96(%esp),%ecx + movl 100(%esp),%esi + addl $1,%edx + movl %ebx,28(%esp) + movl %edi,60(%esp) + movl %edx,112(%esp) + movl $10,%ebx + jmp L004loop +.align 4,0x90 +L004loop: + addl %ebp,%eax + movl %ebx,128(%esp) + movl %ebp,%ebx + xorl %eax,%edx + roll $16,%edx + addl %edx,%ecx + xorl %ecx,%ebx + movl 52(%esp),%edi + roll $12,%ebx + movl 20(%esp),%ebp + addl %ebx,%eax + xorl %eax,%edx + movl %eax,(%esp) + roll $8,%edx + movl 4(%esp),%eax + addl %edx,%ecx + movl %edx,48(%esp) + xorl %ecx,%ebx + addl %ebp,%eax + roll $7,%ebx + xorl %eax,%edi + movl %ecx,32(%esp) + roll $16,%edi + movl %ebx,16(%esp) + addl %edi,%esi + movl 40(%esp),%ecx + xorl %esi,%ebp + movl 56(%esp),%edx + roll $12,%ebp + movl 24(%esp),%ebx + addl %ebp,%eax + xorl %eax,%edi + movl %eax,4(%esp) + roll $8,%edi + movl 8(%esp),%eax + addl %edi,%esi + movl %edi,52(%esp) + xorl %esi,%ebp + addl %ebx,%eax + roll $7,%ebp + xorl %eax,%edx + movl %esi,36(%esp) + roll $16,%edx + movl %ebp,20(%esp) + addl %edx,%ecx + movl 44(%esp),%esi + xorl %ecx,%ebx + movl 60(%esp),%edi + roll $12,%ebx + movl 28(%esp),%ebp + addl %ebx,%eax + xorl %eax,%edx + movl %eax,8(%esp) + roll $8,%edx + movl 12(%esp),%eax + addl %edx,%ecx + movl %edx,56(%esp) + xorl %ecx,%ebx + addl %ebp,%eax + roll $7,%ebx + xorl %eax,%edi + roll $16,%edi + movl %ebx,24(%esp) + addl %edi,%esi + xorl %esi,%ebp + roll $12,%ebp + movl 20(%esp),%ebx + addl %ebp,%eax + xorl %eax,%edi + movl %eax,12(%esp) + roll $8,%edi + movl (%esp),%eax + addl %edi,%esi + movl %edi,%edx + xorl %esi,%ebp + addl %ebx,%eax + roll $7,%ebp + xorl %eax,%edx + roll $16,%edx + movl %ebp,28(%esp) + addl %edx,%ecx + xorl %ecx,%ebx + movl 48(%esp),%edi + roll $12,%ebx + movl 24(%esp),%ebp + addl %ebx,%eax + xorl %eax,%edx + movl %eax,(%esp) + roll $8,%edx + movl 4(%esp),%eax + addl %edx,%ecx + movl %edx,60(%esp) + xorl %ecx,%ebx + addl %ebp,%eax + roll $7,%ebx + xorl %eax,%edi + movl %ecx,40(%esp) + roll $16,%edi + movl %ebx,20(%esp) + addl %edi,%esi + movl 32(%esp),%ecx + xorl %esi,%ebp + movl 52(%esp),%edx + roll $12,%ebp + movl 28(%esp),%ebx + addl %ebp,%eax + xorl %eax,%edi + movl %eax,4(%esp) + roll $8,%edi + movl 8(%esp),%eax + addl %edi,%esi + movl %edi,48(%esp) + xorl %esi,%ebp + addl %ebx,%eax + roll $7,%ebp + xorl %eax,%edx + movl %esi,44(%esp) + roll $16,%edx + movl %ebp,24(%esp) + addl %edx,%ecx + movl 36(%esp),%esi + xorl %ecx,%ebx + movl 56(%esp),%edi + roll $12,%ebx + movl 
16(%esp),%ebp + addl %ebx,%eax + xorl %eax,%edx + movl %eax,8(%esp) + roll $8,%edx + movl 12(%esp),%eax + addl %edx,%ecx + movl %edx,52(%esp) + xorl %ecx,%ebx + addl %ebp,%eax + roll $7,%ebx + xorl %eax,%edi + roll $16,%edi + movl %ebx,28(%esp) + addl %edi,%esi + xorl %esi,%ebp + movl 48(%esp),%edx + roll $12,%ebp + movl 128(%esp),%ebx + addl %ebp,%eax + xorl %eax,%edi + movl %eax,12(%esp) + roll $8,%edi + movl (%esp),%eax + addl %edi,%esi + movl %edi,56(%esp) + xorl %esi,%ebp + roll $7,%ebp + decl %ebx + jnz L004loop + movl 160(%esp),%ebx + addl $1634760805,%eax + addl 80(%esp),%ebp + addl 96(%esp),%ecx + addl 100(%esp),%esi + cmpl $64,%ebx + jb L005tail + movl 156(%esp),%ebx + addl 112(%esp),%edx + addl 120(%esp),%edi + xorl (%ebx),%eax + xorl 16(%ebx),%ebp + movl %eax,(%esp) + movl 152(%esp),%eax + xorl 32(%ebx),%ecx + xorl 36(%ebx),%esi + xorl 48(%ebx),%edx + xorl 56(%ebx),%edi + movl %ebp,16(%eax) + movl %ecx,32(%eax) + movl %esi,36(%eax) + movl %edx,48(%eax) + movl %edi,56(%eax) + movl 4(%esp),%ebp + movl 8(%esp),%ecx + movl 12(%esp),%esi + movl 20(%esp),%edx + movl 24(%esp),%edi + addl $857760878,%ebp + addl $2036477234,%ecx + addl $1797285236,%esi + addl 84(%esp),%edx + addl 88(%esp),%edi + xorl 4(%ebx),%ebp + xorl 8(%ebx),%ecx + xorl 12(%ebx),%esi + xorl 20(%ebx),%edx + xorl 24(%ebx),%edi + movl %ebp,4(%eax) + movl %ecx,8(%eax) + movl %esi,12(%eax) + movl %edx,20(%eax) + movl %edi,24(%eax) + movl 28(%esp),%ebp + movl 40(%esp),%ecx + movl 44(%esp),%esi + movl 52(%esp),%edx + movl 60(%esp),%edi + addl 92(%esp),%ebp + addl 104(%esp),%ecx + addl 108(%esp),%esi + addl 116(%esp),%edx + addl 124(%esp),%edi + xorl 28(%ebx),%ebp + xorl 40(%ebx),%ecx + xorl 44(%ebx),%esi + xorl 52(%ebx),%edx + xorl 60(%ebx),%edi + leal 64(%ebx),%ebx + movl %ebp,28(%eax) + movl (%esp),%ebp + movl %ecx,40(%eax) + movl 160(%esp),%ecx + movl %esi,44(%eax) + movl %edx,52(%eax) + movl %edi,60(%eax) + movl %ebp,(%eax) + leal 64(%eax),%eax + subl $64,%ecx + jnz L003outer_loop + jmp L006done +L005tail: + addl 112(%esp),%edx + addl 120(%esp),%edi + movl %eax,(%esp) + movl %ebp,16(%esp) + movl %ecx,32(%esp) + movl %esi,36(%esp) + movl %edx,48(%esp) + movl %edi,56(%esp) + movl 4(%esp),%ebp + movl 8(%esp),%ecx + movl 12(%esp),%esi + movl 20(%esp),%edx + movl 24(%esp),%edi + addl $857760878,%ebp + addl $2036477234,%ecx + addl $1797285236,%esi + addl 84(%esp),%edx + addl 88(%esp),%edi + movl %ebp,4(%esp) + movl %ecx,8(%esp) + movl %esi,12(%esp) + movl %edx,20(%esp) + movl %edi,24(%esp) + movl 28(%esp),%ebp + movl 40(%esp),%ecx + movl 44(%esp),%esi + movl 52(%esp),%edx + movl 60(%esp),%edi + addl 92(%esp),%ebp + addl 104(%esp),%ecx + addl 108(%esp),%esi + addl 116(%esp),%edx + addl 124(%esp),%edi + movl %ebp,28(%esp) + movl 156(%esp),%ebp + movl %ecx,40(%esp) + movl 152(%esp),%ecx + movl %esi,44(%esp) + xorl %esi,%esi + movl %edx,52(%esp) + movl %edi,60(%esp) + xorl %eax,%eax + xorl %edx,%edx +L007tail_loop: + movb (%esi,%ebp,1),%al + movb (%esp,%esi,1),%dl + leal 1(%esi),%esi + xorb %dl,%al + movb %al,-1(%ecx,%esi,1) + decl %ebx + jnz L007tail_loop +L006done: + addl $132,%esp +L000no_data: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _ChaCha20_ssse3 +.private_extern _ChaCha20_ssse3 +.align 4 +_ChaCha20_ssse3: +L_ChaCha20_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi +Lssse3_shortcut: + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%ecx + movl 32(%esp),%edx + movl 36(%esp),%ebx + movl %esp,%ebp + subl $524,%esp + andl $-64,%esp + movl %ebp,512(%esp) + leal 
Lssse3_data-Lpic_point(%eax),%eax + movdqu (%ebx),%xmm3 + cmpl $256,%ecx + jb L0081x + movl %edx,516(%esp) + movl %ebx,520(%esp) + subl $256,%ecx + leal 384(%esp),%ebp + movdqu (%edx),%xmm7 + pshufd $0,%xmm3,%xmm0 + pshufd $85,%xmm3,%xmm1 + pshufd $170,%xmm3,%xmm2 + pshufd $255,%xmm3,%xmm3 + paddd 48(%eax),%xmm0 + pshufd $0,%xmm7,%xmm4 + pshufd $85,%xmm7,%xmm5 + psubd 64(%eax),%xmm0 + pshufd $170,%xmm7,%xmm6 + pshufd $255,%xmm7,%xmm7 + movdqa %xmm0,64(%ebp) + movdqa %xmm1,80(%ebp) + movdqa %xmm2,96(%ebp) + movdqa %xmm3,112(%ebp) + movdqu 16(%edx),%xmm3 + movdqa %xmm4,-64(%ebp) + movdqa %xmm5,-48(%ebp) + movdqa %xmm6,-32(%ebp) + movdqa %xmm7,-16(%ebp) + movdqa 32(%eax),%xmm7 + leal 128(%esp),%ebx + pshufd $0,%xmm3,%xmm0 + pshufd $85,%xmm3,%xmm1 + pshufd $170,%xmm3,%xmm2 + pshufd $255,%xmm3,%xmm3 + pshufd $0,%xmm7,%xmm4 + pshufd $85,%xmm7,%xmm5 + pshufd $170,%xmm7,%xmm6 + pshufd $255,%xmm7,%xmm7 + movdqa %xmm0,(%ebp) + movdqa %xmm1,16(%ebp) + movdqa %xmm2,32(%ebp) + movdqa %xmm3,48(%ebp) + movdqa %xmm4,-128(%ebp) + movdqa %xmm5,-112(%ebp) + movdqa %xmm6,-96(%ebp) + movdqa %xmm7,-80(%ebp) + leal 128(%esi),%esi + leal 128(%edi),%edi + jmp L009outer_loop +.align 4,0x90 +L009outer_loop: + movdqa -112(%ebp),%xmm1 + movdqa -96(%ebp),%xmm2 + movdqa -80(%ebp),%xmm3 + movdqa -48(%ebp),%xmm5 + movdqa -32(%ebp),%xmm6 + movdqa -16(%ebp),%xmm7 + movdqa %xmm1,-112(%ebx) + movdqa %xmm2,-96(%ebx) + movdqa %xmm3,-80(%ebx) + movdqa %xmm5,-48(%ebx) + movdqa %xmm6,-32(%ebx) + movdqa %xmm7,-16(%ebx) + movdqa 32(%ebp),%xmm2 + movdqa 48(%ebp),%xmm3 + movdqa 64(%ebp),%xmm4 + movdqa 80(%ebp),%xmm5 + movdqa 96(%ebp),%xmm6 + movdqa 112(%ebp),%xmm7 + paddd 64(%eax),%xmm4 + movdqa %xmm2,32(%ebx) + movdqa %xmm3,48(%ebx) + movdqa %xmm4,64(%ebx) + movdqa %xmm5,80(%ebx) + movdqa %xmm6,96(%ebx) + movdqa %xmm7,112(%ebx) + movdqa %xmm4,64(%ebp) + movdqa -128(%ebp),%xmm0 + movdqa %xmm4,%xmm6 + movdqa -64(%ebp),%xmm3 + movdqa (%ebp),%xmm4 + movdqa 16(%ebp),%xmm5 + movl $10,%edx + nop +.align 4,0x90 +L010loop: + paddd %xmm3,%xmm0 + movdqa %xmm3,%xmm2 + pxor %xmm0,%xmm6 + pshufb (%eax),%xmm6 + paddd %xmm6,%xmm4 + pxor %xmm4,%xmm2 + movdqa -48(%ebx),%xmm3 + movdqa %xmm2,%xmm1 + pslld $12,%xmm2 + psrld $20,%xmm1 + por %xmm1,%xmm2 + movdqa -112(%ebx),%xmm1 + paddd %xmm2,%xmm0 + movdqa 80(%ebx),%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm0,-128(%ebx) + pshufb 16(%eax),%xmm6 + paddd %xmm6,%xmm4 + movdqa %xmm6,64(%ebx) + pxor %xmm4,%xmm2 + paddd %xmm3,%xmm1 + movdqa %xmm2,%xmm0 + pslld $7,%xmm2 + psrld $25,%xmm0 + pxor %xmm1,%xmm7 + por %xmm0,%xmm2 + movdqa %xmm4,(%ebx) + pshufb (%eax),%xmm7 + movdqa %xmm2,-64(%ebx) + paddd %xmm7,%xmm5 + movdqa 32(%ebx),%xmm4 + pxor %xmm5,%xmm3 + movdqa -32(%ebx),%xmm2 + movdqa %xmm3,%xmm0 + pslld $12,%xmm3 + psrld $20,%xmm0 + por %xmm0,%xmm3 + movdqa -96(%ebx),%xmm0 + paddd %xmm3,%xmm1 + movdqa 96(%ebx),%xmm6 + pxor %xmm1,%xmm7 + movdqa %xmm1,-112(%ebx) + pshufb 16(%eax),%xmm7 + paddd %xmm7,%xmm5 + movdqa %xmm7,80(%ebx) + pxor %xmm5,%xmm3 + paddd %xmm2,%xmm0 + movdqa %xmm3,%xmm1 + pslld $7,%xmm3 + psrld $25,%xmm1 + pxor %xmm0,%xmm6 + por %xmm1,%xmm3 + movdqa %xmm5,16(%ebx) + pshufb (%eax),%xmm6 + movdqa %xmm3,-48(%ebx) + paddd %xmm6,%xmm4 + movdqa 48(%ebx),%xmm5 + pxor %xmm4,%xmm2 + movdqa -16(%ebx),%xmm3 + movdqa %xmm2,%xmm1 + pslld $12,%xmm2 + psrld $20,%xmm1 + por %xmm1,%xmm2 + movdqa -80(%ebx),%xmm1 + paddd %xmm2,%xmm0 + movdqa 112(%ebx),%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm0,-96(%ebx) + pshufb 16(%eax),%xmm6 + paddd %xmm6,%xmm4 + movdqa %xmm6,96(%ebx) + pxor %xmm4,%xmm2 + paddd %xmm3,%xmm1 + movdqa 
%xmm2,%xmm0 + pslld $7,%xmm2 + psrld $25,%xmm0 + pxor %xmm1,%xmm7 + por %xmm0,%xmm2 + pshufb (%eax),%xmm7 + movdqa %xmm2,-32(%ebx) + paddd %xmm7,%xmm5 + pxor %xmm5,%xmm3 + movdqa -48(%ebx),%xmm2 + movdqa %xmm3,%xmm0 + pslld $12,%xmm3 + psrld $20,%xmm0 + por %xmm0,%xmm3 + movdqa -128(%ebx),%xmm0 + paddd %xmm3,%xmm1 + pxor %xmm1,%xmm7 + movdqa %xmm1,-80(%ebx) + pshufb 16(%eax),%xmm7 + paddd %xmm7,%xmm5 + movdqa %xmm7,%xmm6 + pxor %xmm5,%xmm3 + paddd %xmm2,%xmm0 + movdqa %xmm3,%xmm1 + pslld $7,%xmm3 + psrld $25,%xmm1 + pxor %xmm0,%xmm6 + por %xmm1,%xmm3 + pshufb (%eax),%xmm6 + movdqa %xmm3,-16(%ebx) + paddd %xmm6,%xmm4 + pxor %xmm4,%xmm2 + movdqa -32(%ebx),%xmm3 + movdqa %xmm2,%xmm1 + pslld $12,%xmm2 + psrld $20,%xmm1 + por %xmm1,%xmm2 + movdqa -112(%ebx),%xmm1 + paddd %xmm2,%xmm0 + movdqa 64(%ebx),%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm0,-128(%ebx) + pshufb 16(%eax),%xmm6 + paddd %xmm6,%xmm4 + movdqa %xmm6,112(%ebx) + pxor %xmm4,%xmm2 + paddd %xmm3,%xmm1 + movdqa %xmm2,%xmm0 + pslld $7,%xmm2 + psrld $25,%xmm0 + pxor %xmm1,%xmm7 + por %xmm0,%xmm2 + movdqa %xmm4,32(%ebx) + pshufb (%eax),%xmm7 + movdqa %xmm2,-48(%ebx) + paddd %xmm7,%xmm5 + movdqa (%ebx),%xmm4 + pxor %xmm5,%xmm3 + movdqa -16(%ebx),%xmm2 + movdqa %xmm3,%xmm0 + pslld $12,%xmm3 + psrld $20,%xmm0 + por %xmm0,%xmm3 + movdqa -96(%ebx),%xmm0 + paddd %xmm3,%xmm1 + movdqa 80(%ebx),%xmm6 + pxor %xmm1,%xmm7 + movdqa %xmm1,-112(%ebx) + pshufb 16(%eax),%xmm7 + paddd %xmm7,%xmm5 + movdqa %xmm7,64(%ebx) + pxor %xmm5,%xmm3 + paddd %xmm2,%xmm0 + movdqa %xmm3,%xmm1 + pslld $7,%xmm3 + psrld $25,%xmm1 + pxor %xmm0,%xmm6 + por %xmm1,%xmm3 + movdqa %xmm5,48(%ebx) + pshufb (%eax),%xmm6 + movdqa %xmm3,-32(%ebx) + paddd %xmm6,%xmm4 + movdqa 16(%ebx),%xmm5 + pxor %xmm4,%xmm2 + movdqa -64(%ebx),%xmm3 + movdqa %xmm2,%xmm1 + pslld $12,%xmm2 + psrld $20,%xmm1 + por %xmm1,%xmm2 + movdqa -80(%ebx),%xmm1 + paddd %xmm2,%xmm0 + movdqa 96(%ebx),%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm0,-96(%ebx) + pshufb 16(%eax),%xmm6 + paddd %xmm6,%xmm4 + movdqa %xmm6,80(%ebx) + pxor %xmm4,%xmm2 + paddd %xmm3,%xmm1 + movdqa %xmm2,%xmm0 + pslld $7,%xmm2 + psrld $25,%xmm0 + pxor %xmm1,%xmm7 + por %xmm0,%xmm2 + pshufb (%eax),%xmm7 + movdqa %xmm2,-16(%ebx) + paddd %xmm7,%xmm5 + pxor %xmm5,%xmm3 + movdqa %xmm3,%xmm0 + pslld $12,%xmm3 + psrld $20,%xmm0 + por %xmm0,%xmm3 + movdqa -128(%ebx),%xmm0 + paddd %xmm3,%xmm1 + movdqa 64(%ebx),%xmm6 + pxor %xmm1,%xmm7 + movdqa %xmm1,-80(%ebx) + pshufb 16(%eax),%xmm7 + paddd %xmm7,%xmm5 + movdqa %xmm7,96(%ebx) + pxor %xmm5,%xmm3 + movdqa %xmm3,%xmm1 + pslld $7,%xmm3 + psrld $25,%xmm1 + por %xmm1,%xmm3 + decl %edx + jnz L010loop + movdqa %xmm3,-64(%ebx) + movdqa %xmm4,(%ebx) + movdqa %xmm5,16(%ebx) + movdqa %xmm6,64(%ebx) + movdqa %xmm7,96(%ebx) + movdqa -112(%ebx),%xmm1 + movdqa -96(%ebx),%xmm2 + movdqa -80(%ebx),%xmm3 + paddd -128(%ebp),%xmm0 + paddd -112(%ebp),%xmm1 + paddd -96(%ebp),%xmm2 + paddd -80(%ebp),%xmm3 + movdqa %xmm0,%xmm6 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm6 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm6,%xmm3 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + movdqu -128(%esi),%xmm4 + movdqu -64(%esi),%xmm5 + movdqu (%esi),%xmm2 + movdqu 64(%esi),%xmm7 + leal 16(%esi),%esi + pxor %xmm0,%xmm4 + movdqa -64(%ebx),%xmm0 + pxor %xmm1,%xmm5 + movdqa -48(%ebx),%xmm1 + pxor %xmm2,%xmm6 + movdqa -32(%ebx),%xmm2 + pxor %xmm3,%xmm7 + movdqa -16(%ebx),%xmm3 + movdqu %xmm4,-128(%edi) + movdqu %xmm5,-64(%edi) + movdqu %xmm6,(%edi) 
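For readers tracing the generated ChaCha20 code: the roll $16 / roll $12 / roll $8 / roll $7 sequences in the scalar loop above, and the pslld/psrld/por pairs plus the pshufb byte-shuffle masks (Lssse3_data) in this SSSE3 loop, all implement the same ChaCha20 quarter round, and the words 1634760805, 857760878, 2036477234 and 1797285236 loaded at entry are the standard "expand 32-byte k" sigma constants. A minimal C sketch of the quarter round the assembly interleaves (function and variable names are illustrative, not from the patch):

    #include <stdint.h>

    static inline uint32_t rotl32(uint32_t v, int n) {
        return (v << n) | (v >> (32 - n));
    }

    /* One ChaCha20 quarter round over the 16-word state; chacha-x86.S keeps
     * the state in stack slots and interleaves several of these, and the
     * SSSE3 path runs four blocks at once, but the data flow is identical. */
    static void quarter_round(uint32_t x[16], int a, int b, int c, int d) {
        x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 16);
        x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 12);
        x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 8);
        x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 7);
    }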
+ movdqu %xmm7,64(%edi) + leal 16(%edi),%edi + paddd -64(%ebp),%xmm0 + paddd -48(%ebp),%xmm1 + paddd -32(%ebp),%xmm2 + paddd -16(%ebp),%xmm3 + movdqa %xmm0,%xmm6 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm6 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm6,%xmm3 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + movdqu -128(%esi),%xmm4 + movdqu -64(%esi),%xmm5 + movdqu (%esi),%xmm2 + movdqu 64(%esi),%xmm7 + leal 16(%esi),%esi + pxor %xmm0,%xmm4 + movdqa (%ebx),%xmm0 + pxor %xmm1,%xmm5 + movdqa 16(%ebx),%xmm1 + pxor %xmm2,%xmm6 + movdqa 32(%ebx),%xmm2 + pxor %xmm3,%xmm7 + movdqa 48(%ebx),%xmm3 + movdqu %xmm4,-128(%edi) + movdqu %xmm5,-64(%edi) + movdqu %xmm6,(%edi) + movdqu %xmm7,64(%edi) + leal 16(%edi),%edi + paddd (%ebp),%xmm0 + paddd 16(%ebp),%xmm1 + paddd 32(%ebp),%xmm2 + paddd 48(%ebp),%xmm3 + movdqa %xmm0,%xmm6 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm6 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm6,%xmm3 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + movdqu -128(%esi),%xmm4 + movdqu -64(%esi),%xmm5 + movdqu (%esi),%xmm2 + movdqu 64(%esi),%xmm7 + leal 16(%esi),%esi + pxor %xmm0,%xmm4 + movdqa 64(%ebx),%xmm0 + pxor %xmm1,%xmm5 + movdqa 80(%ebx),%xmm1 + pxor %xmm2,%xmm6 + movdqa 96(%ebx),%xmm2 + pxor %xmm3,%xmm7 + movdqa 112(%ebx),%xmm3 + movdqu %xmm4,-128(%edi) + movdqu %xmm5,-64(%edi) + movdqu %xmm6,(%edi) + movdqu %xmm7,64(%edi) + leal 16(%edi),%edi + paddd 64(%ebp),%xmm0 + paddd 80(%ebp),%xmm1 + paddd 96(%ebp),%xmm2 + paddd 112(%ebp),%xmm3 + movdqa %xmm0,%xmm6 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm6 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm6,%xmm3 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + movdqu -128(%esi),%xmm4 + movdqu -64(%esi),%xmm5 + movdqu (%esi),%xmm2 + movdqu 64(%esi),%xmm7 + leal 208(%esi),%esi + pxor %xmm0,%xmm4 + pxor %xmm1,%xmm5 + pxor %xmm2,%xmm6 + pxor %xmm3,%xmm7 + movdqu %xmm4,-128(%edi) + movdqu %xmm5,-64(%edi) + movdqu %xmm6,(%edi) + movdqu %xmm7,64(%edi) + leal 208(%edi),%edi + subl $256,%ecx + jnc L009outer_loop + addl $256,%ecx + jz L011done + movl 520(%esp),%ebx + leal -128(%esi),%esi + movl 516(%esp),%edx + leal -128(%edi),%edi + movd 64(%ebp),%xmm2 + movdqu (%ebx),%xmm3 + paddd 96(%eax),%xmm2 + pand 112(%eax),%xmm3 + por %xmm2,%xmm3 +L0081x: + movdqa 32(%eax),%xmm0 + movdqu (%edx),%xmm1 + movdqu 16(%edx),%xmm2 + movdqa (%eax),%xmm6 + movdqa 16(%eax),%xmm7 + movl %ebp,48(%esp) + movdqa %xmm0,(%esp) + movdqa %xmm1,16(%esp) + movdqa %xmm2,32(%esp) + movdqa %xmm3,48(%esp) + movl $10,%edx + jmp L012loop1x +.align 4,0x90 +L013outer1x: + movdqa 80(%eax),%xmm3 + movdqa (%esp),%xmm0 + movdqa 16(%esp),%xmm1 + movdqa 32(%esp),%xmm2 + paddd 48(%esp),%xmm3 + movl $10,%edx + movdqa %xmm3,48(%esp) + jmp L012loop1x +.align 4,0x90 +L012loop1x: + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,222 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $20,%xmm1 + pslld $12,%xmm4 + por %xmm4,%xmm1 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,223 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $25,%xmm1 + pslld $7,%xmm4 + por %xmm4,%xmm1 + pshufd $78,%xmm2,%xmm2 + pshufd $57,%xmm1,%xmm1 + pshufd $147,%xmm3,%xmm3 + nop + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 +.byte 
102,15,56,0,222 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $20,%xmm1 + pslld $12,%xmm4 + por %xmm4,%xmm1 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,223 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $25,%xmm1 + pslld $7,%xmm4 + por %xmm4,%xmm1 + pshufd $78,%xmm2,%xmm2 + pshufd $147,%xmm1,%xmm1 + pshufd $57,%xmm3,%xmm3 + decl %edx + jnz L012loop1x + paddd (%esp),%xmm0 + paddd 16(%esp),%xmm1 + paddd 32(%esp),%xmm2 + paddd 48(%esp),%xmm3 + cmpl $64,%ecx + jb L014tail + movdqu (%esi),%xmm4 + movdqu 16(%esi),%xmm5 + pxor %xmm4,%xmm0 + movdqu 32(%esi),%xmm4 + pxor %xmm5,%xmm1 + movdqu 48(%esi),%xmm5 + pxor %xmm4,%xmm2 + pxor %xmm5,%xmm3 + leal 64(%esi),%esi + movdqu %xmm0,(%edi) + movdqu %xmm1,16(%edi) + movdqu %xmm2,32(%edi) + movdqu %xmm3,48(%edi) + leal 64(%edi),%edi + subl $64,%ecx + jnz L013outer1x + jmp L011done +L014tail: + movdqa %xmm0,(%esp) + movdqa %xmm1,16(%esp) + movdqa %xmm2,32(%esp) + movdqa %xmm3,48(%esp) + xorl %eax,%eax + xorl %edx,%edx + xorl %ebp,%ebp +L015tail_loop: + movb (%esp,%ebp,1),%al + movb (%esi,%ebp,1),%dl + leal 1(%ebp),%ebp + xorb %dl,%al + movb %al,-1(%edi,%ebp,1) + decl %ecx + jnz L015tail_loop +L011done: + movl 512(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 6,0x90 +Lssse3_data: +.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 +.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 +.long 1634760805,857760878,2036477234,1797285236 +.long 0,1,2,3 +.long 4,4,4,4 +.long 1,0,0,0 +.long 4,0,0,0 +.long 0,-1,-1,-1 +.align 6,0x90 +.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54 +.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 +.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 +.byte 114,103,62,0 +.section __IMPORT,__pointers,non_lazy_symbol_pointers +L_OPENSSL_ia32cap_P$non_lazy_ptr: +.indirect_symbol _OPENSSL_ia32cap_P +.long 0 +#endif diff --git a/packager/third_party/boringssl/mac-x86/crypto/rc4/rc4-586.S b/packager/third_party/boringssl/mac-x86/crypto/rc4/rc4-586.S index faecdfa9f2..dcddc58388 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/rc4/rc4-586.S +++ b/packager/third_party/boringssl/mac-x86/crypto/rc4/rc4-586.S @@ -343,39 +343,6 @@ L015exit: popl %ebx popl %ebp ret -.globl _RC4_options -.private_extern _RC4_options -.align 4 -_RC4_options: -L_RC4_options_begin: - call L018pic_point -L018pic_point: - popl %eax - leal L019opts-L018pic_point(%eax),%eax - call L020PIC_me_up -L020PIC_me_up: - popl %edx - movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L020PIC_me_up(%edx),%edx - movl (%edx),%edx - btl $20,%edx - jc L0211xchar - btl $26,%edx - jnc L022ret - addl $25,%eax - ret -L0211xchar: - addl $12,%eax -L022ret: - ret -.align 6,0x90 -L019opts: -.byte 114,99,52,40,52,120,44,105,110,116,41,0 -.byte 114,99,52,40,49,120,44,99,104,97,114,41,0 -.byte 114,99,52,40,56,120,44,109,109,120,41,0 -.byte 82,67,52,32,102,111,114,32,120,56,54,44,32,67,82,89 -.byte 80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114 -.byte 111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 6,0x90 .section __IMPORT,__pointers,non_lazy_symbol_pointers L_OPENSSL_ia32cap_P$non_lazy_ptr: .indirect_symbol _OPENSSL_ia32cap_P diff --git a/packager/third_party/boringssl/mac-x86/crypto/sha/sha1-586.S b/packager/third_party/boringssl/mac-x86/crypto/sha/sha1-586.S index 97aafbf198..72a7205dea 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/sha/sha1-586.S +++ b/packager/third_party/boringssl/mac-x86/crypto/sha/sha1-586.S @@ -22,8 +22,11 @@ L000pic_point: movl 8(%esi),%ecx 
testl $16777216,%eax jz L001x86 - testl $536870912,%ecx - jnz Lshaext_shortcut + andl $268435456,%edx + andl $1073741824,%eax + orl %edx,%eax + cmpl $1342177280,%eax + je Lavx_shortcut jmp Lssse3_shortcut .align 4,0x90 L001x86: @@ -1391,9 +1394,9 @@ L002loop: popl %ebx popl %ebp ret -.private_extern __sha1_block_data_order_shaext +.private_extern __sha1_block_data_order_ssse3 .align 4 -__sha1_block_data_order_shaext: +__sha1_block_data_order_ssse3: pushl %ebp pushl %ebx pushl %esi @@ -1402,175 +1405,6 @@ __sha1_block_data_order_shaext: L003pic_point: popl %ebp leal LK_XX_XX-L003pic_point(%ebp),%ebp -Lshaext_shortcut: - movl 20(%esp),%edi - movl %esp,%ebx - movl 24(%esp),%esi - movl 28(%esp),%ecx - subl $32,%esp - movdqu (%edi),%xmm0 - movd 16(%edi),%xmm1 - andl $-32,%esp - movdqa 80(%ebp),%xmm3 - movdqu (%esi),%xmm4 - pshufd $27,%xmm0,%xmm0 - movdqu 16(%esi),%xmm5 - pshufd $27,%xmm1,%xmm1 - movdqu 32(%esi),%xmm6 -.byte 102,15,56,0,227 - movdqu 48(%esi),%xmm7 -.byte 102,15,56,0,235 -.byte 102,15,56,0,243 -.byte 102,15,56,0,251 - jmp L004loop_shaext -.align 4,0x90 -L004loop_shaext: - decl %ecx - leal 64(%esi),%eax - movdqa %xmm1,(%esp) - paddd %xmm4,%xmm1 - cmovnel %eax,%esi - movdqa %xmm0,16(%esp) -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,0 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,0 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,0 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,0 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,0 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,1 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,1 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,1 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,1 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,1 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,2 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,2 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,2 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,2 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,2 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,3 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 - movdqu (%esi),%xmm4 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,3 -.byte 15,56,200,213 - movdqu 16(%esi),%xmm5 -.byte 102,15,56,0,227 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,3 -.byte 15,56,200,206 - movdqu 32(%esi),%xmm6 -.byte 102,15,56,0,235 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,3 -.byte 15,56,200,215 - movdqu 
48(%esi),%xmm7 -.byte 102,15,56,0,243 - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,3 - movdqa (%esp),%xmm2 -.byte 102,15,56,0,251 -.byte 15,56,200,202 - paddd 16(%esp),%xmm0 - jnz L004loop_shaext - pshufd $27,%xmm0,%xmm0 - pshufd $27,%xmm1,%xmm1 - movdqu %xmm0,(%edi) - movd %xmm1,16(%edi) - movl %ebx,%esp - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.private_extern __sha1_block_data_order_ssse3 -.align 4 -__sha1_block_data_order_ssse3: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - call L005pic_point -L005pic_point: - popl %ebp - leal LK_XX_XX-L005pic_point(%ebp),%ebp Lssse3_shortcut: movdqa (%ebp),%xmm7 movdqa 16(%ebp),%xmm0 @@ -1623,9 +1457,9 @@ Lssse3_shortcut: xorl %edx,%ebp pshufd $238,%xmm0,%xmm4 andl %ebp,%esi - jmp L006loop + jmp L004loop .align 4,0x90 -L006loop: +L004loop: rorl $2,%ebx xorl %edx,%esi movl %eax,%ebp @@ -2528,7 +2362,7 @@ L006loop: addl %edx,%ecx movl 196(%esp),%ebp cmpl 200(%esp),%ebp - je L007done + je L005done movdqa 160(%esp),%xmm7 movdqa 176(%esp),%xmm6 movdqu (%ebp),%xmm0 @@ -2663,9 +2497,9 @@ L006loop: pshufd $238,%xmm0,%xmm4 andl %ebx,%esi movl %ebp,%ebx - jmp L006loop + jmp L004loop .align 4,0x90 -L007done: +L005done: addl 16(%esp),%ebx xorl %edi,%esi movl %ecx,%ebp @@ -2778,6 +2612,1175 @@ L007done: popl %ebx popl %ebp ret +.private_extern __sha1_block_data_order_avx +.align 4 +__sha1_block_data_order_avx: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + call L006pic_point +L006pic_point: + popl %ebp + leal LK_XX_XX-L006pic_point(%ebp),%ebp +Lavx_shortcut: + vzeroall + vmovdqa (%ebp),%xmm7 + vmovdqa 16(%ebp),%xmm0 + vmovdqa 32(%ebp),%xmm1 + vmovdqa 48(%ebp),%xmm2 + vmovdqa 64(%ebp),%xmm6 + movl 20(%esp),%edi + movl 24(%esp),%ebp + movl 28(%esp),%edx + movl %esp,%esi + subl $208,%esp + andl $-64,%esp + vmovdqa %xmm0,112(%esp) + vmovdqa %xmm1,128(%esp) + vmovdqa %xmm2,144(%esp) + shll $6,%edx + vmovdqa %xmm7,160(%esp) + addl %ebp,%edx + vmovdqa %xmm6,176(%esp) + addl $64,%ebp + movl %edi,192(%esp) + movl %ebp,196(%esp) + movl %edx,200(%esp) + movl %esi,204(%esp) + movl (%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx + movl 16(%edi),%edi + movl %ebx,%esi + vmovdqu -64(%ebp),%xmm0 + vmovdqu -48(%ebp),%xmm1 + vmovdqu -32(%ebp),%xmm2 + vmovdqu -16(%ebp),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + vpshufb %xmm6,%xmm1,%xmm1 + vpshufb %xmm6,%xmm2,%xmm2 + vmovdqa %xmm7,96(%esp) + vpshufb %xmm6,%xmm3,%xmm3 + vpaddd %xmm7,%xmm0,%xmm4 + vpaddd %xmm7,%xmm1,%xmm5 + vpaddd %xmm7,%xmm2,%xmm6 + vmovdqa %xmm4,(%esp) + movl %ecx,%ebp + vmovdqa %xmm5,16(%esp) + xorl %edx,%ebp + vmovdqa %xmm6,32(%esp) + andl %ebp,%esi + jmp L007loop +.align 4,0x90 +L007loop: + shrdl $2,%ebx,%ebx + xorl %edx,%esi + vpalignr $8,%xmm0,%xmm1,%xmm4 + movl %eax,%ebp + addl (%esp),%edi + vpaddd %xmm3,%xmm7,%xmm7 + vmovdqa %xmm0,64(%esp) + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrldq $4,%xmm3,%xmm6 + addl %esi,%edi + andl %ebx,%ebp + vpxor %xmm0,%xmm4,%xmm4 + xorl %ecx,%ebx + addl %eax,%edi + vpxor %xmm2,%xmm6,%xmm6 + shrdl $7,%eax,%eax + xorl %ecx,%ebp + vmovdqa %xmm7,48(%esp) + movl %edi,%esi + addl 4(%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 + xorl %ebx,%eax + shldl $5,%edi,%edi + addl %ebp,%edx + andl %eax,%esi + vpsrld $31,%xmm4,%xmm6 + xorl %ebx,%eax + addl %edi,%edx + shrdl $7,%edi,%edi + xorl %ebx,%esi + vpslldq $12,%xmm4,%xmm0 + vpaddd %xmm4,%xmm4,%xmm4 + movl %edx,%ebp + addl 8(%esp),%ecx + xorl %eax,%edi + shldl $5,%edx,%edx + vpsrld $30,%xmm0,%xmm7 + vpor %xmm6,%xmm4,%xmm4 + addl %esi,%ecx + andl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + vpslld 
$2,%xmm0,%xmm0 + shrdl $7,%edx,%edx + xorl %eax,%ebp + vpxor %xmm7,%xmm4,%xmm4 + movl %ecx,%esi + addl 12(%esp),%ebx + xorl %edi,%edx + shldl $5,%ecx,%ecx + vpxor %xmm0,%xmm4,%xmm4 + addl %ebp,%ebx + andl %edx,%esi + vmovdqa 96(%esp),%xmm0 + xorl %edi,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %edi,%esi + vpalignr $8,%xmm1,%xmm2,%xmm5 + movl %ebx,%ebp + addl 16(%esp),%eax + vpaddd %xmm4,%xmm0,%xmm0 + vmovdqa %xmm1,80(%esp) + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrldq $4,%xmm4,%xmm7 + addl %esi,%eax + andl %ecx,%ebp + vpxor %xmm1,%xmm5,%xmm5 + xorl %edx,%ecx + addl %ebx,%eax + vpxor %xmm3,%xmm7,%xmm7 + shrdl $7,%ebx,%ebx + xorl %edx,%ebp + vmovdqa %xmm0,(%esp) + movl %eax,%esi + addl 20(%esp),%edi + vpxor %xmm7,%xmm5,%xmm5 + xorl %ecx,%ebx + shldl $5,%eax,%eax + addl %ebp,%edi + andl %ebx,%esi + vpsrld $31,%xmm5,%xmm7 + xorl %ecx,%ebx + addl %eax,%edi + shrdl $7,%eax,%eax + xorl %ecx,%esi + vpslldq $12,%xmm5,%xmm1 + vpaddd %xmm5,%xmm5,%xmm5 + movl %edi,%ebp + addl 24(%esp),%edx + xorl %ebx,%eax + shldl $5,%edi,%edi + vpsrld $30,%xmm1,%xmm0 + vpor %xmm7,%xmm5,%xmm5 + addl %esi,%edx + andl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + vpslld $2,%xmm1,%xmm1 + shrdl $7,%edi,%edi + xorl %ebx,%ebp + vpxor %xmm0,%xmm5,%xmm5 + movl %edx,%esi + addl 28(%esp),%ecx + xorl %eax,%edi + shldl $5,%edx,%edx + vpxor %xmm1,%xmm5,%xmm5 + addl %ebp,%ecx + andl %edi,%esi + vmovdqa 112(%esp),%xmm1 + xorl %eax,%edi + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + vpalignr $8,%xmm2,%xmm3,%xmm6 + movl %ecx,%ebp + addl 32(%esp),%ebx + vpaddd %xmm5,%xmm1,%xmm1 + vmovdqa %xmm2,96(%esp) + xorl %edi,%edx + shldl $5,%ecx,%ecx + vpsrldq $4,%xmm5,%xmm0 + addl %esi,%ebx + andl %edx,%ebp + vpxor %xmm2,%xmm6,%xmm6 + xorl %edi,%edx + addl %ecx,%ebx + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%ecx,%ecx + xorl %edi,%ebp + vmovdqa %xmm1,16(%esp) + movl %ebx,%esi + addl 36(%esp),%eax + vpxor %xmm0,%xmm6,%xmm6 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + addl %ebp,%eax + andl %ecx,%esi + vpsrld $31,%xmm6,%xmm0 + xorl %edx,%ecx + addl %ebx,%eax + shrdl $7,%ebx,%ebx + xorl %edx,%esi + vpslldq $12,%xmm6,%xmm2 + vpaddd %xmm6,%xmm6,%xmm6 + movl %eax,%ebp + addl 40(%esp),%edi + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm1 + vpor %xmm0,%xmm6,%xmm6 + addl %esi,%edi + andl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + vpslld $2,%xmm2,%xmm2 + vmovdqa 64(%esp),%xmm0 + shrdl $7,%eax,%eax + xorl %ecx,%ebp + vpxor %xmm1,%xmm6,%xmm6 + movl %edi,%esi + addl 44(%esp),%edx + xorl %ebx,%eax + shldl $5,%edi,%edi + vpxor %xmm2,%xmm6,%xmm6 + addl %ebp,%edx + andl %eax,%esi + vmovdqa 112(%esp),%xmm2 + xorl %ebx,%eax + addl %edi,%edx + shrdl $7,%edi,%edi + xorl %ebx,%esi + vpalignr $8,%xmm3,%xmm4,%xmm7 + movl %edx,%ebp + addl 48(%esp),%ecx + vpaddd %xmm6,%xmm2,%xmm2 + vmovdqa %xmm3,64(%esp) + xorl %eax,%edi + shldl $5,%edx,%edx + vpsrldq $4,%xmm6,%xmm1 + addl %esi,%ecx + andl %edi,%ebp + vpxor %xmm3,%xmm7,%xmm7 + xorl %eax,%edi + addl %edx,%ecx + vpxor %xmm5,%xmm1,%xmm1 + shrdl $7,%edx,%edx + xorl %eax,%ebp + vmovdqa %xmm2,32(%esp) + movl %ecx,%esi + addl 52(%esp),%ebx + vpxor %xmm1,%xmm7,%xmm7 + xorl %edi,%edx + shldl $5,%ecx,%ecx + addl %ebp,%ebx + andl %edx,%esi + vpsrld $31,%xmm7,%xmm1 + xorl %edi,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %edi,%esi + vpslldq $12,%xmm7,%xmm3 + vpaddd %xmm7,%xmm7,%xmm7 + movl %ebx,%ebp + addl 56(%esp),%eax + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm2 + vpor %xmm1,%xmm7,%xmm7 + addl %esi,%eax + andl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + vpslld $2,%xmm3,%xmm3 + 
vmovdqa 80(%esp),%xmm1 + shrdl $7,%ebx,%ebx + xorl %edx,%ebp + vpxor %xmm2,%xmm7,%xmm7 + movl %eax,%esi + addl 60(%esp),%edi + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpxor %xmm3,%xmm7,%xmm7 + addl %ebp,%edi + andl %ebx,%esi + vmovdqa 112(%esp),%xmm3 + xorl %ecx,%ebx + addl %eax,%edi + vpalignr $8,%xmm6,%xmm7,%xmm2 + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + xorl %ecx,%esi + movl %edi,%ebp + addl (%esp),%edx + vpxor %xmm1,%xmm0,%xmm0 + vmovdqa %xmm4,80(%esp) + xorl %ebx,%eax + shldl $5,%edi,%edi + vmovdqa %xmm3,%xmm4 + vpaddd %xmm7,%xmm3,%xmm3 + addl %esi,%edx + andl %eax,%ebp + vpxor %xmm2,%xmm0,%xmm0 + xorl %ebx,%eax + addl %edi,%edx + shrdl $7,%edi,%edi + xorl %ebx,%ebp + vpsrld $30,%xmm0,%xmm2 + vmovdqa %xmm3,48(%esp) + movl %edx,%esi + addl 4(%esp),%ecx + xorl %eax,%edi + shldl $5,%edx,%edx + vpslld $2,%xmm0,%xmm0 + addl %ebp,%ecx + andl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + movl %ecx,%ebp + addl 8(%esp),%ebx + vpor %xmm2,%xmm0,%xmm0 + xorl %edi,%edx + shldl $5,%ecx,%ecx + vmovdqa 96(%esp),%xmm2 + addl %esi,%ebx + andl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + addl 12(%esp),%eax + xorl %edi,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm7,%xmm0,%xmm3 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm5,96(%esp) + addl %esi,%edi + xorl %ecx,%ebp + vmovdqa %xmm4,%xmm5 + vpaddd %xmm0,%xmm4,%xmm4 + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpxor %xmm3,%xmm1,%xmm1 + addl 20(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + vpsrld $30,%xmm1,%xmm3 + vmovdqa %xmm4,(%esp) + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + vpslld $2,%xmm1,%xmm1 + addl 24(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + vpor %xmm3,%xmm1,%xmm1 + addl 28(%esp),%ebx + xorl %edi,%ebp + vmovdqa 64(%esp),%xmm3 + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpalignr $8,%xmm0,%xmm1,%xmm4 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + vpxor %xmm3,%xmm2,%xmm2 + vmovdqa %xmm6,64(%esp) + addl %esi,%eax + xorl %edx,%ebp + vmovdqa 128(%esp),%xmm6 + vpaddd %xmm1,%xmm5,%xmm5 + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpxor %xmm4,%xmm2,%xmm2 + addl 36(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm4 + vmovdqa %xmm5,16(%esp) + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpslld $2,%xmm2,%xmm2 + addl 40(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + vpor %xmm4,%xmm2,%xmm2 + addl 44(%esp),%ecx + xorl %eax,%ebp + vmovdqa 80(%esp),%xmm4 + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + vpalignr $8,%xmm1,%xmm2,%xmm5 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + vpxor %xmm4,%xmm3,%xmm3 + vmovdqa %xmm7,80(%esp) + addl %esi,%ebx + xorl %edi,%ebp + vmovdqa %xmm6,%xmm7 + vpaddd %xmm2,%xmm6,%xmm6 + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpxor %xmm5,%xmm3,%xmm3 + addl 52(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm5 + vmovdqa %xmm6,32(%esp) + addl 
%ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpslld $2,%xmm3,%xmm3 + addl 56(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ecx,%ebp + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpor %xmm5,%xmm3,%xmm3 + addl 60(%esp),%edx + xorl %ebx,%ebp + vmovdqa 96(%esp),%xmm5 + movl %edi,%esi + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + vpalignr $8,%xmm2,%xmm3,%xmm6 + vpxor %xmm0,%xmm4,%xmm4 + addl (%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + vpxor %xmm5,%xmm4,%xmm4 + vmovdqa %xmm0,96(%esp) + addl %esi,%ecx + xorl %eax,%ebp + vmovdqa %xmm7,%xmm0 + vpaddd %xmm3,%xmm7,%xmm7 + shrdl $7,%edi,%edi + addl %edx,%ecx + vpxor %xmm6,%xmm4,%xmm4 + addl 4(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + vpsrld $30,%xmm4,%xmm6 + vmovdqa %xmm7,48(%esp) + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpslld $2,%xmm4,%xmm4 + addl 8(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpor %xmm6,%xmm4,%xmm4 + addl 12(%esp),%edi + xorl %ecx,%ebp + vmovdqa 64(%esp),%xmm6 + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpalignr $8,%xmm3,%xmm4,%xmm7 + vpxor %xmm1,%xmm5,%xmm5 + addl 16(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + vpxor %xmm6,%xmm5,%xmm5 + vmovdqa %xmm1,64(%esp) + addl %esi,%edx + xorl %ebx,%ebp + vmovdqa %xmm0,%xmm1 + vpaddd %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + addl %edi,%edx + vpxor %xmm7,%xmm5,%xmm5 + addl 20(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + vpsrld $30,%xmm5,%xmm7 + vmovdqa %xmm0,(%esp) + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + vpslld $2,%xmm5,%xmm5 + addl 24(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpor %xmm7,%xmm5,%xmm5 + addl 28(%esp),%eax + vmovdqa 80(%esp),%xmm7 + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%ebp + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm4,%xmm5,%xmm0 + vpxor %xmm2,%xmm6,%xmm6 + addl 32(%esp),%edi + andl %ecx,%esi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vpxor %xmm7,%xmm6,%xmm6 + vmovdqa %xmm2,80(%esp) + movl %eax,%ebp + xorl %ecx,%esi + vmovdqa %xmm1,%xmm2 + vpaddd %xmm5,%xmm1,%xmm1 + shldl $5,%eax,%eax + addl %esi,%edi + vpxor %xmm0,%xmm6,%xmm6 + xorl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + addl 36(%esp),%edx + vpsrld $30,%xmm6,%xmm0 + vmovdqa %xmm1,16(%esp) + andl %ebx,%ebp + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %edi,%esi + vpslld $2,%xmm6,%xmm6 + xorl %ebx,%ebp + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %edi,%edx + addl 40(%esp),%ecx + andl %eax,%esi + vpor %xmm0,%xmm6,%xmm6 + xorl %ebx,%eax + shrdl $7,%edi,%edi + vmovdqa 96(%esp),%xmm0 + movl %edx,%ebp + xorl %eax,%esi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + addl 44(%esp),%ebx + andl %edi,%ebp + xorl %eax,%edi + shrdl $7,%edx,%edx + movl %ecx,%esi + xorl %edi,%ebp + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edx,%esi + xorl %edi,%edx + addl %ecx,%ebx + vpalignr $8,%xmm5,%xmm6,%xmm1 + vpxor %xmm3,%xmm7,%xmm7 + addl 48(%esp),%eax + andl %edx,%esi + xorl %edi,%edx + shrdl $7,%ecx,%ecx + vpxor %xmm0,%xmm7,%xmm7 + vmovdqa %xmm3,96(%esp) + movl %ebx,%ebp + 
xorl %edx,%esi + vmovdqa 144(%esp),%xmm3 + vpaddd %xmm6,%xmm2,%xmm2 + shldl $5,%ebx,%ebx + addl %esi,%eax + vpxor %xmm1,%xmm7,%xmm7 + xorl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + addl 52(%esp),%edi + vpsrld $30,%xmm7,%xmm1 + vmovdqa %xmm2,32(%esp) + andl %ecx,%ebp + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + vpslld $2,%xmm7,%xmm7 + xorl %ecx,%ebp + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%edi + addl 56(%esp),%edx + andl %ebx,%esi + vpor %xmm1,%xmm7,%xmm7 + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vmovdqa 64(%esp),%xmm1 + movl %edi,%ebp + xorl %ebx,%esi + shldl $5,%edi,%edi + addl %esi,%edx + xorl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + addl 60(%esp),%ecx + andl %eax,%ebp + xorl %ebx,%eax + shrdl $7,%edi,%edi + movl %edx,%esi + xorl %eax,%ebp + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + vpalignr $8,%xmm6,%xmm7,%xmm2 + vpxor %xmm4,%xmm0,%xmm0 + addl (%esp),%ebx + andl %edi,%esi + xorl %eax,%edi + shrdl $7,%edx,%edx + vpxor %xmm1,%xmm0,%xmm0 + vmovdqa %xmm4,64(%esp) + movl %ecx,%ebp + xorl %edi,%esi + vmovdqa %xmm3,%xmm4 + vpaddd %xmm7,%xmm3,%xmm3 + shldl $5,%ecx,%ecx + addl %esi,%ebx + vpxor %xmm2,%xmm0,%xmm0 + xorl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + addl 4(%esp),%eax + vpsrld $30,%xmm0,%xmm2 + vmovdqa %xmm3,48(%esp) + andl %edx,%ebp + xorl %edi,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + vpslld $2,%xmm0,%xmm0 + xorl %edx,%ebp + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + addl 8(%esp),%edi + andl %ecx,%esi + vpor %xmm2,%xmm0,%xmm0 + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vmovdqa 80(%esp),%xmm2 + movl %eax,%ebp + xorl %ecx,%esi + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + addl 12(%esp),%edx + andl %ebx,%ebp + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %edi,%esi + xorl %ebx,%ebp + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %edi,%edx + vpalignr $8,%xmm7,%xmm0,%xmm3 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%esp),%ecx + andl %eax,%esi + xorl %ebx,%eax + shrdl $7,%edi,%edi + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm5,80(%esp) + movl %edx,%ebp + xorl %eax,%esi + vmovdqa %xmm4,%xmm5 + vpaddd %xmm0,%xmm4,%xmm4 + shldl $5,%edx,%edx + addl %esi,%ecx + vpxor %xmm3,%xmm1,%xmm1 + xorl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + addl 20(%esp),%ebx + vpsrld $30,%xmm1,%xmm3 + vmovdqa %xmm4,(%esp) + andl %edi,%ebp + xorl %eax,%edi + shrdl $7,%edx,%edx + movl %ecx,%esi + vpslld $2,%xmm1,%xmm1 + xorl %edi,%ebp + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edx,%esi + xorl %edi,%edx + addl %ecx,%ebx + addl 24(%esp),%eax + andl %edx,%esi + vpor %xmm3,%xmm1,%xmm1 + xorl %edi,%edx + shrdl $7,%ecx,%ecx + vmovdqa 96(%esp),%xmm3 + movl %ebx,%ebp + xorl %edx,%esi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + addl 28(%esp),%edi + andl %ecx,%ebp + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + xorl %ecx,%ebp + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%edi + vpalignr $8,%xmm0,%xmm1,%xmm4 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%esp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vpxor %xmm3,%xmm2,%xmm2 + vmovdqa %xmm6,96(%esp) + movl %edi,%ebp + xorl %ebx,%esi + vmovdqa %xmm5,%xmm6 + vpaddd %xmm1,%xmm5,%xmm5 + shldl $5,%edi,%edi + addl %esi,%edx + vpxor %xmm4,%xmm2,%xmm2 + xorl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + addl 36(%esp),%ecx + vpsrld $30,%xmm2,%xmm4 + vmovdqa 
%xmm5,16(%esp) + andl %eax,%ebp + xorl %ebx,%eax + shrdl $7,%edi,%edi + movl %edx,%esi + vpslld $2,%xmm2,%xmm2 + xorl %eax,%ebp + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + addl 40(%esp),%ebx + andl %edi,%esi + vpor %xmm4,%xmm2,%xmm2 + xorl %eax,%edi + shrdl $7,%edx,%edx + vmovdqa 64(%esp),%xmm4 + movl %ecx,%ebp + xorl %edi,%esi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + addl 44(%esp),%eax + andl %edx,%ebp + xorl %edi,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%ebp + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + addl %ebx,%eax + vpalignr $8,%xmm1,%xmm2,%xmm5 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + vpxor %xmm4,%xmm3,%xmm3 + vmovdqa %xmm7,64(%esp) + addl %esi,%edi + xorl %ecx,%ebp + vmovdqa %xmm6,%xmm7 + vpaddd %xmm2,%xmm6,%xmm6 + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpxor %xmm5,%xmm3,%xmm3 + addl 52(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + vpsrld $30,%xmm3,%xmm5 + vmovdqa %xmm6,32(%esp) + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + vpslld $2,%xmm3,%xmm3 + addl 56(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + vpor %xmm5,%xmm3,%xmm3 + addl 60(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl (%esp),%eax + vpaddd %xmm3,%xmm7,%xmm7 + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + vmovdqa %xmm7,48(%esp) + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 4(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 8(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + addl 12(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + movl 196(%esp),%ebp + cmpl 200(%esp),%ebp + je L008done + vmovdqa 160(%esp),%xmm7 + vmovdqa 176(%esp),%xmm6 + vmovdqu (%ebp),%xmm0 + vmovdqu 16(%ebp),%xmm1 + vmovdqu 32(%ebp),%xmm2 + vmovdqu 48(%ebp),%xmm3 + addl $64,%ebp + vpshufb %xmm6,%xmm0,%xmm0 + movl %ebp,196(%esp) + vmovdqa %xmm7,96(%esp) + addl 16(%esp),%ebx + xorl %edi,%esi + vpshufb %xmm6,%xmm1,%xmm1 + movl %ecx,%ebp + shldl $5,%ecx,%ecx + vpaddd %xmm7,%xmm0,%xmm4 + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + vmovdqa %xmm4,(%esp) + addl 20(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ecx,%ebp + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 28(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + addl 32(%esp),%ecx + xorl %eax,%esi + vpshufb %xmm6,%xmm2,%xmm2 + movl %edx,%ebp + shldl $5,%edx,%edx + vpaddd %xmm7,%xmm1,%xmm5 + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + vmovdqa %xmm5,16(%esp) + addl 36(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%esp),%eax + xorl %edx,%esi + movl 
%ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 48(%esp),%edx + xorl %ebx,%esi + vpshufb %xmm6,%xmm3,%xmm3 + movl %edi,%ebp + shldl $5,%edi,%edi + vpaddd %xmm7,%xmm2,%xmm6 + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + vmovdqa %xmm6,32(%esp) + addl 52(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + addl 56(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + movl 192(%esp),%ebp + addl (%ebp),%eax + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,%ebx + movl %ecx,8(%ebp) + xorl %edx,%ebx + movl %edx,12(%ebp) + movl %edi,16(%ebp) + movl %esi,%ebp + andl %ebx,%esi + movl %ebp,%ebx + jmp L007loop +.align 4,0x90 +L008done: + addl 16(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 20(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ecx,%ebp + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 28(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + addl 32(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + addl 36(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 48(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + addl 52(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + addl 56(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vzeroall + movl 192(%esp),%ebp + addl (%ebp),%eax + movl 204(%esp),%esp + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,8(%ebp) + movl %edx,12(%ebp) + movl %edi,16(%ebp) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret .align 6,0x90 LK_XX_XX: .long 1518500249,1518500249,1518500249,1518500249 diff --git a/packager/third_party/boringssl/mac-x86/crypto/sha/sha256-586.S b/packager/third_party/boringssl/mac-x86/crypto/sha/sha256-586.S index 
f0ba612fab..841854f7a9 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/sha/sha256-586.S +++ b/packager/third_party/boringssl/mac-x86/crypto/sha/sha256-586.S @@ -36,11 +36,10 @@ L000pic_point: jz L003no_xmm andl $1073741824,%ecx andl $268435968,%ebx - testl $536870912,%edx - jnz L004shaext orl %ebx,%ecx andl $1342177280,%ecx cmpl $1342177280,%ecx + je L004AVX testl $512,%ebx jnz L005SSSE3 L003no_xmm: @@ -3166,204 +3165,6 @@ L009grand_loop: popl %ebp ret .align 5,0x90 -L004shaext: - subl $32,%esp - movdqu (%esi),%xmm1 - leal 128(%ebp),%ebp - movdqu 16(%esi),%xmm2 - movdqa 128(%ebp),%xmm7 - pshufd $27,%xmm1,%xmm0 - pshufd $177,%xmm1,%xmm1 - pshufd $27,%xmm2,%xmm2 -.byte 102,15,58,15,202,8 - punpcklqdq %xmm0,%xmm2 - jmp L010loop_shaext -.align 4,0x90 -L010loop_shaext: - movdqu (%edi),%xmm3 - movdqu 16(%edi),%xmm4 - movdqu 32(%edi),%xmm5 -.byte 102,15,56,0,223 - movdqu 48(%edi),%xmm6 - movdqa %xmm2,16(%esp) - movdqa -128(%ebp),%xmm0 - paddd %xmm3,%xmm0 -.byte 102,15,56,0,231 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - nop - movdqa %xmm1,(%esp) -.byte 15,56,203,202 - movdqa -112(%ebp),%xmm0 - paddd %xmm4,%xmm0 -.byte 102,15,56,0,239 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - leal 64(%edi),%edi -.byte 15,56,204,220 -.byte 15,56,203,202 - movdqa -96(%ebp),%xmm0 - paddd %xmm5,%xmm0 -.byte 102,15,56,0,247 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - movdqa -80(%ebp),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa -64(%ebp),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa -48(%ebp),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 - nop - paddd %xmm7,%xmm6 -.byte 15,56,204,220 -.byte 15,56,203,202 - movdqa -32(%ebp),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,205,245 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - movdqa -16(%ebp),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa (%ebp),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa 16(%ebp),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 - nop - paddd %xmm7,%xmm6 -.byte 15,56,204,220 -.byte 15,56,203,202 - movdqa 32(%ebp),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,205,245 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - movdqa 48(%ebp),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa 
64(%ebp),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa 80(%ebp),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 -.byte 15,56,203,202 - paddd %xmm7,%xmm6 - movdqa 96(%ebp),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 -.byte 15,56,205,245 - movdqa 128(%ebp),%xmm7 -.byte 15,56,203,202 - movdqa 112(%ebp),%xmm0 - paddd %xmm6,%xmm0 - nop -.byte 15,56,203,209 - pshufd $14,%xmm0,%xmm0 - cmpl %edi,%eax - nop -.byte 15,56,203,202 - paddd 16(%esp),%xmm2 - paddd (%esp),%xmm1 - jnz L010loop_shaext - pshufd $177,%xmm2,%xmm2 - pshufd $27,%xmm1,%xmm7 - pshufd $177,%xmm1,%xmm1 - punpckhqdq %xmm2,%xmm1 -.byte 102,15,58,15,215,8 - movl 44(%esp),%esp - movdqu %xmm1,(%esi) - movdqu %xmm2,16(%esi) - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.align 5,0x90 L005SSSE3: leal -96(%esp),%esp movl (%esi),%eax @@ -3383,9 +3184,9 @@ L005SSSE3: movl %ecx,24(%esp) movl %esi,28(%esp) movdqa 256(%ebp),%xmm7 - jmp L011grand_ssse3 + jmp L010grand_ssse3 .align 4,0x90 -L011grand_ssse3: +L010grand_ssse3: movdqu (%edi),%xmm0 movdqu 16(%edi),%xmm1 movdqu 32(%edi),%xmm2 @@ -3408,9 +3209,9 @@ L011grand_ssse3: paddd %xmm3,%xmm7 movdqa %xmm6,64(%esp) movdqa %xmm7,80(%esp) - jmp L012ssse3_00_47 + jmp L011ssse3_00_47 .align 4,0x90 -L012ssse3_00_47: +L011ssse3_00_47: addl $64,%ebp movl %edx,%ecx movdqa %xmm1,%xmm4 @@ -4053,7 +3854,7 @@ L012ssse3_00_47: addl %ecx,%eax movdqa %xmm6,80(%esp) cmpl $66051,64(%ebp) - jne L012ssse3_00_47 + jne L011ssse3_00_47 movl %edx,%ecx rorl $14,%edx movl 20(%esp),%esi @@ -4567,13 +4368,1194 @@ L012ssse3_00_47: movdqa 64(%ebp),%xmm7 subl $192,%ebp cmpl 104(%esp),%edi - jb L011grand_ssse3 + jb L010grand_ssse3 movl 108(%esp),%esp popl %edi popl %esi popl %ebx popl %ebp ret +.align 5,0x90 +L004AVX: + leal -96(%esp),%esp + vzeroall + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edi + movl %ebx,4(%esp) + xorl %ecx,%ebx + movl %ecx,8(%esp) + movl %edi,12(%esp) + movl 16(%esi),%edx + movl 20(%esi),%edi + movl 24(%esi),%ecx + movl 28(%esi),%esi + movl %edi,20(%esp) + movl 100(%esp),%edi + movl %ecx,24(%esp) + movl %esi,28(%esp) + vmovdqa 256(%ebp),%xmm7 + jmp L012grand_avx +.align 5,0x90 +L012grand_avx: + vmovdqu (%edi),%xmm0 + vmovdqu 16(%edi),%xmm1 + vmovdqu 32(%edi),%xmm2 + vmovdqu 48(%edi),%xmm3 + addl $64,%edi + vpshufb %xmm7,%xmm0,%xmm0 + movl %edi,100(%esp) + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd (%ebp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 16(%ebp),%xmm1,%xmm5 + vpaddd 32(%ebp),%xmm2,%xmm6 + vpaddd 48(%ebp),%xmm3,%xmm7 + vmovdqa %xmm4,32(%esp) + vmovdqa %xmm5,48(%esp) + vmovdqa %xmm6,64(%esp) + vmovdqa %xmm7,80(%esp) + jmp L013avx_00_47 +.align 4,0x90 +L013avx_00_47: + addl $64,%ebp + vpalignr $4,%xmm0,%xmm1,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + vpalignr $4,%xmm2,%xmm3,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + vpaddd %xmm7,%xmm0,%xmm0 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 
28(%esp),%edx + vpshufd $250,%xmm3,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 32(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + vpaddd %xmm4,%xmm0,%xmm0 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 36(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + vpaddd %xmm7,%xmm0,%xmm0 + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm0,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 40(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm0,%xmm0 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + vpaddd (%ebp),%xmm0,%xmm6 + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,32(%esp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + vpalignr $4,%xmm3,%xmm0,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + vpaddd %xmm7,%xmm1,%xmm1 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + vpshufd $250,%xmm0,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 48(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + 
vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + vpaddd %xmm4,%xmm1,%xmm1 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 52(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + vpaddd %xmm7,%xmm1,%xmm1 + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm1,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 56(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm1,%xmm1 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + vpaddd 16(%ebp),%xmm1,%xmm6 + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,48(%esp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + vpalignr $4,%xmm0,%xmm1,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + vpaddd %xmm7,%xmm2,%xmm2 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + vpshufd $250,%xmm1,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 64(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + vpaddd %xmm4,%xmm2,%xmm2 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + vpsrlq 
$19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 68(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + vpaddd %xmm7,%xmm2,%xmm2 + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm2,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 72(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm2,%xmm2 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + vpaddd 32(%ebp),%xmm2,%xmm6 + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,64(%esp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + vpalignr $4,%xmm1,%xmm2,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + vpaddd %xmm7,%xmm3,%xmm3 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + vpshufd $250,%xmm2,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 80(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + vpaddd %xmm4,%xmm3,%xmm3 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 84(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + vpaddd %xmm7,%xmm3,%xmm3 + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm3,%xmm7 + 
shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 88(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm3,%xmm3 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + vpaddd 48(%ebp),%xmm3,%xmm6 + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,80(%esp) + cmpl $66051,64(%ebp) + jne L013avx_00_47 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 32(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 36(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 40(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 
(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 48(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 52(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 56(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 64(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 68(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + 
andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 72(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 80(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 84(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 88(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl 96(%esp),%esi + xorl %edi,%ebx + movl 12(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebx + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl %ebx,4(%esp) + xorl %edi,%ebx + movl %edi,8(%esp) + movl %ecx,12(%esp) + movl 
20(%esp),%edi + movl 24(%esp),%ecx + addl 16(%esi),%edx + addl 20(%esi),%edi + addl 24(%esi),%ecx + movl %edx,16(%esi) + movl %edi,20(%esi) + movl %edi,20(%esp) + movl 28(%esp),%edi + movl %ecx,24(%esi) + addl 28(%esi),%edi + movl %ecx,24(%esp) + movl %edi,28(%esi) + movl %edi,28(%esp) + movl 100(%esp),%edi + vmovdqa 64(%ebp),%xmm7 + subl $192,%ebp + cmpl 104(%esp),%edi + jb L012grand_avx + movl 108(%esp),%esp + vzeroall + popl %edi + popl %esi + popl %ebx + popl %ebp + ret .section __IMPORT,__pointers,non_lazy_symbol_pointers L_OPENSSL_ia32cap_P$non_lazy_ptr: .indirect_symbol _OPENSSL_ia32cap_P diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/aes/aes-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/aes/aes-x86_64.S index 02f378bd67..b5d188a0f3 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/aes/aes-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/aes/aes-x86_64.S @@ -82,8 +82,8 @@ L$enc_loop: movl 0(%r14,%rdi,8),%edi movl 0(%r14,%rbp,8),%ebp - andl $65280,%edi - andl $65280,%ebp + andl $0x0000ff00,%edi + andl $0x0000ff00,%ebp xorl %edi,%r10d xorl %ebp,%r11d @@ -95,8 +95,8 @@ L$enc_loop: movl 0(%r14,%rsi,8),%esi movl 0(%r14,%rdi,8),%edi - andl $65280,%esi - andl $65280,%edi + andl $0x0000ff00,%esi + andl $0x0000ff00,%edi shrl $16,%ebx xorl %esi,%r12d xorl %edi,%r8d @@ -109,9 +109,9 @@ L$enc_loop: movl 0(%r14,%rdi,8),%edi movl 0(%r14,%rbp,8),%ebp - andl $16711680,%esi - andl $16711680,%edi - andl $16711680,%ebp + andl $0x00ff0000,%esi + andl $0x00ff0000,%edi + andl $0x00ff0000,%ebp xorl %esi,%r10d xorl %edi,%r11d @@ -124,9 +124,9 @@ L$enc_loop: movl 2(%r14,%rdi,8),%edi movl 2(%r14,%rbp,8),%ebp - andl $16711680,%esi - andl $4278190080,%edi - andl $4278190080,%ebp + andl $0x00ff0000,%esi + andl $0xff000000,%edi + andl $0xff000000,%ebp xorl %esi,%r8d xorl %edi,%r10d @@ -139,8 +139,8 @@ L$enc_loop: movl 2(%r14,%rdi,8),%edi movl 16+0(%r15),%eax - andl $4278190080,%esi - andl $4278190080,%edi + andl $0xff000000,%esi + andl $0xff000000,%edi xorl %esi,%r12d xorl %edi,%r8d @@ -242,8 +242,8 @@ L$enc_loop_compact: xorl %r8d,%edx cmpq 16(%rsp),%r15 je L$enc_compact_done - movl $2155905152,%r10d - movl $2155905152,%r11d + movl $0x80808080,%r10d + movl $0x80808080,%r11d andl %eax,%r10d andl %ebx,%r11d movl %r10d,%esi @@ -254,10 +254,10 @@ L$enc_loop_compact: leal (%rbx,%rbx,1),%r9d subl %r10d,%esi subl %r11d,%edi - andl $4278124286,%r8d - andl $4278124286,%r9d - andl $454761243,%esi - andl $454761243,%edi + andl $0xfefefefe,%r8d + andl $0xfefefefe,%r9d + andl $0x1b1b1b1b,%esi + andl $0x1b1b1b1b,%edi movl %eax,%r10d movl %ebx,%r11d xorl %esi,%r8d @@ -265,9 +265,9 @@ L$enc_loop_compact: xorl %r8d,%eax xorl %r9d,%ebx - movl $2155905152,%r12d + movl $0x80808080,%r12d roll $24,%eax - movl $2155905152,%ebp + movl $0x80808080,%ebp roll $24,%ebx andl %ecx,%r12d andl %edx,%ebp @@ -290,10 +290,10 @@ L$enc_loop_compact: xorl %r10d,%eax xorl %r11d,%ebx - andl $4278124286,%r8d - andl $4278124286,%r9d - andl $454761243,%esi - andl $454761243,%edi + andl $0xfefefefe,%r8d + andl $0xfefefefe,%r9d + andl $0x1b1b1b1b,%esi + andl $0x1b1b1b1b,%edi movl %ecx,%r12d movl %edx,%ebp xorl %esi,%r8d @@ -345,7 +345,7 @@ _asm_AES_encrypt: andq $-64,%rsp subq %rsp,%rcx negq %rcx - andq $960,%rcx + andq $0x3c0,%rcx subq %rcx,%rsp subq $32,%rsp @@ -370,7 +370,7 @@ L$enc_prologue: leaq L$AES_Te+2048(%rip),%r14 leaq 768(%rsp),%rbp subq %r14,%rbp - andq $768,%rbp + andq $0x300,%rbp leaq (%r14,%rbp,1),%r14 call _x86_64_AES_encrypt_compact @@ -791,7 +791,7 @@ _asm_AES_decrypt: andq 
$-64,%rsp subq %rsp,%rcx negq %rcx - andq $960,%rcx + andq $0x3c0,%rcx subq %rcx,%rsp subq $32,%rsp @@ -816,7 +816,7 @@ L$dec_prologue: leaq L$AES_Td+2048(%rip),%r14 leaq 768(%rsp),%rbp subq %r14,%rbp - andq $768,%rbp + andq $0x300,%rbp leaq (%r14,%rbp,1),%r14 shrq $3,%rbp addq %rbp,%r14 @@ -1333,9 +1333,9 @@ L$cbc_picked_te: movq %r14,%r10 leaq 2304(%r14),%r11 movq %r15,%r12 - andq $4095,%r10 - andq $4095,%r11 - andq $4095,%r12 + andq $0xFFF,%r10 + andq $0xFFF,%r11 + andq $0xFFF,%r12 cmpq %r11,%r12 jb L$cbc_te_break_out @@ -1344,7 +1344,7 @@ L$cbc_picked_te: jmp L$cbc_te_ok L$cbc_te_break_out: subq %r10,%r12 - andq $4095,%r12 + andq $0xFFF,%r12 addq $320,%r12 subq %r12,%r15 .p2align 2 @@ -1370,7 +1370,7 @@ L$cbc_fast_body: movq %r15,%r10 subq %r14,%r10 - andq $4095,%r10 + andq $0xfff,%r10 cmpq $2304,%r10 jb L$cbc_do_ecopy cmpq $4096-248,%r10 @@ -1557,7 +1557,7 @@ L$cbc_slow_prologue: leaq -88-63(%rcx),%r10 subq %rbp,%r10 negq %r10 - andq $960,%r10 + andq $0x3c0,%r10 subq %r10,%rbp xchgq %rsp,%rbp @@ -1586,7 +1586,7 @@ L$cbc_slow_body: leaq 2048(%r14),%r14 leaq 768-8(%rsp),%rax subq %r14,%rax - andq $768,%rax + andq $0x300,%rax leaq (%r14,%rax,1),%r14 cmpq $0,%rbx diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/aes/aesni-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/aes/aesni-x86_64.S index 69b22c26b9..3d98fa12b6 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/aes/aesni-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/aes/aesni-x86_64.S @@ -507,7 +507,7 @@ _aesni_ecb_encrypt: testl %r8d,%r8d jz L$ecb_decrypt - cmpq $128,%rdx + cmpq $0x80,%rdx jb L$ecb_enc_tail movdqu (%rdi),%xmm2 @@ -519,7 +519,7 @@ _aesni_ecb_encrypt: movdqu 96(%rdi),%xmm8 movdqu 112(%rdi),%xmm9 leaq 128(%rdi),%rdi - subq $128,%rdx + subq $0x80,%rdx jmp L$ecb_enc_loop8_enter .p2align 4 L$ecb_enc_loop8: @@ -547,7 +547,7 @@ L$ecb_enc_loop8_enter: call _aesni_encrypt8 - subq $128,%rdx + subq $0x80,%rdx jnc L$ecb_enc_loop8 movups %xmm2,(%rsi) @@ -561,22 +561,22 @@ L$ecb_enc_loop8_enter: movups %xmm8,96(%rsi) movups %xmm9,112(%rsi) leaq 128(%rsi),%rsi - addq $128,%rdx + addq $0x80,%rdx jz L$ecb_ret L$ecb_enc_tail: movups (%rdi),%xmm2 - cmpq $32,%rdx + cmpq $0x20,%rdx jb L$ecb_enc_one movups 16(%rdi),%xmm3 je L$ecb_enc_two movups 32(%rdi),%xmm4 - cmpq $64,%rdx + cmpq $0x40,%rdx jb L$ecb_enc_three movups 48(%rdi),%xmm5 je L$ecb_enc_four movups 64(%rdi),%xmm6 - cmpq $96,%rdx + cmpq $0x60,%rdx jb L$ecb_enc_five movups 80(%rdi),%xmm7 je L$ecb_enc_six @@ -650,7 +650,7 @@ L$ecb_enc_six: .p2align 4 L$ecb_decrypt: - cmpq $128,%rdx + cmpq $0x80,%rdx jb L$ecb_dec_tail movdqu (%rdi),%xmm2 @@ -662,7 +662,7 @@ L$ecb_decrypt: movdqu 96(%rdi),%xmm8 movdqu 112(%rdi),%xmm9 leaq 128(%rdi),%rdi - subq $128,%rdx + subq $0x80,%rdx jmp L$ecb_dec_loop8_enter .p2align 4 L$ecb_dec_loop8: @@ -691,7 +691,7 @@ L$ecb_dec_loop8_enter: call _aesni_decrypt8 movups (%r11),%xmm0 - subq $128,%rdx + subq $0x80,%rdx jnc L$ecb_dec_loop8 movups %xmm2,(%rsi) @@ -713,22 +713,22 @@ L$ecb_dec_loop8_enter: movups %xmm9,112(%rsi) pxor %xmm9,%xmm9 leaq 128(%rsi),%rsi - addq $128,%rdx + addq $0x80,%rdx jz L$ecb_ret L$ecb_dec_tail: movups (%rdi),%xmm2 - cmpq $32,%rdx + cmpq $0x20,%rdx jb L$ecb_dec_one movups 16(%rdi),%xmm3 je L$ecb_dec_two movups 32(%rdi),%xmm4 - cmpq $64,%rdx + cmpq $0x40,%rdx jb L$ecb_dec_three movups 48(%rdi),%xmm5 je L$ecb_dec_four movups 64(%rdi),%xmm6 - cmpq $96,%rdx + cmpq $0x60,%rdx jb L$ecb_dec_five movups 80(%rdi),%xmm7 je L$ecb_dec_six @@ -1606,7 +1606,7 @@ L$oop_enc1_8: movdqa 
L$xts_magic(%rip),%xmm8 movdqa %xmm2,%xmm15 - pshufd $95,%xmm2,%xmm9 + pshufd $0x5f,%xmm2,%xmm9 pxor %xmm0,%xmm1 movdqa %xmm9,%xmm14 paddd %xmm9,%xmm9 @@ -1705,7 +1705,7 @@ L$xts_enc_grandloop: .byte 102,15,56,220,248 movups 64(%r11),%xmm0 movdqa %xmm8,80(%rsp) - pshufd $95,%xmm15,%xmm9 + pshufd $0x5f,%xmm15,%xmm9 jmp L$xts_enc_loop6 .p2align 5 L$xts_enc_loop6: @@ -1844,13 +1844,13 @@ L$xts_enc_short: jz L$xts_enc_done pxor %xmm0,%xmm11 - cmpq $32,%rdx + cmpq $0x20,%rdx jb L$xts_enc_one pxor %xmm0,%xmm12 je L$xts_enc_two pxor %xmm0,%xmm13 - cmpq $64,%rdx + cmpq $0x40,%rdx jb L$xts_enc_three pxor %xmm0,%xmm14 je L$xts_enc_four @@ -2078,7 +2078,7 @@ L$oop_enc1_11: movdqa L$xts_magic(%rip),%xmm8 movdqa %xmm2,%xmm15 - pshufd $95,%xmm2,%xmm9 + pshufd $0x5f,%xmm2,%xmm9 pxor %xmm0,%xmm1 movdqa %xmm9,%xmm14 paddd %xmm9,%xmm9 @@ -2177,7 +2177,7 @@ L$xts_dec_grandloop: .byte 102,15,56,222,248 movups 64(%r11),%xmm0 movdqa %xmm8,80(%rsp) - pshufd $95,%xmm15,%xmm9 + pshufd $0x5f,%xmm15,%xmm9 jmp L$xts_dec_loop6 .p2align 5 L$xts_dec_loop6: @@ -2317,13 +2317,13 @@ L$xts_dec_short: jz L$xts_dec_done pxor %xmm0,%xmm12 - cmpq $32,%rdx + cmpq $0x20,%rdx jb L$xts_dec_one pxor %xmm0,%xmm13 je L$xts_dec_two pxor %xmm0,%xmm14 - cmpq $64,%rdx + cmpq $0x40,%rdx jb L$xts_dec_three je L$xts_dec_four @@ -2354,7 +2354,7 @@ L$xts_dec_short: pcmpgtd %xmm15,%xmm14 movdqu %xmm6,64(%rsi) leaq 80(%rsi),%rsi - pshufd $19,%xmm14,%xmm11 + pshufd $0x13,%xmm14,%xmm11 andq $15,%r9 jz L$xts_dec_ret @@ -2644,7 +2644,7 @@ L$cbc_decrypt_bulk: leaq -8(%rax),%rbp movups (%r8),%xmm10 movl %r10d,%eax - cmpq $80,%rdx + cmpq $0x50,%rdx jbe L$cbc_dec_tail movups (%rcx),%xmm0 @@ -2660,14 +2660,14 @@ L$cbc_decrypt_bulk: movdqu 80(%rdi),%xmm7 movdqa %xmm6,%xmm15 movl _OPENSSL_ia32cap_P+4(%rip),%r9d - cmpq $112,%rdx + cmpq $0x70,%rdx jbe L$cbc_dec_six_or_seven andl $71303168,%r9d - subq $80,%rdx + subq $0x50,%rdx cmpl $4194304,%r9d je L$cbc_dec_loop6_enter - subq $32,%rdx + subq $0x20,%rdx leaq 112(%rcx),%rcx jmp L$cbc_dec_loop8_enter .p2align 4 @@ -2682,7 +2682,7 @@ L$cbc_dec_loop8_enter: movups 16-112(%rcx),%xmm1 pxor %xmm0,%xmm4 xorq %r11,%r11 - cmpq $112,%rdx + cmpq $0x70,%rdx pxor %xmm0,%xmm5 pxor %xmm0,%xmm6 pxor %xmm0,%xmm7 @@ -2867,21 +2867,21 @@ L$cbc_dec_done: movups %xmm8,96(%rsi) leaq 112(%rsi),%rsi - subq $128,%rdx + subq $0x80,%rdx ja L$cbc_dec_loop8 movaps %xmm9,%xmm2 leaq -112(%rcx),%rcx - addq $112,%rdx + addq $0x70,%rdx jle L$cbc_dec_clear_tail_collected movups %xmm9,(%rsi) leaq 16(%rsi),%rsi - cmpq $80,%rdx + cmpq $0x50,%rdx jbe L$cbc_dec_tail movaps %xmm11,%xmm2 L$cbc_dec_six_or_seven: - cmpq $96,%rdx + cmpq $0x60,%rdx ja L$cbc_dec_seven movaps %xmm7,%xmm8 @@ -2974,33 +2974,33 @@ L$cbc_dec_loop6_enter: movl %r10d,%eax movdqu %xmm6,64(%rsi) leaq 80(%rsi),%rsi - subq $96,%rdx + subq $0x60,%rdx ja L$cbc_dec_loop6 movdqa %xmm7,%xmm2 - addq $80,%rdx + addq $0x50,%rdx jle L$cbc_dec_clear_tail_collected movups %xmm7,(%rsi) leaq 16(%rsi),%rsi L$cbc_dec_tail: movups (%rdi),%xmm2 - subq $16,%rdx + subq $0x10,%rdx jbe L$cbc_dec_one movups 16(%rdi),%xmm3 movaps %xmm2,%xmm11 - subq $16,%rdx + subq $0x10,%rdx jbe L$cbc_dec_two movups 32(%rdi),%xmm4 movaps %xmm3,%xmm12 - subq $16,%rdx + subq $0x10,%rdx jbe L$cbc_dec_three movups 48(%rdi),%xmm5 movaps %xmm4,%xmm13 - subq $16,%rdx + subq $0x10,%rdx jbe L$cbc_dec_four movups 64(%rdi),%xmm6 @@ -3025,7 +3025,7 @@ L$cbc_dec_tail: movdqa %xmm6,%xmm2 pxor %xmm6,%xmm6 pxor %xmm7,%xmm7 - subq $16,%rdx + subq $0x10,%rdx jmp L$cbc_dec_tail_collected .p2align 4 @@ -3344,7 +3344,7 @@ L$oop_key192: 
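Most of the aes-x86_64.S, aesni-x86_64.S and bsaes-x86_64.S hunks above and below change only how the perlasm generator prints immediates: decimal became hexadecimal, with identical encodings. A C11 spot-check of the pairs that appear in these hunks (values copied from the diff):

/* Same constants, new notation only. */
_Static_assert(65280 == 0x0000ff00, "byte-lane mask");
_Static_assert(16711680 == 0x00ff0000, "byte-lane mask");
_Static_assert(4278190080u == 0xff000000, "byte-lane mask");
_Static_assert(2155905152u == 0x80808080, "xtime high-bit mask");
_Static_assert(4278124286u == 0xfefefefe, "xtime shift mask");
_Static_assert(454761243 == 0x1b1b1b1b, "AES reduction polynomial, per byte");
_Static_assert(147 == 0x93 && 78 == 0x4e && 19 == 0x13, "pshufd orders");
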
pslldq $4,%xmm0 pxor %xmm3,%xmm0 - pshufd $255,%xmm0,%xmm3 + pshufd $0xff,%xmm0,%xmm3 pxor %xmm1,%xmm3 pslldq $4,%xmm1 pxor %xmm1,%xmm3 @@ -3431,7 +3431,7 @@ L$oop_key256: decl %r10d jz L$done_key256 - pshufd $255,%xmm0,%xmm2 + pshufd $0xff,%xmm0,%xmm2 pxor %xmm3,%xmm3 .byte 102,15,56,221,211 diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/aes/bsaes-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/aes/bsaes-x86_64.S index c2d04776a9..ad802e3d5d 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/aes/bsaes-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/aes/bsaes-x86_64.S @@ -325,45 +325,45 @@ L$enc_sbox: pxor %xmm2,%xmm5 decl %r10d jl L$enc_done - pshufd $147,%xmm15,%xmm7 - pshufd $147,%xmm0,%xmm8 + pshufd $0x93,%xmm15,%xmm7 + pshufd $0x93,%xmm0,%xmm8 pxor %xmm7,%xmm15 - pshufd $147,%xmm3,%xmm9 + pshufd $0x93,%xmm3,%xmm9 pxor %xmm8,%xmm0 - pshufd $147,%xmm5,%xmm10 + pshufd $0x93,%xmm5,%xmm10 pxor %xmm9,%xmm3 - pshufd $147,%xmm2,%xmm11 + pshufd $0x93,%xmm2,%xmm11 pxor %xmm10,%xmm5 - pshufd $147,%xmm6,%xmm12 + pshufd $0x93,%xmm6,%xmm12 pxor %xmm11,%xmm2 - pshufd $147,%xmm1,%xmm13 + pshufd $0x93,%xmm1,%xmm13 pxor %xmm12,%xmm6 - pshufd $147,%xmm4,%xmm14 + pshufd $0x93,%xmm4,%xmm14 pxor %xmm13,%xmm1 pxor %xmm14,%xmm4 pxor %xmm15,%xmm8 pxor %xmm4,%xmm7 pxor %xmm4,%xmm8 - pshufd $78,%xmm15,%xmm15 + pshufd $0x4E,%xmm15,%xmm15 pxor %xmm0,%xmm9 - pshufd $78,%xmm0,%xmm0 + pshufd $0x4E,%xmm0,%xmm0 pxor %xmm2,%xmm12 pxor %xmm7,%xmm15 pxor %xmm6,%xmm13 pxor %xmm8,%xmm0 pxor %xmm5,%xmm11 - pshufd $78,%xmm2,%xmm7 + pshufd $0x4E,%xmm2,%xmm7 pxor %xmm1,%xmm14 - pshufd $78,%xmm6,%xmm8 + pshufd $0x4E,%xmm6,%xmm8 pxor %xmm3,%xmm10 - pshufd $78,%xmm5,%xmm2 + pshufd $0x4E,%xmm5,%xmm2 pxor %xmm4,%xmm10 - pshufd $78,%xmm4,%xmm6 + pshufd $0x4E,%xmm4,%xmm6 pxor %xmm4,%xmm11 - pshufd $78,%xmm1,%xmm5 + pshufd $0x4E,%xmm1,%xmm5 pxor %xmm11,%xmm7 - pshufd $78,%xmm3,%xmm1 + pshufd $0x4E,%xmm3,%xmm1 pxor %xmm12,%xmm8 pxor %xmm10,%xmm2 pxor %xmm14,%xmm6 @@ -797,24 +797,24 @@ L$dec_sbox: decl %r10d jl L$dec_done - pshufd $78,%xmm15,%xmm7 - pshufd $78,%xmm2,%xmm13 + pshufd $0x4E,%xmm15,%xmm7 + pshufd $0x4E,%xmm2,%xmm13 pxor %xmm15,%xmm7 - pshufd $78,%xmm4,%xmm14 + pshufd $0x4E,%xmm4,%xmm14 pxor %xmm2,%xmm13 - pshufd $78,%xmm0,%xmm8 + pshufd $0x4E,%xmm0,%xmm8 pxor %xmm4,%xmm14 - pshufd $78,%xmm5,%xmm9 + pshufd $0x4E,%xmm5,%xmm9 pxor %xmm0,%xmm8 - pshufd $78,%xmm3,%xmm10 + pshufd $0x4E,%xmm3,%xmm10 pxor %xmm5,%xmm9 pxor %xmm13,%xmm15 pxor %xmm13,%xmm0 - pshufd $78,%xmm1,%xmm11 + pshufd $0x4E,%xmm1,%xmm11 pxor %xmm3,%xmm10 pxor %xmm7,%xmm5 pxor %xmm8,%xmm3 - pshufd $78,%xmm6,%xmm12 + pshufd $0x4E,%xmm6,%xmm12 pxor %xmm1,%xmm11 pxor %xmm14,%xmm0 pxor %xmm9,%xmm1 @@ -828,45 +828,45 @@ L$dec_sbox: pxor %xmm14,%xmm1 pxor %xmm14,%xmm6 pxor %xmm12,%xmm4 - pshufd $147,%xmm15,%xmm7 - pshufd $147,%xmm0,%xmm8 + pshufd $0x93,%xmm15,%xmm7 + pshufd $0x93,%xmm0,%xmm8 pxor %xmm7,%xmm15 - pshufd $147,%xmm5,%xmm9 + pshufd $0x93,%xmm5,%xmm9 pxor %xmm8,%xmm0 - pshufd $147,%xmm3,%xmm10 + pshufd $0x93,%xmm3,%xmm10 pxor %xmm9,%xmm5 - pshufd $147,%xmm1,%xmm11 + pshufd $0x93,%xmm1,%xmm11 pxor %xmm10,%xmm3 - pshufd $147,%xmm6,%xmm12 + pshufd $0x93,%xmm6,%xmm12 pxor %xmm11,%xmm1 - pshufd $147,%xmm2,%xmm13 + pshufd $0x93,%xmm2,%xmm13 pxor %xmm12,%xmm6 - pshufd $147,%xmm4,%xmm14 + pshufd $0x93,%xmm4,%xmm14 pxor %xmm13,%xmm2 pxor %xmm14,%xmm4 pxor %xmm15,%xmm8 pxor %xmm4,%xmm7 pxor %xmm4,%xmm8 - pshufd $78,%xmm15,%xmm15 + pshufd $0x4E,%xmm15,%xmm15 pxor %xmm0,%xmm9 - pshufd $78,%xmm0,%xmm0 + pshufd 
$0x4E,%xmm0,%xmm0 pxor %xmm1,%xmm12 pxor %xmm7,%xmm15 pxor %xmm6,%xmm13 pxor %xmm8,%xmm0 pxor %xmm3,%xmm11 - pshufd $78,%xmm1,%xmm7 + pshufd $0x4E,%xmm1,%xmm7 pxor %xmm2,%xmm14 - pshufd $78,%xmm6,%xmm8 + pshufd $0x4E,%xmm6,%xmm8 pxor %xmm5,%xmm10 - pshufd $78,%xmm3,%xmm1 + pshufd $0x4E,%xmm3,%xmm1 pxor %xmm4,%xmm10 - pshufd $78,%xmm4,%xmm6 + pshufd $0x4E,%xmm4,%xmm6 pxor %xmm4,%xmm11 - pshufd $78,%xmm2,%xmm3 + pshufd $0x4E,%xmm2,%xmm3 pxor %xmm11,%xmm7 - pshufd $78,%xmm5,%xmm2 + pshufd $0x4E,%xmm5,%xmm2 pxor %xmm12,%xmm8 pxor %xmm1,%xmm10 pxor %xmm14,%xmm6 @@ -1556,20 +1556,20 @@ L$xts_enc_prologue: movdqa %xmm7,(%rax) andq $-16,%r14 - subq $128,%rsp + subq $0x80,%rsp movdqa 32(%rbp),%xmm6 pxor %xmm14,%xmm14 movdqa L$xts_magic(%rip),%xmm12 pcmpgtd %xmm6,%xmm14 - subq $128,%r14 + subq $0x80,%r14 jc L$xts_enc_short jmp L$xts_enc_loop .p2align 4 L$xts_enc_loop: - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm15 movdqa %xmm6,0(%rsp) @@ -1577,7 +1577,7 @@ L$xts_enc_loop: pand %xmm12,%xmm13 pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm0 movdqa %xmm6,16(%rsp) @@ -1586,7 +1586,7 @@ L$xts_enc_loop: pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 movdqu 0(%r12),%xmm7 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm1 movdqa %xmm6,32(%rsp) @@ -1596,7 +1596,7 @@ L$xts_enc_loop: pxor %xmm13,%xmm6 movdqu 16(%r12),%xmm8 pxor %xmm7,%xmm15 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm2 movdqa %xmm6,48(%rsp) @@ -1606,7 +1606,7 @@ L$xts_enc_loop: pxor %xmm13,%xmm6 movdqu 32(%r12),%xmm9 pxor %xmm8,%xmm0 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm3 movdqa %xmm6,64(%rsp) @@ -1616,7 +1616,7 @@ L$xts_enc_loop: pxor %xmm13,%xmm6 movdqu 48(%r12),%xmm10 pxor %xmm9,%xmm1 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm4 movdqa %xmm6,80(%rsp) @@ -1626,7 +1626,7 @@ L$xts_enc_loop: pxor %xmm13,%xmm6 movdqu 64(%r12),%xmm11 pxor %xmm10,%xmm2 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm5 movdqa %xmm6,96(%rsp) @@ -1670,20 +1670,20 @@ L$xts_enc_loop: pxor %xmm14,%xmm14 movdqa L$xts_magic(%rip),%xmm12 pcmpgtd %xmm6,%xmm14 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 paddq %xmm6,%xmm6 pand %xmm12,%xmm13 pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 - subq $128,%r14 + subq $0x80,%r14 jnc L$xts_enc_loop L$xts_enc_short: - addq $128,%r14 + addq $0x80,%r14 jz L$xts_enc_done - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm15 movdqa %xmm6,0(%rsp) @@ -1691,7 +1691,7 @@ L$xts_enc_short: pand %xmm12,%xmm13 pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm0 movdqa %xmm6,16(%rsp) @@ -1702,7 +1702,7 @@ L$xts_enc_short: movdqu 0(%r12),%xmm7 cmpq $16,%r14 je L$xts_enc_1 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm1 movdqa %xmm6,32(%rsp) @@ -1714,7 +1714,7 @@ L$xts_enc_short: cmpq $32,%r14 je L$xts_enc_2 pxor %xmm7,%xmm15 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm2 movdqa %xmm6,48(%rsp) @@ -1726,7 +1726,7 @@ L$xts_enc_short: cmpq $48,%r14 je L$xts_enc_3 pxor %xmm8,%xmm0 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa 
%xmm6,%xmm3 movdqa %xmm6,64(%rsp) @@ -1738,7 +1738,7 @@ L$xts_enc_short: cmpq $64,%r14 je L$xts_enc_4 pxor %xmm9,%xmm1 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm4 movdqa %xmm6,80(%rsp) @@ -1750,7 +1750,7 @@ L$xts_enc_short: cmpq $80,%r14 je L$xts_enc_5 pxor %xmm10,%xmm2 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm5 movdqa %xmm6,96(%rsp) @@ -2016,20 +2016,20 @@ L$xts_dec_prologue: shlq $4,%rax subq %rax,%r14 - subq $128,%rsp + subq $0x80,%rsp movdqa 32(%rbp),%xmm6 pxor %xmm14,%xmm14 movdqa L$xts_magic(%rip),%xmm12 pcmpgtd %xmm6,%xmm14 - subq $128,%r14 + subq $0x80,%r14 jc L$xts_dec_short jmp L$xts_dec_loop .p2align 4 L$xts_dec_loop: - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm15 movdqa %xmm6,0(%rsp) @@ -2037,7 +2037,7 @@ L$xts_dec_loop: pand %xmm12,%xmm13 pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm0 movdqa %xmm6,16(%rsp) @@ -2046,7 +2046,7 @@ L$xts_dec_loop: pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 movdqu 0(%r12),%xmm7 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm1 movdqa %xmm6,32(%rsp) @@ -2056,7 +2056,7 @@ L$xts_dec_loop: pxor %xmm13,%xmm6 movdqu 16(%r12),%xmm8 pxor %xmm7,%xmm15 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm2 movdqa %xmm6,48(%rsp) @@ -2066,7 +2066,7 @@ L$xts_dec_loop: pxor %xmm13,%xmm6 movdqu 32(%r12),%xmm9 pxor %xmm8,%xmm0 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm3 movdqa %xmm6,64(%rsp) @@ -2076,7 +2076,7 @@ L$xts_dec_loop: pxor %xmm13,%xmm6 movdqu 48(%r12),%xmm10 pxor %xmm9,%xmm1 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm4 movdqa %xmm6,80(%rsp) @@ -2086,7 +2086,7 @@ L$xts_dec_loop: pxor %xmm13,%xmm6 movdqu 64(%r12),%xmm11 pxor %xmm10,%xmm2 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm5 movdqa %xmm6,96(%rsp) @@ -2130,20 +2130,20 @@ L$xts_dec_loop: pxor %xmm14,%xmm14 movdqa L$xts_magic(%rip),%xmm12 pcmpgtd %xmm6,%xmm14 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 paddq %xmm6,%xmm6 pand %xmm12,%xmm13 pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 - subq $128,%r14 + subq $0x80,%r14 jnc L$xts_dec_loop L$xts_dec_short: - addq $128,%r14 + addq $0x80,%r14 jz L$xts_dec_done - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm15 movdqa %xmm6,0(%rsp) @@ -2151,7 +2151,7 @@ L$xts_dec_short: pand %xmm12,%xmm13 pcmpgtd %xmm6,%xmm14 pxor %xmm13,%xmm6 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm0 movdqa %xmm6,16(%rsp) @@ -2162,7 +2162,7 @@ L$xts_dec_short: movdqu 0(%r12),%xmm7 cmpq $16,%r14 je L$xts_dec_1 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm1 movdqa %xmm6,32(%rsp) @@ -2174,7 +2174,7 @@ L$xts_dec_short: cmpq $32,%r14 je L$xts_dec_2 pxor %xmm7,%xmm15 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm2 movdqa %xmm6,48(%rsp) @@ -2186,7 +2186,7 @@ L$xts_dec_short: cmpq $48,%r14 je L$xts_dec_3 pxor %xmm8,%xmm0 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm3 movdqa %xmm6,64(%rsp) @@ -2198,7 +2198,7 @@ L$xts_dec_short: cmpq $64,%r14 je L$xts_dec_4 pxor %xmm9,%xmm1 - pshufd 
$19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm4 movdqa %xmm6,80(%rsp) @@ -2210,7 +2210,7 @@ L$xts_dec_short: cmpq $80,%r14 je L$xts_dec_5 pxor %xmm10,%xmm2 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 pxor %xmm14,%xmm14 movdqa %xmm6,%xmm5 movdqa %xmm6,96(%rsp) @@ -2387,7 +2387,7 @@ L$xts_dec_done: pxor %xmm14,%xmm14 movdqa L$xts_magic(%rip),%xmm12 pcmpgtd %xmm6,%xmm14 - pshufd $19,%xmm14,%xmm13 + pshufd $0x13,%xmm14,%xmm13 movdqa %xmm6,%xmm5 paddq %xmm6,%xmm6 pand %xmm12,%xmm13 diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/aes/vpaes-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/aes/vpaes-x86_64.S index 711ea43726..997cde807a 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/aes/vpaes-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/aes/vpaes-x86_64.S @@ -61,7 +61,7 @@ L$enc_loop: addq $16,%r11 pxor %xmm0,%xmm3 .byte 102,15,56,0,193 - andq $48,%r11 + andq $0x30,%r11 subq $1,%rax pxor %xmm3,%xmm0 @@ -121,10 +121,10 @@ _vpaes_decrypt_core: pand %xmm9,%xmm0 .byte 102,15,56,0,208 movdqa L$k_dipt+16(%rip),%xmm0 - xorq $48,%r11 + xorq $0x30,%r11 leaq L$k_dsbd(%rip),%r10 .byte 102,15,56,0,193 - andq $48,%r11 + andq $0x30,%r11 pxor %xmm5,%xmm2 movdqa L$k_mc_forward+48(%rip),%xmm5 pxor %xmm2,%xmm0 @@ -243,7 +243,7 @@ L$schedule_am_decrypting: movdqa (%r8,%r10,1),%xmm1 .byte 102,15,56,0,217 movdqu %xmm3,(%rdx) - xorq $48,%r8 + xorq $0x30,%r8 L$schedule_go: cmpl $192,%esi @@ -333,7 +333,7 @@ L$oop_schedule_256: call _vpaes_schedule_mangle - pshufd $255,%xmm0,%xmm0 + pshufd $0xFF,%xmm0,%xmm0 movdqa %xmm7,%xmm5 movdqa %xmm6,%xmm7 call _vpaes_schedule_low_round @@ -400,8 +400,8 @@ L$schedule_mangle_last_dec: .p2align 4 _vpaes_schedule_192_smear: - pshufd $128,%xmm6,%xmm1 - pshufd $254,%xmm7,%xmm0 + pshufd $0x80,%xmm6,%xmm1 + pshufd $0xFE,%xmm7,%xmm0 pxor %xmm1,%xmm6 pxor %xmm1,%xmm1 pxor %xmm0,%xmm6 @@ -438,7 +438,7 @@ _vpaes_schedule_round: pxor %xmm1,%xmm7 - pshufd $255,%xmm0,%xmm0 + pshufd $0xFF,%xmm0,%xmm0 .byte 102,15,58,15,192,1 @@ -597,7 +597,7 @@ L$schedule_mangle_both: movdqa (%r8,%r10,1),%xmm1 .byte 102,15,56,0,217 addq $-16,%r8 - andq $48,%r8 + andq $0x30,%r8 movdqu %xmm3,(%rdx) .byte 0xf3,0xc3 @@ -616,7 +616,7 @@ _vpaes_set_encrypt_key: movl %eax,240(%rdx) movl $0,%ecx - movl $48,%r8d + movl $0x30,%r8d call _vpaes_schedule_core xorl %eax,%eax .byte 0xf3,0xc3 diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/bn/rsaz-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/bn/rsaz-x86_64.S index 5e9e82feb0..337276f9cb 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/bn/rsaz-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/bn/rsaz-x86_64.S @@ -465,48 +465,94 @@ _rsaz_512_mul_gather4: pushq %r14 pushq %r15 - movl %r9d,%r9d - subq $128+24,%rsp + subq $152,%rsp L$mul_gather4_body: - movl 64(%rdx,%r9,4),%eax -.byte 102,72,15,110,199 - movl (%rdx,%r9,4),%ebx -.byte 102,72,15,110,201 - movq %r8,128(%rsp) + movd %r9d,%xmm8 + movdqa L$inc+16(%rip),%xmm1 + movdqa L$inc(%rip),%xmm0 + + pshufd $0,%xmm8,%xmm8 + movdqa %xmm1,%xmm7 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm8,%xmm0 + movdqa %xmm7,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm8,%xmm1 + movdqa %xmm7,%xmm4 + paddd %xmm2,%xmm3 + pcmpeqd %xmm8,%xmm2 + movdqa %xmm7,%xmm5 + paddd %xmm3,%xmm4 + pcmpeqd %xmm8,%xmm3 + movdqa %xmm7,%xmm6 + paddd %xmm4,%xmm5 + pcmpeqd %xmm8,%xmm4 + paddd %xmm5,%xmm6 + pcmpeqd %xmm8,%xmm5 + paddd %xmm6,%xmm7 + pcmpeqd %xmm8,%xmm6 + pcmpeqd %xmm8,%xmm7 + + movdqa 
0(%rdx),%xmm8 + movdqa 16(%rdx),%xmm9 + movdqa 32(%rdx),%xmm10 + movdqa 48(%rdx),%xmm11 + pand %xmm0,%xmm8 + movdqa 64(%rdx),%xmm12 + pand %xmm1,%xmm9 + movdqa 80(%rdx),%xmm13 + pand %xmm2,%xmm10 + movdqa 96(%rdx),%xmm14 + pand %xmm3,%xmm11 + movdqa 112(%rdx),%xmm15 + leaq 128(%rdx),%rbp + pand %xmm4,%xmm12 + pand %xmm5,%xmm13 + pand %xmm6,%xmm14 + pand %xmm7,%xmm15 + por %xmm10,%xmm8 + por %xmm11,%xmm9 + por %xmm12,%xmm8 + por %xmm13,%xmm9 + por %xmm14,%xmm8 + por %xmm15,%xmm9 + + por %xmm9,%xmm8 + pshufd $0x4e,%xmm8,%xmm9 + por %xmm9,%xmm8 +.byte 102,76,15,126,195 + + movq %r8,128(%rsp) + movq %rdi,128+8(%rsp) + movq %rcx,128+16(%rsp) - shlq $32,%rax - orq %rax,%rbx movq (%rsi),%rax movq 8(%rsi),%rcx - leaq 128(%rdx,%r9,4),%rbp mulq %rbx movq %rax,(%rsp) movq %rcx,%rax movq %rdx,%r8 mulq %rbx - movd (%rbp),%xmm4 addq %rax,%r8 movq 16(%rsi),%rax movq %rdx,%r9 adcq $0,%r9 mulq %rbx - movd 64(%rbp),%xmm5 addq %rax,%r9 movq 24(%rsi),%rax movq %rdx,%r10 adcq $0,%r10 mulq %rbx - pslldq $4,%xmm5 addq %rax,%r10 movq 32(%rsi),%rax movq %rdx,%r11 adcq $0,%r11 mulq %rbx - por %xmm5,%xmm4 addq %rax,%r11 movq 40(%rsi),%rax movq %rdx,%r12 @@ -519,14 +565,12 @@ L$mul_gather4_body: adcq $0,%r13 mulq %rbx - leaq 128(%rbp),%rbp addq %rax,%r13 movq 56(%rsi),%rax movq %rdx,%r14 adcq $0,%r14 mulq %rbx -.byte 102,72,15,126,227 addq %rax,%r14 movq (%rsi),%rax movq %rdx,%r15 @@ -538,6 +582,35 @@ L$mul_gather4_body: .p2align 5 L$oop_mul_gather: + movdqa 0(%rbp),%xmm8 + movdqa 16(%rbp),%xmm9 + movdqa 32(%rbp),%xmm10 + movdqa 48(%rbp),%xmm11 + pand %xmm0,%xmm8 + movdqa 64(%rbp),%xmm12 + pand %xmm1,%xmm9 + movdqa 80(%rbp),%xmm13 + pand %xmm2,%xmm10 + movdqa 96(%rbp),%xmm14 + pand %xmm3,%xmm11 + movdqa 112(%rbp),%xmm15 + leaq 128(%rbp),%rbp + pand %xmm4,%xmm12 + pand %xmm5,%xmm13 + pand %xmm6,%xmm14 + pand %xmm7,%xmm15 + por %xmm10,%xmm8 + por %xmm11,%xmm9 + por %xmm12,%xmm8 + por %xmm13,%xmm9 + por %xmm14,%xmm8 + por %xmm15,%xmm9 + + por %xmm9,%xmm8 + pshufd $0x4e,%xmm8,%xmm9 + por %xmm9,%xmm8 +.byte 102,76,15,126,195 + mulq %rbx addq %rax,%r8 movq 8(%rsi),%rax @@ -546,7 +619,6 @@ L$oop_mul_gather: adcq $0,%r8 mulq %rbx - movd (%rbp),%xmm4 addq %rax,%r9 movq 16(%rsi),%rax adcq $0,%rdx @@ -555,7 +627,6 @@ L$oop_mul_gather: adcq $0,%r9 mulq %rbx - movd 64(%rbp),%xmm5 addq %rax,%r10 movq 24(%rsi),%rax adcq $0,%rdx @@ -564,7 +635,6 @@ L$oop_mul_gather: adcq $0,%r10 mulq %rbx - pslldq $4,%xmm5 addq %rax,%r11 movq 32(%rsi),%rax adcq $0,%rdx @@ -573,7 +643,6 @@ L$oop_mul_gather: adcq $0,%r11 mulq %rbx - por %xmm5,%xmm4 addq %rax,%r12 movq 40(%rsi),%rax adcq $0,%rdx @@ -598,7 +667,6 @@ L$oop_mul_gather: adcq $0,%r14 mulq %rbx -.byte 102,72,15,126,227 addq %rax,%r15 movq (%rsi),%rax adcq $0,%rdx @@ -606,7 +674,6 @@ L$oop_mul_gather: movq %rdx,%r15 adcq $0,%r15 - leaq 128(%rbp),%rbp leaq 8(%rdi),%rdi decl %ecx @@ -621,8 +688,8 @@ L$oop_mul_gather: movq %r14,48(%rdi) movq %r15,56(%rdi) -.byte 102,72,15,126,199 -.byte 102,72,15,126,205 + movq 128+8(%rsp),%rdi + movq 128+16(%rsp),%rbp movq (%rsp),%r8 movq 8(%rsp),%r9 @@ -672,7 +739,7 @@ _rsaz_512_mul_scatter4: movl %r9d,%r9d subq $128+24,%rsp L$mul_scatter4_body: - leaq (%r8,%r9,4),%r8 + leaq (%r8,%r9,8),%r8 .byte 102,72,15,110,199 .byte 102,72,15,110,202 .byte 102,73,15,110,208 @@ -708,30 +775,14 @@ L$mul_scatter4_body: call __rsaz_512_subtract - movl %r8d,0(%rsi) - shrq $32,%r8 - movl %r9d,128(%rsi) - shrq $32,%r9 - movl %r10d,256(%rsi) - shrq $32,%r10 - movl %r11d,384(%rsi) - shrq $32,%r11 - movl %r12d,512(%rsi) - shrq $32,%r12 - movl %r13d,640(%rsi) - shrq $32,%r13 - movl 
%r14d,768(%rsi) - shrq $32,%r14 - movl %r15d,896(%rsi) - shrq $32,%r15 - movl %r8d,64(%rsi) - movl %r9d,192(%rsi) - movl %r10d,320(%rsi) - movl %r11d,448(%rsi) - movl %r12d,576(%rsi) - movl %r13d,704(%rsi) - movl %r14d,832(%rsi) - movl %r15d,960(%rsi) + movq %r8,0(%rsi) + movq %r9,128(%rsi) + movq %r10,256(%rsi) + movq %r11,384(%rsi) + movq %r12,512(%rsi) + movq %r13,640(%rsi) + movq %r14,768(%rsi) + movq %r15,896(%rsi) leaq 128+24+48(%rsp),%rax movq -48(%rax),%r15 @@ -1086,16 +1137,14 @@ L$oop_mul: .p2align 4 _rsaz_512_scatter4: - leaq (%rdi,%rdx,4),%rdi + leaq (%rdi,%rdx,8),%rdi movl $8,%r9d jmp L$oop_scatter .p2align 4 L$oop_scatter: movq (%rsi),%rax leaq 8(%rsi),%rsi - movl %eax,(%rdi) - shrq $32,%rax - movl %eax,64(%rdi) + movq %rax,(%rdi) leaq 128(%rdi),%rdi decl %r9d jnz L$oop_scatter @@ -1107,20 +1156,73 @@ L$oop_scatter: .p2align 4 _rsaz_512_gather4: - leaq (%rsi,%rdx,4),%rsi + movd %edx,%xmm8 + movdqa L$inc+16(%rip),%xmm1 + movdqa L$inc(%rip),%xmm0 + + pshufd $0,%xmm8,%xmm8 + movdqa %xmm1,%xmm7 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm8,%xmm0 + movdqa %xmm7,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm8,%xmm1 + movdqa %xmm7,%xmm4 + paddd %xmm2,%xmm3 + pcmpeqd %xmm8,%xmm2 + movdqa %xmm7,%xmm5 + paddd %xmm3,%xmm4 + pcmpeqd %xmm8,%xmm3 + movdqa %xmm7,%xmm6 + paddd %xmm4,%xmm5 + pcmpeqd %xmm8,%xmm4 + paddd %xmm5,%xmm6 + pcmpeqd %xmm8,%xmm5 + paddd %xmm6,%xmm7 + pcmpeqd %xmm8,%xmm6 + pcmpeqd %xmm8,%xmm7 movl $8,%r9d jmp L$oop_gather .p2align 4 L$oop_gather: - movl (%rsi),%eax - movl 64(%rsi),%r8d + movdqa 0(%rsi),%xmm8 + movdqa 16(%rsi),%xmm9 + movdqa 32(%rsi),%xmm10 + movdqa 48(%rsi),%xmm11 + pand %xmm0,%xmm8 + movdqa 64(%rsi),%xmm12 + pand %xmm1,%xmm9 + movdqa 80(%rsi),%xmm13 + pand %xmm2,%xmm10 + movdqa 96(%rsi),%xmm14 + pand %xmm3,%xmm11 + movdqa 112(%rsi),%xmm15 leaq 128(%rsi),%rsi - shlq $32,%r8 - orq %r8,%rax - movq %rax,(%rdi) + pand %xmm4,%xmm12 + pand %xmm5,%xmm13 + pand %xmm6,%xmm14 + pand %xmm7,%xmm15 + por %xmm10,%xmm8 + por %xmm11,%xmm9 + por %xmm12,%xmm8 + por %xmm13,%xmm9 + por %xmm14,%xmm8 + por %xmm15,%xmm9 + + por %xmm9,%xmm8 + pshufd $0x4e,%xmm8,%xmm9 + por %xmm9,%xmm8 + movq %xmm8,(%rdi) leaq 8(%rdi),%rdi decl %r9d jnz L$oop_gather .byte 0xf3,0xc3 +L$SEH_end_rsaz_512_gather4: + +.p2align 6 +L$inc: +.long 0,0, 1,1 +.long 2,2, 2,2 #endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/bn/x86_64-mont.S b/packager/third_party/boringssl/mac-x86_64/crypto/bn/x86_64-mont.S index 6b9bc05bf4..51e5d19931 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/bn/x86_64-mont.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/bn/x86_64-mont.S @@ -634,20 +634,20 @@ L$sqr8x_enter: - leaq -64(%rsp,%r9,4),%r11 + leaq -64(%rsp,%r9,2),%r11 movq (%r8),%r8 subq %rsi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb L$sqr8x_sp_alt subq %r11,%rsp - leaq -64(%rsp,%r9,4),%rsp + leaq -64(%rsp,%r9,2),%rsp jmp L$sqr8x_sp_done .p2align 5 L$sqr8x_sp_alt: - leaq 4096-64(,%r9,4),%r10 - leaq -64(%rsp,%r9,4),%rsp + leaq 4096-64(,%r9,2),%r10 + leaq -64(%rsp,%r9,2),%rsp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 @@ -657,58 +657,80 @@ L$sqr8x_sp_done: movq %r9,%r10 negq %r9 - leaq 64(%rsp,%r9,2),%r11 movq %r8,32(%rsp) movq %rax,40(%rsp) L$sqr8x_body: - movq %r9,%rbp -.byte 102,73,15,110,211 - shrq $3+2,%rbp - movl _OPENSSL_ia32cap_P+8(%rip),%eax - jmp L$sqr8x_copy_n - -.p2align 5 -L$sqr8x_copy_n: - movq 0(%rcx),%xmm0 - movq 8(%rcx),%xmm1 - movq 16(%rcx),%xmm3 - movq 24(%rcx),%xmm4 - leaq 32(%rcx),%rcx - movdqa %xmm0,0(%r11) - movdqa %xmm1,16(%r11) - movdqa %xmm3,32(%r11) - 
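The rsaz-512 scatter/gather rewrite above also changes the window-table layout: instead of splitting each 64-bit limb into two 32-bit halves stored 64 bytes apart (stride 4 in the lea), an entry now stores whole 64-bit words 128 bytes apart (stride 8), which lets the masked gather read aligned 16-byte rows. A C sketch of the new scatter, assuming the 16-entry, 8-limb table implied by the code (function name hypothetical):

#include <stdint.h>
#include <stddef.h>

/* Window entry k stores limb j at word index k + 16*j, i.e. consecutive
   limbs of one entry sit 128 bytes apart, as in rsaz_512_scatter4. */
static void scatter4(uint64_t *tbl, const uint64_t a[8], size_t k) {
    for (size_t j = 0; j < 8; j++)
        tbl[k + 16 * j] = a[j];
}
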
movdqa %xmm4,48(%r11) - leaq 64(%r11),%r11 - decq %rbp - jnz L$sqr8x_copy_n - +.byte 102,72,15,110,209 pxor %xmm0,%xmm0 .byte 102,72,15,110,207 .byte 102,73,15,110,218 call _bn_sqr8x_internal - pxor %xmm0,%xmm0 - leaq 48(%rsp),%rax - leaq 64(%rsp,%r9,2),%rdx - shrq $3+2,%r9 - movq 40(%rsp),%rsi - jmp L$sqr8x_zero + + + + leaq (%rdi,%r9,1),%rbx + movq %r9,%rcx + movq %r9,%rdx +.byte 102,72,15,126,207 + sarq $3+2,%rcx + jmp L$sqr8x_sub .p2align 5 -L$sqr8x_zero: - movdqa %xmm0,0(%rax) - movdqa %xmm0,16(%rax) - movdqa %xmm0,32(%rax) - movdqa %xmm0,48(%rax) - leaq 64(%rax),%rax - movdqa %xmm0,0(%rdx) - movdqa %xmm0,16(%rdx) - movdqa %xmm0,32(%rdx) - movdqa %xmm0,48(%rdx) - leaq 64(%rdx),%rdx - decq %r9 - jnz L$sqr8x_zero +L$sqr8x_sub: + movq 0(%rbx),%r12 + movq 8(%rbx),%r13 + movq 16(%rbx),%r14 + movq 24(%rbx),%r15 + leaq 32(%rbx),%rbx + sbbq 0(%rbp),%r12 + sbbq 8(%rbp),%r13 + sbbq 16(%rbp),%r14 + sbbq 24(%rbp),%r15 + leaq 32(%rbp),%rbp + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r14,16(%rdi) + movq %r15,24(%rdi) + leaq 32(%rdi),%rdi + incq %rcx + jnz L$sqr8x_sub + + sbbq $0,%rax + leaq (%rbx,%r9,1),%rbx + leaq (%rdi,%r9,1),%rdi + +.byte 102,72,15,110,200 + pxor %xmm0,%xmm0 + pshufd $0,%xmm1,%xmm1 + movq 40(%rsp),%rsi + jmp L$sqr8x_cond_copy + +.p2align 5 +L$sqr8x_cond_copy: + movdqa 0(%rbx),%xmm2 + movdqa 16(%rbx),%xmm3 + leaq 32(%rbx),%rbx + movdqu 0(%rdi),%xmm4 + movdqu 16(%rdi),%xmm5 + leaq 32(%rdi),%rdi + movdqa %xmm0,-32(%rbx) + movdqa %xmm0,-16(%rbx) + movdqa %xmm0,-32(%rbx,%rdx,1) + movdqa %xmm0,-16(%rbx,%rdx,1) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-32(%rdi) + movdqu %xmm5,-16(%rdi) + addq $32,%r9 + jnz L$sqr8x_cond_copy movq $1,%rax movq -48(%rsi),%r15 diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/bn/x86_64-mont5.S b/packager/third_party/boringssl/mac-x86_64/crypto/bn/x86_64-mont5.S index 2e8f469c14..f3ad8d783f 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/bn/x86_64-mont5.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/bn/x86_64-mont5.S @@ -16,46 +16,151 @@ _bn_mul_mont_gather5: L$mul_enter: movl %r9d,%r9d movq %rsp,%rax - movl 8(%rsp),%r10d + movd 8(%rsp),%xmm5 + leaq L$inc(%rip),%r10 pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 + leaq 2(%r9),%r11 negq %r11 - leaq (%rsp,%r11,8),%rsp + leaq -264(%rsp,%r11,8),%rsp andq $-1024,%rsp movq %rax,8(%rsp,%r9,8) L$mul_body: - movq %rdx,%r12 - movq %r10,%r11 - shrq $3,%r10 - andq $7,%r11 - notq %r10 - leaq L$magic_masks(%rip),%rax - andq $3,%r10 - leaq 96(%r12,%r11,8),%r12 - movq 0(%rax,%r10,8),%xmm4 - movq 8(%rax,%r10,8),%xmm5 - movq 16(%rax,%r10,8),%xmm6 - movq 24(%rax,%r10,8),%xmm7 + leaq 128(%rdx),%r12 + movdqa 0(%r10),%xmm0 + movdqa 16(%r10),%xmm1 + leaq 24-112(%rsp,%r9,8),%r10 + andq $-16,%r10 - movq -96(%r12),%xmm0 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - movq 96(%r12),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 +.byte 0x67 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + 
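The long pcmpeqd/movdqa block being added here builds a table of comparison masks from the broadcast window index, and the pand/por sweep that follows touches every window entry while keeping only the selected one. That is the point of this patch's gather rewrites (here, in bn_gather5 and in rsaz-x86_64): load addresses no longer depend on the secret exponent window, removing the cache-timing signal the old L$magic_masks code exposed. A scalar C sketch of the same idea, matching the 16-entry/8-limb layout of the scatter sketch above (helper names hypothetical):

#include <stdint.h>
#include <stddef.h>

/* All-ones when a == b, all-zeros otherwise, with no data-dependent branch. */
static uint64_t ct_eq_mask(uint64_t a, uint64_t b) {
    uint64_t d = a ^ b;
    return ((d | (0 - d)) >> 63) - 1;
}

/* Read every table entry; a compare-derived mask keeps only entry idx,
   so the access pattern is independent of idx (cf. pcmpeqd + pand + por). */
static void ct_gather(uint64_t out[8], const uint64_t tbl[128], uint64_t idx) {
    for (size_t j = 0; j < 8; j++) out[j] = 0;
    for (uint64_t k = 0; k < 16; k++) {
        uint64_t m = ct_eq_mask(k, idx);
        for (size_t j = 0; j < 8; j++)
            out[j] |= tbl[k + 16 * j] & m;
    }
}
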
movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,256(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 +.byte 0x67 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,320(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + pand 64(%r12),%xmm0 + + pand 80(%r12),%xmm1 + pand 96(%r12),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%r12),%xmm3 por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -128(%r12),%xmm4 + movdqa -112(%r12),%xmm5 + movdqa -96(%r12),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%r12),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%r12),%xmm4 + movdqa -48(%r12),%xmm5 + movdqa -32(%r12),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%r12),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa 0(%r12),%xmm4 + movdqa 16(%r12),%xmm5 + movdqa 32(%r12),%xmm2 + pand 240(%r10),%xmm4 + movdqa 48(%r12),%xmm3 + pand 256(%r10),%xmm5 + por %xmm4,%xmm0 + pand 272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + por %xmm1,%xmm0 + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 leaq 256(%r12),%r12 - por %xmm3,%xmm0 - .byte 102,72,15,126,195 movq (%r8),%r8 @@ -64,29 +169,14 @@ L$mul_body: xorq %r14,%r14 xorq %r15,%r15 - movq -96(%r12),%xmm0 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - movq %r8,%rbp mulq %rbx movq %rax,%r10 movq (%rcx),%rax - movq 96(%r12),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - imulq %r10,%rbp movq %rdx,%r11 - por %xmm2,%xmm0 - leaq 256(%r12),%r12 - por %xmm3,%xmm0 - mulq %rbp addq %rax,%r10 movq 8(%rsi),%rax @@ -119,14 +209,12 @@ L$1st_enter: cmpq %r9,%r15 jne L$1st -.byte 102,72,15,126,195 addq %rax,%r13 - movq (%rsi),%rax adcq $0,%rdx addq %r11,%r13 adcq $0,%rdx - movq %r13,-16(%rsp,%r15,8) + movq %r13,-16(%rsp,%r9,8) movq %rdx,%r13 movq %r10,%r11 @@ -140,33 +228,78 @@ L$1st_enter: jmp L$outer .p2align 4 L$outer: + leaq 24+128(%rsp,%r9,8),%rdx + andq $-16,%rdx + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r12),%xmm0 + movdqa -112(%r12),%xmm1 + movdqa -96(%r12),%xmm2 + movdqa -80(%r12),%xmm3 + pand -128(%rdx),%xmm0 + pand -112(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -80(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r12),%xmm0 + movdqa -48(%r12),%xmm1 + movdqa -32(%r12),%xmm2 + movdqa -16(%r12),%xmm3 + pand -64(%rdx),%xmm0 + pand -48(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -16(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r12),%xmm0 + movdqa 16(%r12),%xmm1 + movdqa 32(%r12),%xmm2 + movdqa 48(%r12),%xmm3 + pand 0(%rdx),%xmm0 + pand 16(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 
32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 48(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r12),%xmm0 + movdqa 80(%r12),%xmm1 + movdqa 96(%r12),%xmm2 + movdqa 112(%r12),%xmm3 + pand 64(%rdx),%xmm0 + pand 80(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 112(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%r12),%r12 + + movq (%rsi),%rax +.byte 102,72,15,126,195 + xorq %r15,%r15 movq %r8,%rbp movq (%rsp),%r10 - movq -96(%r12),%xmm0 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - mulq %rbx addq %rax,%r10 movq (%rcx),%rax adcq $0,%rdx - movq 96(%r12),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - imulq %r10,%rbp movq %rdx,%r11 - por %xmm2,%xmm0 - leaq 256(%r12),%r12 - por %xmm3,%xmm0 - mulq %rbp addq %rax,%r10 movq 8(%rsi),%rax @@ -202,15 +335,12 @@ L$inner_enter: cmpq %r9,%r15 jne L$inner -.byte 102,72,15,126,195 - addq %rax,%r13 - movq (%rsi),%rax adcq $0,%rdx addq %r10,%r13 - movq (%rsp,%r15,8),%r10 + movq (%rsp,%r9,8),%r10 adcq $0,%rdx - movq %r13,-16(%rsp,%r15,8) + movq %r13,-16(%rsp,%r9,8) movq %rdx,%r13 xorq %rdx,%rdx @@ -256,6 +386,7 @@ L$copy: movq 8(%rsp,%r9,8),%rsi movq $1,%rax + movq -48(%rsi),%r15 movq -40(%rsi),%r14 movq -32(%rsi),%r13 @@ -278,10 +409,10 @@ L$mul4x_enter: pushq %r13 pushq %r14 pushq %r15 + .byte 0x67 - movl %r9d,%r10d shll $3,%r9d - shll $3+2,%r10d + leaq (%r9,%r9,2),%r10 negq %r9 @@ -291,19 +422,21 @@ L$mul4x_enter: - leaq -64(%rsp,%r9,2),%r11 - subq %rsi,%r11 + + + leaq -320(%rsp,%r9,2),%r11 + subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb L$mul4xsp_alt subq %r11,%rsp - leaq -64(%rsp,%r9,2),%rsp + leaq -320(%rsp,%r9,2),%rsp jmp L$mul4xsp_done .p2align 5 L$mul4xsp_alt: - leaq 4096-64(,%r9,2),%r10 - leaq -64(%rsp,%r9,2),%rsp + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rsp,%r9,2),%rsp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 @@ -319,6 +452,7 @@ L$mul4x_body: movq 40(%rsp),%rsi movq $1,%rax + movq -48(%rsi),%r15 movq -40(%rsi),%r14 movq -32(%rsi),%r13 @@ -334,47 +468,141 @@ L$mul4x_epilogue: .p2align 5 mul4x_internal: shlq $5,%r9 - movl 8(%rax),%r10d - leaq 256(%rdx,%r9,1),%r13 + movd 8(%rax),%xmm5 + leaq L$inc(%rip),%rax + leaq 128(%rdx,%r9,1),%r13 shrq $5,%r9 - movq %r10,%r11 - shrq $3,%r10 - andq $7,%r11 - notq %r10 - leaq L$magic_masks(%rip),%rax - andq $3,%r10 - leaq 96(%rdx,%r11,8),%r12 - movq 0(%rax,%r10,8),%xmm4 - movq 8(%rax,%r10,8),%xmm5 - addq $7,%r11 - movq 16(%rax,%r10,8),%xmm6 - movq 24(%rax,%r10,8),%xmm7 - andq $7,%r11 + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 88-112(%rsp,%r9,1),%r10 + leaq 128(%rdx),%r12 - movq -96(%r12),%xmm0 - leaq 256(%r12),%r14 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - movq 96(%r12),%xmm3 - pand %xmm6,%xmm2 + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 +.byte 0x67,0x67 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 .byte 0x67 - por %xmm1,%xmm0 - movq -96(%r14),%xmm1 -.byte 0x67 - pand %xmm7,%xmm3 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd 
%xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,256(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 .byte 0x67 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,320(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + pand 64(%r12),%xmm0 + + pand 80(%r12),%xmm1 + pand 96(%r12),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%r12),%xmm3 por %xmm2,%xmm0 - movq -32(%r14),%xmm2 -.byte 0x67 - pand %xmm4,%xmm1 -.byte 0x67 - por %xmm3,%xmm0 - movq 32(%r14),%xmm3 - + por %xmm3,%xmm1 + movdqa -128(%r12),%xmm4 + movdqa -112(%r12),%xmm5 + movdqa -96(%r12),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%r12),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%r12),%xmm4 + movdqa -48(%r12),%xmm5 + movdqa -32(%r12),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%r12),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa 0(%r12),%xmm4 + movdqa 16(%r12),%xmm5 + movdqa 32(%r12),%xmm2 + pand 240(%r10),%xmm4 + movdqa 48(%r12),%xmm3 + pand 256(%r10),%xmm5 + por %xmm4,%xmm0 + pand 272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + por %xmm1,%xmm0 + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + leaq 256(%r12),%r12 .byte 102,72,15,126,195 - movq 96(%r14),%xmm0 + movq %r13,16+8(%rsp) movq %rdi,56+8(%rsp) @@ -388,26 +616,10 @@ mul4x_internal: movq %rax,%r10 movq (%rcx),%rax - pand %xmm5,%xmm2 - pand %xmm6,%xmm3 - por %xmm2,%xmm1 - imulq %r10,%rbp - - - - - - - - leaq 64+8(%rsp,%r11,8),%r14 + leaq 64+8(%rsp),%r14 movq %rdx,%r11 - pand %xmm7,%xmm0 - por %xmm3,%xmm1 - leaq 512(%r12),%r12 - por %xmm1,%xmm0 - mulq %rbp addq %rax,%r10 movq 8(%rsi,%r9,1),%rax @@ -416,7 +628,7 @@ mul4x_internal: mulq %rbx addq %rax,%r11 - movq 16(%rcx),%rax + movq 8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 @@ -426,7 +638,7 @@ mul4x_internal: adcq $0,%rdx addq %r11,%rdi leaq 32(%r9),%r15 - leaq 64(%rcx),%rcx + leaq 32(%rcx),%rcx adcq $0,%rdx movq %rdi,(%r14) movq %rdx,%r13 @@ -436,7 +648,7 @@ mul4x_internal: L$1st4x: mulq %rbx addq %rax,%r10 - movq -32(%rcx),%rax + movq -16(%rcx),%rax leaq 32(%r14),%r14 adcq $0,%rdx movq %rdx,%r11 @@ -452,7 +664,7 @@ L$1st4x: mulq %rbx addq %rax,%r11 - movq -16(%rcx),%rax + movq -8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 @@ -482,7 +694,7 @@ L$1st4x: mulq %rbx addq %rax,%r11 - movq 16(%rcx),%rax + movq 8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 @@ -491,7 +703,7 @@ L$1st4x: movq 16(%rsi,%r15,1),%rax adcq $0,%rdx addq %r11,%rdi - leaq 64(%rcx),%rcx + leaq 32(%rcx),%rcx adcq $0,%rdx movq %rdi,(%r14) movq %rdx,%r13 @@ -501,7 +713,7 @@ L$1st4x: mulq %rbx addq %rax,%r10 - movq -32(%rcx),%rax + movq -16(%rcx),%rax leaq 32(%r14),%r14 adcq $0,%rdx movq %rdx,%r11 @@ -517,7 +729,7 @@ L$1st4x: mulq %rbx addq %rax,%r11 - movq -16(%rcx),%rax + movq -8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 @@ -530,8 +742,7 @@ L$1st4x: movq 
%rdi,-16(%r14) movq %rdx,%r13 -.byte 102,72,15,126,195 - leaq (%rcx,%r9,2),%rcx + leaq (%rcx,%r9,1),%rcx xorq %rdi,%rdi addq %r10,%r13 @@ -542,6 +753,63 @@ L$1st4x: .p2align 5 L$outer4x: + leaq 16+128(%r14),%rdx + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r12),%xmm0 + movdqa -112(%r12),%xmm1 + movdqa -96(%r12),%xmm2 + movdqa -80(%r12),%xmm3 + pand -128(%rdx),%xmm0 + pand -112(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -80(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r12),%xmm0 + movdqa -48(%r12),%xmm1 + movdqa -32(%r12),%xmm2 + movdqa -16(%r12),%xmm3 + pand -64(%rdx),%xmm0 + pand -48(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -16(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r12),%xmm0 + movdqa 16(%r12),%xmm1 + movdqa 32(%r12),%xmm2 + movdqa 48(%r12),%xmm3 + pand 0(%rdx),%xmm0 + pand 16(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 48(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r12),%xmm0 + movdqa 80(%r12),%xmm1 + movdqa 96(%r12),%xmm2 + movdqa 112(%r12),%xmm3 + pand 64(%rdx),%xmm0 + pand 80(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 112(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%r12),%r12 +.byte 102,72,15,126,195 + movq (%r14,%r9,1),%r10 movq %r8,%rbp mulq %rbx @@ -549,25 +817,11 @@ L$outer4x: movq (%rcx),%rax adcq $0,%rdx - movq -96(%r12),%xmm0 - movq -32(%r12),%xmm1 - pand %xmm4,%xmm0 - movq 32(%r12),%xmm2 - pand %xmm5,%xmm1 - movq 96(%r12),%xmm3 - imulq %r10,%rbp -.byte 0x67 movq %rdx,%r11 movq %rdi,(%r14) - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 - por %xmm2,%xmm0 leaq (%r14,%r9,1),%r14 - leaq 256(%r12),%r12 - por %xmm3,%xmm0 mulq %rbp addq %rax,%r10 @@ -577,7 +831,7 @@ L$outer4x: mulq %rbx addq %rax,%r11 - movq 16(%rcx),%rax + movq 8(%rcx),%rax adcq $0,%rdx addq 8(%r14),%r11 adcq $0,%rdx @@ -589,7 +843,7 @@ L$outer4x: adcq $0,%rdx addq %r11,%rdi leaq 32(%r9),%r15 - leaq 64(%rcx),%rcx + leaq 32(%rcx),%rcx adcq $0,%rdx movq %rdx,%r13 jmp L$inner4x @@ -598,7 +852,7 @@ L$outer4x: L$inner4x: mulq %rbx addq %rax,%r10 - movq -32(%rcx),%rax + movq -16(%rcx),%rax adcq $0,%rdx addq 16(%r14),%r10 leaq 32(%r14),%r14 @@ -616,7 +870,7 @@ L$inner4x: mulq %rbx addq %rax,%r11 - movq -16(%rcx),%rax + movq -8(%rcx),%rax adcq $0,%rdx addq -8(%r14),%r11 adcq $0,%rdx @@ -650,7 +904,7 @@ L$inner4x: mulq %rbx addq %rax,%r11 - movq 16(%rcx),%rax + movq 8(%rcx),%rax adcq $0,%rdx addq 8(%r14),%r11 adcq $0,%rdx @@ -661,7 +915,7 @@ L$inner4x: movq 16(%rsi,%r15,1),%rax adcq $0,%rdx addq %r11,%rdi - leaq 64(%rcx),%rcx + leaq 32(%rcx),%rcx adcq $0,%rdx movq %r13,-8(%r14) movq %rdx,%r13 @@ -671,7 +925,7 @@ L$inner4x: mulq %rbx addq %rax,%r10 - movq -32(%rcx),%rax + movq -16(%rcx),%rax adcq $0,%rdx addq 16(%r14),%r10 leaq 32(%r14),%r14 @@ -690,7 +944,7 @@ L$inner4x: mulq %rbx addq %rax,%r11 movq %rbp,%rax - movq -16(%rcx),%rbp + movq -8(%rcx),%rbp adcq $0,%rdx addq -8(%r14),%r11 adcq $0,%rdx @@ -705,9 +959,8 @@ L$inner4x: movq %r13,-24(%r14) movq %rdx,%r13 -.byte 102,72,15,126,195 movq %rdi,-16(%r14) - leaq (%rcx,%r9,2),%rcx + leaq (%rcx,%r9,1),%rcx xorq %rdi,%rdi addq %r10,%r13 @@ -718,16 +971,23 @@ L$inner4x: cmpq 16+8(%rsp),%r12 jb L$outer4x + xorq %rax,%rax subq %r13,%rbp adcq %r15,%r15 orq %r15,%rdi - xorq $1,%rdi + subq %rdi,%rax leaq (%r14,%r9,1),%rbx - leaq (%rcx,%rdi,8),%rbp + movq (%rcx),%r12 + leaq (%rcx),%rbp movq %r9,%rcx 
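The rewritten tail above drops the old "xorq $1,%rdi; leaq (%rcx,%rdi,8),%rbp", which derived a pointer from the borrow, and instead materializes the reduce-or-not decision as an all-ones/all-zero mask in %rax that L$sqr4x_sub_entry folds into the subtraction via notq/andq/adcq. In scalar C the equivalent borrow-propagating form looks like this (a sketch under stated assumptions, not the patch's actual interface):

#include <stdint.h>
#include <stddef.h>

/* r -= mask & n, branch-free; mask is all-ones when the final reduction
   must subtract the modulus, all-zeros otherwise. */
static void cond_sub(uint64_t *r, const uint64_t *n, size_t len, uint64_t mask) {
    uint64_t borrow = 0;
    for (size_t i = 0; i < len; i++) {
        uint64_t ni = n[i] & mask;   /* 0 or n[i] */
        uint64_t d  = r[i] - ni;
        uint64_t b1 = r[i] < ni;     /* borrow out of the subtract */
        uint64_t t  = d - borrow;
        uint64_t b2 = d < borrow;    /* borrow out of the carry-in */
        r[i] = t;
        borrow = b1 | b2;
    }
}
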
sarq $3+2,%rcx movq 56+8(%rsp),%rdi - jmp L$sqr4x_sub + decq %r12 + xorq %r10,%r10 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp L$sqr4x_sub_entry .globl _bn_power5 .private_extern _bn_power5 @@ -741,9 +1001,9 @@ _bn_power5: pushq %r13 pushq %r14 pushq %r15 - movl %r9d,%r10d + shll $3,%r9d - shll $3+2,%r10d + leal (%r9,%r9,2),%r10d negq %r9 movq (%r8),%r8 @@ -753,19 +1013,20 @@ _bn_power5: - leaq -64(%rsp,%r9,2),%r11 - subq %rsi,%r11 + + leaq -320(%rsp,%r9,2),%r11 + subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb L$pwr_sp_alt subq %r11,%rsp - leaq -64(%rsp,%r9,2),%rsp + leaq -320(%rsp,%r9,2),%rsp jmp L$pwr_sp_done .p2align 5 L$pwr_sp_alt: - leaq 4096-64(,%r9,2),%r10 - leaq -64(%rsp,%r9,2),%rsp + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rsp,%r9,2),%rsp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 @@ -793,10 +1054,15 @@ L$power5_body: .byte 102,72,15,110,226 call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal .byte 102,72,15,126,209 .byte 102,72,15,126,226 @@ -1341,9 +1607,9 @@ L$sqr4x_shift_n_add: movq %rbx,-16(%rdi) movq %r8,-8(%rdi) .byte 102,72,15,126,213 -sqr8x_reduction: +__bn_sqr8x_reduction: xorq %rax,%rax - leaq (%rbp,%r9,2),%rcx + leaq (%r9,%rbp,1),%rcx leaq 48+8(%rsp,%r9,2),%rdx movq %rcx,0+8(%rsp) leaq 48+8(%rsp,%r9,1),%rdi @@ -1376,14 +1642,14 @@ L$8x_reduction_loop: .p2align 5 L$8x_reduce: mulq %rbx - movq 16(%rbp),%rax + movq 8(%rbp),%rax negq %r8 movq %rdx,%r8 adcq $0,%r8 mulq %rbx addq %rax,%r9 - movq 32(%rbp),%rax + movq 16(%rbp),%rax adcq $0,%rdx addq %r9,%r8 movq %rbx,48-8+8(%rsp,%rcx,8) @@ -1392,7 +1658,7 @@ L$8x_reduce: mulq %rbx addq %rax,%r10 - movq 48(%rbp),%rax + movq 24(%rbp),%rax adcq $0,%rdx addq %r10,%r9 movq 32+8(%rsp),%rsi @@ -1401,7 +1667,7 @@ L$8x_reduce: mulq %rbx addq %rax,%r11 - movq 64(%rbp),%rax + movq 32(%rbp),%rax adcq $0,%rdx imulq %r8,%rsi addq %r11,%r10 @@ -1410,7 +1676,7 @@ L$8x_reduce: mulq %rbx addq %rax,%r12 - movq 80(%rbp),%rax + movq 40(%rbp),%rax adcq $0,%rdx addq %r12,%r11 movq %rdx,%r12 @@ -1418,7 +1684,7 @@ L$8x_reduce: mulq %rbx addq %rax,%r13 - movq 96(%rbp),%rax + movq 48(%rbp),%rax adcq $0,%rdx addq %r13,%r12 movq %rdx,%r13 @@ -1426,7 +1692,7 @@ L$8x_reduce: mulq %rbx addq %rax,%r14 - movq 112(%rbp),%rax + movq 56(%rbp),%rax adcq $0,%rdx addq %r14,%r13 movq %rdx,%r14 @@ -1444,7 +1710,7 @@ L$8x_reduce: decl %ecx jnz L$8x_reduce - leaq 128(%rbp),%rbp + leaq 64(%rbp),%rbp xorq %rax,%rax movq 8+8(%rsp),%rdx cmpq 0+8(%rsp),%rbp @@ -1470,14 +1736,14 @@ L$8x_reduce: L$8x_tail: mulq %rbx addq %rax,%r8 - movq 16(%rbp),%rax + movq 8(%rbp),%rax movq %r8,(%rdi) movq %rdx,%r8 adcq $0,%r8 mulq %rbx addq %rax,%r9 - movq 32(%rbp),%rax + movq 16(%rbp),%rax adcq $0,%rdx addq %r9,%r8 leaq 8(%rdi),%rdi @@ -1486,7 +1752,7 @@ L$8x_tail: mulq %rbx addq %rax,%r10 - movq 48(%rbp),%rax + movq 24(%rbp),%rax adcq $0,%rdx addq %r10,%r9 movq %rdx,%r10 @@ -1494,7 +1760,7 @@ L$8x_tail: mulq %rbx addq %rax,%r11 - movq 64(%rbp),%rax + movq 32(%rbp),%rax adcq $0,%rdx addq %r11,%r10 movq %rdx,%r11 @@ -1502,7 +1768,7 @@ L$8x_tail: mulq %rbx addq %rax,%r12 - movq 80(%rbp),%rax + movq 40(%rbp),%rax adcq $0,%rdx addq %r12,%r11 movq %rdx,%r12 @@ -1510,7 +1776,7 @@ L$8x_tail: mulq %rbx addq %rax,%r13 - movq 96(%rbp),%rax + movq 48(%rbp),%rax adcq $0,%rdx addq %r13,%r12 movq %rdx,%r13 @@ -1518,7 +1784,7 @@ L$8x_tail: mulq %rbx addq %rax,%r14 - 
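For orientation: the L$8x_reduce loop whose strides change above (the modulus is now read as consecutive 64-bit words at 8(%rbp), 16(%rbp), ..., rather than every other word) is a word-serial Montgomery reduction. Each iteration picks m = t[i] * n0' mod 2^64 and adds m*n so the low word cancels. A scalar sketch under stated assumptions (t holds 2*len+1 words with the top word zero on entry; n0inv = -n^{-1} mod 2^64; a compiler with __int128):

#include <stdint.h>
#include <stddef.h>

typedef unsigned __int128 u128;

/* Word-serial Montgomery reduction; the 8x-unrolled asm loop above is
   this inner loop with the carry chain kept in registers. */
static void mont_reduce(uint64_t *t, const uint64_t *n, size_t len, uint64_t n0inv) {
    for (size_t i = 0; i < len; i++) {
        uint64_t m = t[i] * n0inv;              /* makes t[i] cancel */
        uint64_t carry = 0;
        for (size_t j = 0; j < len; j++) {
            u128 acc = (u128)m * n[j] + t[i + j] + carry;
            t[i + j] = (uint64_t)acc;
            carry = (uint64_t)(acc >> 64);
        }
        for (size_t k = i + len; carry != 0; k++) {   /* ripple carry upward */
            u128 acc = (u128)t[k] + carry;
            t[k] = (uint64_t)acc;
            carry = (uint64_t)(acc >> 64);
        }
    }
    /* result in t[len..2*len]; one conditional subtract of n may remain */
}
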
movq 112(%rbp),%rax + movq 56(%rbp),%rax adcq $0,%rdx addq %r14,%r13 movq %rdx,%r14 @@ -1536,7 +1802,7 @@ L$8x_tail: decl %ecx jnz L$8x_tail - leaq 128(%rbp),%rbp + leaq 64(%rbp),%rbp movq 8+8(%rsp),%rdx cmpq 0+8(%rsp),%rbp jae L$8x_tail_done @@ -1560,6 +1826,15 @@ L$8x_tail: .p2align 5 L$8x_tail_done: addq (%rdx),%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + + xorq %rax,%rax negq %rsi @@ -1573,7 +1848,7 @@ L$8x_no_tail: adcq 48(%rdi),%r14 adcq 56(%rdi),%r15 adcq $0,%rax - movq -16(%rbp),%rcx + movq -8(%rbp),%rcx xorq %rsi,%rsi .byte 102,72,15,126,213 @@ -1591,40 +1866,58 @@ L$8x_no_tail: cmpq %rdx,%rdi jb L$8x_reduction_loop + .byte 0xf3,0xc3 - subq %r15,%rcx - leaq (%rdi,%r9,1),%rbx - adcq %rsi,%rsi - movq %r9,%rcx - orq %rsi,%rax -.byte 102,72,15,126,207 - xorq $1,%rax -.byte 102,72,15,126,206 - leaq (%rbp,%rax,8),%rbp - sarq $3+2,%rcx - jmp L$sqr4x_sub .p2align 5 +__bn_post4x_internal: + movq 0(%rbp),%r12 + leaq (%rdi,%r9,1),%rbx + movq %r9,%rcx +.byte 102,72,15,126,207 + negq %rax +.byte 102,72,15,126,206 + sarq $3+2,%rcx + decq %r12 + xorq %r10,%r10 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp L$sqr4x_sub_entry + +.p2align 4 L$sqr4x_sub: -.byte 0x66 - movq 0(%rbx),%r12 - movq 8(%rbx),%r13 - sbbq 0(%rbp),%r12 - movq 16(%rbx),%r14 - sbbq 16(%rbp),%r13 - movq 24(%rbx),%r15 - leaq 32(%rbx),%rbx - sbbq 32(%rbp),%r14 + movq 0(%rbp),%r12 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 +L$sqr4x_sub_entry: + leaq 32(%rbp),%rbp + notq %r12 + notq %r13 + notq %r14 + notq %r15 + andq %rax,%r12 + andq %rax,%r13 + andq %rax,%r14 + andq %rax,%r15 + + negq %r10 + adcq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + adcq 16(%rbx),%r14 + adcq 24(%rbx),%r15 movq %r12,0(%rdi) - sbbq 48(%rbp),%r15 - leaq 64(%rbp),%rbp + leaq 32(%rbx),%rbx movq %r13,8(%rdi) + sbbq %r10,%r10 movq %r14,16(%rdi) movq %r15,24(%rdi) leaq 32(%rdi),%rdi incq %rcx jnz L$sqr4x_sub + movq %r9,%r10 negq %r9 .byte 0xf3,0xc3 @@ -1651,10 +1944,9 @@ bn_from_mont8x: pushq %r13 pushq %r14 pushq %r15 -.byte 0x67 - movl %r9d,%r10d + shll $3,%r9d - shll $3+2,%r10d + leaq (%r9,%r9,2),%r10 negq %r9 movq (%r8),%r8 @@ -1664,19 +1956,20 @@ bn_from_mont8x: - leaq -64(%rsp,%r9,2),%r11 - subq %rsi,%r11 + + leaq -320(%rsp,%r9,2),%r11 + subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb L$from_sp_alt subq %r11,%rsp - leaq -64(%rsp,%r9,2),%rsp + leaq -320(%rsp,%r9,2),%rsp jmp L$from_sp_done .p2align 5 L$from_sp_alt: - leaq 4096-64(,%r9,2),%r10 - leaq -64(%rsp,%r9,2),%rsp + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rsp,%r9,2),%rsp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 @@ -1727,7 +2020,8 @@ L$mul_by_1: .byte 0x67 movq %rcx,%rbp .byte 102,73,15,110,218 - call sqr8x_reduction + call __bn_sqr8x_reduction + call __bn_post4x_internal pxor %xmm0,%xmm0 leaq 48(%rsp),%rax @@ -1777,46 +2071,170 @@ L$scatter_epilogue: .globl _bn_gather5 .private_extern _bn_gather5 -.p2align 4 +.p2align 5 _bn_gather5: - movl %ecx,%r11d - shrl $3,%ecx - andq $7,%r11 - notl %ecx - leaq L$magic_masks(%rip),%rax - andl $3,%ecx - leaq 128(%rdx,%r11,8),%rdx - movq 0(%rax,%rcx,8),%xmm4 - movq 8(%rax,%rcx,8),%xmm5 - movq 16(%rax,%rcx,8),%xmm6 - movq 24(%rax,%rcx,8),%xmm7 - jmp L$gather -.p2align 4 -L$gather: - movq -128(%rdx),%xmm0 - movq -64(%rdx),%xmm1 - pand %xmm4,%xmm0 - movq 0(%rdx),%xmm2 - pand %xmm5,%xmm1 - movq 64(%rdx),%xmm3 - pand %xmm6,%xmm2 - por %xmm1,%xmm0 - pand %xmm7,%xmm3 -.byte 0x67,0x67 - por %xmm2,%xmm0 - leaq 256(%rdx),%rdx - por %xmm3,%xmm0 +L$SEH_begin_bn_gather5: +.byte 
0x4c,0x8d,0x14,0x24 +.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 + leaq L$inc(%rip),%rax + andq $-16,%rsp + + movd %ecx,%xmm5 + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 128(%rdx),%r11 + leaq 128(%rsp),%rax + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,-128(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,-112(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,-96(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,-80(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,-64(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,-48(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,-32(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,-16(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,0(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,16(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,32(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,48(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,64(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,80(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,96(%rax) + movdqa %xmm4,%xmm2 + movdqa %xmm3,112(%rax) + jmp L$gather + +.p2align 5 +L$gather: + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r11),%xmm0 + movdqa -112(%r11),%xmm1 + movdqa -96(%r11),%xmm2 + pand -128(%rax),%xmm0 + movdqa -80(%r11),%xmm3 + pand -112(%rax),%xmm1 + por %xmm0,%xmm4 + pand -96(%rax),%xmm2 + por %xmm1,%xmm5 + pand -80(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r11),%xmm0 + movdqa -48(%r11),%xmm1 + movdqa -32(%r11),%xmm2 + pand -64(%rax),%xmm0 + movdqa -16(%r11),%xmm3 + pand -48(%rax),%xmm1 + por %xmm0,%xmm4 + pand -32(%rax),%xmm2 + por %xmm1,%xmm5 + pand -16(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r11),%xmm0 + movdqa 16(%r11),%xmm1 + movdqa 32(%r11),%xmm2 + pand 0(%rax),%xmm0 + movdqa 48(%r11),%xmm3 + pand 16(%rax),%xmm1 + por %xmm0,%xmm4 + pand 32(%rax),%xmm2 + por %xmm1,%xmm5 + pand 48(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r11),%xmm0 + movdqa 80(%r11),%xmm1 + movdqa 96(%r11),%xmm2 + pand 64(%rax),%xmm0 + movdqa 112(%r11),%xmm3 + pand 80(%rax),%xmm1 + por %xmm0,%xmm4 + pand 96(%rax),%xmm2 + por %xmm1,%xmm5 + pand 112(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + leaq 256(%r11),%r11 + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 movq %xmm0,(%rdi) leaq 8(%rdi),%rdi subl $1,%esi jnz L$gather + + leaq (%r10),%rsp .byte 0xf3,0xc3 L$SEH_end_bn_gather5: .p2align 6 -L$magic_masks: -.long 0,0, 0,0, 0,0, -1,-1 -.long 0,0, 0,0, 0,0, 0,0 +L$inc: +.long 0,0, 1,1 +.long 2,2, 2,2 .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 #endif diff --git 
a/packager/third_party/boringssl/mac-x86_64/crypto/chacha/chacha-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/chacha/chacha-x86_64.S new file mode 100644 index 0000000000..c3554c8d13 --- /dev/null +++ b/packager/third_party/boringssl/mac-x86_64/crypto/chacha/chacha-x86_64.S @@ -0,0 +1,1584 @@ +#if defined(__x86_64__) +.text + + + +.p2align 6 +L$zero: +.long 0,0,0,0 +L$one: +.long 1,0,0,0 +L$inc: +.long 0,1,2,3 +L$four: +.long 4,4,4,4 +L$incy: +.long 0,2,4,6,1,3,5,7 +L$eight: +.long 8,8,8,8,8,8,8,8 +L$rot16: +.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd +L$rot24: +.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe +L$sigma: +.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0 +.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.globl _ChaCha20_ctr32 +.private_extern _ChaCha20_ctr32 + +.p2align 6 +_ChaCha20_ctr32: + cmpq $0,%rdx + je L$no_data + movq _OPENSSL_ia32cap_P+4(%rip),%r10 + testl $512,%r10d + jnz L$ChaCha20_ssse3 + + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $64+24,%rsp + + + movdqu (%rcx),%xmm1 + movdqu 16(%rcx),%xmm2 + movdqu (%r8),%xmm3 + movdqa L$one(%rip),%xmm4 + + + movdqa %xmm1,16(%rsp) + movdqa %xmm2,32(%rsp) + movdqa %xmm3,48(%rsp) + movq %rdx,%rbp + jmp L$oop_outer + +.p2align 5 +L$oop_outer: + movl $0x61707865,%eax + movl $0x3320646e,%ebx + movl $0x79622d32,%ecx + movl $0x6b206574,%edx + movl 16(%rsp),%r8d + movl 20(%rsp),%r9d + movl 24(%rsp),%r10d + movl 28(%rsp),%r11d + movd %xmm3,%r12d + movl 52(%rsp),%r13d + movl 56(%rsp),%r14d + movl 60(%rsp),%r15d + + movq %rbp,64+0(%rsp) + movl $10,%ebp + movq %rsi,64+8(%rsp) +.byte 102,72,15,126,214 + movq %rdi,64+16(%rsp) + movq %rsi,%rdi + shrq $32,%rdi + jmp L$oop + +.p2align 5 +L$oop: + addl %r8d,%eax + xorl %eax,%r12d + roll $16,%r12d + addl %r9d,%ebx + xorl %ebx,%r13d + roll $16,%r13d + addl %r12d,%esi + xorl %esi,%r8d + roll $12,%r8d + addl %r13d,%edi + xorl %edi,%r9d + roll $12,%r9d + addl %r8d,%eax + xorl %eax,%r12d + roll $8,%r12d + addl %r9d,%ebx + xorl %ebx,%r13d + roll $8,%r13d + addl %r12d,%esi + xorl %esi,%r8d + roll $7,%r8d + addl %r13d,%edi + xorl %edi,%r9d + roll $7,%r9d + movl %esi,32(%rsp) + movl %edi,36(%rsp) + movl 40(%rsp),%esi + movl 44(%rsp),%edi + addl %r10d,%ecx + xorl %ecx,%r14d + roll $16,%r14d + addl %r11d,%edx + xorl %edx,%r15d + roll $16,%r15d + addl %r14d,%esi + xorl %esi,%r10d + roll $12,%r10d + addl %r15d,%edi + xorl %edi,%r11d + roll $12,%r11d + addl %r10d,%ecx + xorl %ecx,%r14d + roll $8,%r14d + addl %r11d,%edx + xorl %edx,%r15d + roll $8,%r15d + addl %r14d,%esi + xorl %esi,%r10d + roll $7,%r10d + addl %r15d,%edi + xorl %edi,%r11d + roll $7,%r11d + addl %r9d,%eax + xorl %eax,%r15d + roll $16,%r15d + addl %r10d,%ebx + xorl %ebx,%r12d + roll $16,%r12d + addl %r15d,%esi + xorl %esi,%r9d + roll $12,%r9d + addl %r12d,%edi + xorl %edi,%r10d + roll $12,%r10d + addl %r9d,%eax + xorl %eax,%r15d + roll $8,%r15d + addl %r10d,%ebx + xorl %ebx,%r12d + roll $8,%r12d + addl %r15d,%esi + xorl %esi,%r9d + roll $7,%r9d + addl %r12d,%edi + xorl %edi,%r10d + roll $7,%r10d + movl %esi,40(%rsp) + movl %edi,44(%rsp) + movl 32(%rsp),%esi + movl 36(%rsp),%edi + addl %r11d,%ecx + xorl %ecx,%r13d + roll $16,%r13d + addl %r8d,%edx + xorl %edx,%r14d + roll $16,%r14d + addl %r13d,%esi + xorl %esi,%r11d + roll $12,%r11d + addl %r14d,%edi + xorl %edi,%r8d + roll 
$12,%r8d + addl %r11d,%ecx + xorl %ecx,%r13d + roll $8,%r13d + addl %r8d,%edx + xorl %edx,%r14d + roll $8,%r14d + addl %r13d,%esi + xorl %esi,%r11d + roll $7,%r11d + addl %r14d,%edi + xorl %edi,%r8d + roll $7,%r8d + decl %ebp + jnz L$oop + movl %edi,36(%rsp) + movl %esi,32(%rsp) + movq 64(%rsp),%rbp + movdqa %xmm2,%xmm1 + movq 64+8(%rsp),%rsi + paddd %xmm4,%xmm3 + movq 64+16(%rsp),%rdi + + addl $0x61707865,%eax + addl $0x3320646e,%ebx + addl $0x79622d32,%ecx + addl $0x6b206574,%edx + addl 16(%rsp),%r8d + addl 20(%rsp),%r9d + addl 24(%rsp),%r10d + addl 28(%rsp),%r11d + addl 48(%rsp),%r12d + addl 52(%rsp),%r13d + addl 56(%rsp),%r14d + addl 60(%rsp),%r15d + paddd 32(%rsp),%xmm1 + + cmpq $64,%rbp + jb L$tail + + xorl 0(%rsi),%eax + xorl 4(%rsi),%ebx + xorl 8(%rsi),%ecx + xorl 12(%rsi),%edx + xorl 16(%rsi),%r8d + xorl 20(%rsi),%r9d + xorl 24(%rsi),%r10d + xorl 28(%rsi),%r11d + movdqu 32(%rsi),%xmm0 + xorl 48(%rsi),%r12d + xorl 52(%rsi),%r13d + xorl 56(%rsi),%r14d + xorl 60(%rsi),%r15d + leaq 64(%rsi),%rsi + pxor %xmm1,%xmm0 + + movdqa %xmm2,32(%rsp) + movd %xmm3,48(%rsp) + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + movdqu %xmm0,32(%rdi) + movl %r12d,48(%rdi) + movl %r13d,52(%rdi) + movl %r14d,56(%rdi) + movl %r15d,60(%rdi) + leaq 64(%rdi),%rdi + + subq $64,%rbp + jnz L$oop_outer + + jmp L$done + +.p2align 4 +L$tail: + movl %eax,0(%rsp) + movl %ebx,4(%rsp) + xorq %rbx,%rbx + movl %ecx,8(%rsp) + movl %edx,12(%rsp) + movl %r8d,16(%rsp) + movl %r9d,20(%rsp) + movl %r10d,24(%rsp) + movl %r11d,28(%rsp) + movdqa %xmm1,32(%rsp) + movl %r12d,48(%rsp) + movl %r13d,52(%rsp) + movl %r14d,56(%rsp) + movl %r15d,60(%rsp) + +L$oop_tail: + movzbl (%rsi,%rbx,1),%eax + movzbl (%rsp,%rbx,1),%edx + leaq 1(%rbx),%rbx + xorl %edx,%eax + movb %al,-1(%rdi,%rbx,1) + decq %rbp + jnz L$oop_tail + +L$done: + addq $64+24,%rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx +L$no_data: + .byte 0xf3,0xc3 + + +.p2align 5 +ChaCha20_ssse3: +L$ChaCha20_ssse3: + cmpq $128,%rdx + ja L$ChaCha20_4x + +L$do_sse3_after_all: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + subq $64+24,%rsp + movdqa L$sigma(%rip),%xmm0 + movdqu (%rcx),%xmm1 + movdqu 16(%rcx),%xmm2 + movdqu (%r8),%xmm3 + movdqa L$rot16(%rip),%xmm6 + movdqa L$rot24(%rip),%xmm7 + + movdqa %xmm0,0(%rsp) + movdqa %xmm1,16(%rsp) + movdqa %xmm2,32(%rsp) + movdqa %xmm3,48(%rsp) + movl $10,%ebp + jmp L$oop_ssse3 + +.p2align 5 +L$oop_outer_ssse3: + movdqa L$one(%rip),%xmm3 + movdqa 0(%rsp),%xmm0 + movdqa 16(%rsp),%xmm1 + movdqa 32(%rsp),%xmm2 + paddd 48(%rsp),%xmm3 + movl $10,%ebp + movdqa %xmm3,48(%rsp) + jmp L$oop_ssse3 + +.p2align 5 +L$oop_ssse3: + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,222 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $20,%xmm1 + pslld $12,%xmm4 + por %xmm4,%xmm1 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,223 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $25,%xmm1 + pslld $7,%xmm4 + por %xmm4,%xmm1 + pshufd $78,%xmm2,%xmm2 + pshufd $57,%xmm1,%xmm1 + pshufd $147,%xmm3,%xmm3 + nop + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,222 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $20,%xmm1 + pslld $12,%xmm4 + por %xmm4,%xmm1 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 +.byte 102,15,56,0,223 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $25,%xmm1 + pslld $7,%xmm4 
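(For orientation: the scalar rounds above, with their roll $16 / roll $12 / roll $8 / roll $7 chains, and the SSSE3 rounds, with pshufb shuffles against L$rot16/L$rot24 plus pslld/psrld pairs, both implement the standard ChaCha20 quarter-round. Below is a minimal C sketch of that quarter-round per RFC 7539, written for reference and not taken from this patch; the helper names are illustrative.

#include <stdint.h>

/* Rotate a 32-bit word left by n bits, 0 < n < 32. */
static uint32_t rotl32(uint32_t v, int n) {
    return (v << n) | (v >> (32 - n));
}

/* One ChaCha20 quarter-round.  The 16/12/8/7 rotation amounts are the
 * same constants that appear as roll $16/$12/$8/$7 in the scalar code
 * above, and in the SSSE3 code as the L$rot16/L$rot24 byte shuffles
 * plus the pslld $12/psrld $20 and pslld $7/psrld $25 shift pairs. */
static void quarter_round(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) {
    *a += *b; *d ^= *a; *d = rotl32(*d, 16);
    *c += *d; *b ^= *c; *b = rotl32(*b, 12);
    *a += *b; *d ^= *a; *d = rotl32(*d, 8);
    *c += *d; *b ^= *c; *b = rotl32(*b, 7);
}

The vector path runs this on four 32-bit lanes at once; the byte-aligned 16- and 8-bit rotations come for free via pshufb, and only the 12- and 7-bit rotations need the two-shift-plus-or sequence.)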
+ por %xmm4,%xmm1 + pshufd $78,%xmm2,%xmm2 + pshufd $147,%xmm1,%xmm1 + pshufd $57,%xmm3,%xmm3 + decl %ebp + jnz L$oop_ssse3 + paddd 0(%rsp),%xmm0 + paddd 16(%rsp),%xmm1 + paddd 32(%rsp),%xmm2 + paddd 48(%rsp),%xmm3 + + cmpq $64,%rdx + jb L$tail_ssse3 + + movdqu 0(%rsi),%xmm4 + movdqu 16(%rsi),%xmm5 + pxor %xmm4,%xmm0 + movdqu 32(%rsi),%xmm4 + pxor %xmm5,%xmm1 + movdqu 48(%rsi),%xmm5 + leaq 64(%rsi),%rsi + pxor %xmm4,%xmm2 + pxor %xmm5,%xmm3 + + movdqu %xmm0,0(%rdi) + movdqu %xmm1,16(%rdi) + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + leaq 64(%rdi),%rdi + + subq $64,%rdx + jnz L$oop_outer_ssse3 + + jmp L$done_ssse3 + +.p2align 4 +L$tail_ssse3: + movdqa %xmm0,0(%rsp) + movdqa %xmm1,16(%rsp) + movdqa %xmm2,32(%rsp) + movdqa %xmm3,48(%rsp) + xorq %rbx,%rbx + +L$oop_tail_ssse3: + movzbl (%rsi,%rbx,1),%eax + movzbl (%rsp,%rbx,1),%ecx + leaq 1(%rbx),%rbx + xorl %ecx,%eax + movb %al,-1(%rdi,%rbx,1) + decq %rdx + jnz L$oop_tail_ssse3 + +L$done_ssse3: + addq $64+24,%rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + .byte 0xf3,0xc3 + + +.p2align 5 +ChaCha20_4x: +L$ChaCha20_4x: + movq %r10,%r11 + shrq $32,%r10 + testq $32,%r10 + jnz L$ChaCha20_8x + cmpq $192,%rdx + ja L$proceed4x + + andq $71303168,%r11 + cmpq $4194304,%r11 + je L$do_sse3_after_all + +L$proceed4x: + leaq -120(%rsp),%r11 + subq $0x148+0,%rsp + movdqa L$sigma(%rip),%xmm11 + movdqu (%rcx),%xmm15 + movdqu 16(%rcx),%xmm7 + movdqu (%r8),%xmm3 + leaq 256(%rsp),%rcx + leaq L$rot16(%rip),%r10 + leaq L$rot24(%rip),%r11 + + pshufd $0x00,%xmm11,%xmm8 + pshufd $0x55,%xmm11,%xmm9 + movdqa %xmm8,64(%rsp) + pshufd $0xaa,%xmm11,%xmm10 + movdqa %xmm9,80(%rsp) + pshufd $0xff,%xmm11,%xmm11 + movdqa %xmm10,96(%rsp) + movdqa %xmm11,112(%rsp) + + pshufd $0x00,%xmm15,%xmm12 + pshufd $0x55,%xmm15,%xmm13 + movdqa %xmm12,128-256(%rcx) + pshufd $0xaa,%xmm15,%xmm14 + movdqa %xmm13,144-256(%rcx) + pshufd $0xff,%xmm15,%xmm15 + movdqa %xmm14,160-256(%rcx) + movdqa %xmm15,176-256(%rcx) + + pshufd $0x00,%xmm7,%xmm4 + pshufd $0x55,%xmm7,%xmm5 + movdqa %xmm4,192-256(%rcx) + pshufd $0xaa,%xmm7,%xmm6 + movdqa %xmm5,208-256(%rcx) + pshufd $0xff,%xmm7,%xmm7 + movdqa %xmm6,224-256(%rcx) + movdqa %xmm7,240-256(%rcx) + + pshufd $0x00,%xmm3,%xmm0 + pshufd $0x55,%xmm3,%xmm1 + paddd L$inc(%rip),%xmm0 + pshufd $0xaa,%xmm3,%xmm2 + movdqa %xmm1,272-256(%rcx) + pshufd $0xff,%xmm3,%xmm3 + movdqa %xmm2,288-256(%rcx) + movdqa %xmm3,304-256(%rcx) + + jmp L$oop_enter4x + +.p2align 5 +L$oop_outer4x: + movdqa 64(%rsp),%xmm8 + movdqa 80(%rsp),%xmm9 + movdqa 96(%rsp),%xmm10 + movdqa 112(%rsp),%xmm11 + movdqa 128-256(%rcx),%xmm12 + movdqa 144-256(%rcx),%xmm13 + movdqa 160-256(%rcx),%xmm14 + movdqa 176-256(%rcx),%xmm15 + movdqa 192-256(%rcx),%xmm4 + movdqa 208-256(%rcx),%xmm5 + movdqa 224-256(%rcx),%xmm6 + movdqa 240-256(%rcx),%xmm7 + movdqa 256-256(%rcx),%xmm0 + movdqa 272-256(%rcx),%xmm1 + movdqa 288-256(%rcx),%xmm2 + movdqa 304-256(%rcx),%xmm3 + paddd L$four(%rip),%xmm0 + +L$oop_enter4x: + movdqa %xmm6,32(%rsp) + movdqa %xmm7,48(%rsp) + movdqa (%r10),%xmm7 + movl $10,%eax + movdqa %xmm0,256-256(%rcx) + jmp L$oop4x + +.p2align 5 +L$oop4x: + paddd %xmm12,%xmm8 + paddd %xmm13,%xmm9 + pxor %xmm8,%xmm0 + pxor %xmm9,%xmm1 +.byte 102,15,56,0,199 +.byte 102,15,56,0,207 + paddd %xmm0,%xmm4 + paddd %xmm1,%xmm5 + pxor %xmm4,%xmm12 + pxor %xmm5,%xmm13 + movdqa %xmm12,%xmm6 + pslld $12,%xmm12 + psrld $20,%xmm6 + movdqa %xmm13,%xmm7 + pslld $12,%xmm13 + por %xmm6,%xmm12 + psrld $20,%xmm7 + movdqa (%r11),%xmm6 + por %xmm7,%xmm13 + paddd %xmm12,%xmm8 + paddd %xmm13,%xmm9 + 
pxor %xmm8,%xmm0 + pxor %xmm9,%xmm1 +.byte 102,15,56,0,198 +.byte 102,15,56,0,206 + paddd %xmm0,%xmm4 + paddd %xmm1,%xmm5 + pxor %xmm4,%xmm12 + pxor %xmm5,%xmm13 + movdqa %xmm12,%xmm7 + pslld $7,%xmm12 + psrld $25,%xmm7 + movdqa %xmm13,%xmm6 + pslld $7,%xmm13 + por %xmm7,%xmm12 + psrld $25,%xmm6 + movdqa (%r10),%xmm7 + por %xmm6,%xmm13 + movdqa %xmm4,0(%rsp) + movdqa %xmm5,16(%rsp) + movdqa 32(%rsp),%xmm4 + movdqa 48(%rsp),%xmm5 + paddd %xmm14,%xmm10 + paddd %xmm15,%xmm11 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm3 +.byte 102,15,56,0,215 +.byte 102,15,56,0,223 + paddd %xmm2,%xmm4 + paddd %xmm3,%xmm5 + pxor %xmm4,%xmm14 + pxor %xmm5,%xmm15 + movdqa %xmm14,%xmm6 + pslld $12,%xmm14 + psrld $20,%xmm6 + movdqa %xmm15,%xmm7 + pslld $12,%xmm15 + por %xmm6,%xmm14 + psrld $20,%xmm7 + movdqa (%r11),%xmm6 + por %xmm7,%xmm15 + paddd %xmm14,%xmm10 + paddd %xmm15,%xmm11 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm3 +.byte 102,15,56,0,214 +.byte 102,15,56,0,222 + paddd %xmm2,%xmm4 + paddd %xmm3,%xmm5 + pxor %xmm4,%xmm14 + pxor %xmm5,%xmm15 + movdqa %xmm14,%xmm7 + pslld $7,%xmm14 + psrld $25,%xmm7 + movdqa %xmm15,%xmm6 + pslld $7,%xmm15 + por %xmm7,%xmm14 + psrld $25,%xmm6 + movdqa (%r10),%xmm7 + por %xmm6,%xmm15 + paddd %xmm13,%xmm8 + paddd %xmm14,%xmm9 + pxor %xmm8,%xmm3 + pxor %xmm9,%xmm0 +.byte 102,15,56,0,223 +.byte 102,15,56,0,199 + paddd %xmm3,%xmm4 + paddd %xmm0,%xmm5 + pxor %xmm4,%xmm13 + pxor %xmm5,%xmm14 + movdqa %xmm13,%xmm6 + pslld $12,%xmm13 + psrld $20,%xmm6 + movdqa %xmm14,%xmm7 + pslld $12,%xmm14 + por %xmm6,%xmm13 + psrld $20,%xmm7 + movdqa (%r11),%xmm6 + por %xmm7,%xmm14 + paddd %xmm13,%xmm8 + paddd %xmm14,%xmm9 + pxor %xmm8,%xmm3 + pxor %xmm9,%xmm0 +.byte 102,15,56,0,222 +.byte 102,15,56,0,198 + paddd %xmm3,%xmm4 + paddd %xmm0,%xmm5 + pxor %xmm4,%xmm13 + pxor %xmm5,%xmm14 + movdqa %xmm13,%xmm7 + pslld $7,%xmm13 + psrld $25,%xmm7 + movdqa %xmm14,%xmm6 + pslld $7,%xmm14 + por %xmm7,%xmm13 + psrld $25,%xmm6 + movdqa (%r10),%xmm7 + por %xmm6,%xmm14 + movdqa %xmm4,32(%rsp) + movdqa %xmm5,48(%rsp) + movdqa 0(%rsp),%xmm4 + movdqa 16(%rsp),%xmm5 + paddd %xmm15,%xmm10 + paddd %xmm12,%xmm11 + pxor %xmm10,%xmm1 + pxor %xmm11,%xmm2 +.byte 102,15,56,0,207 +.byte 102,15,56,0,215 + paddd %xmm1,%xmm4 + paddd %xmm2,%xmm5 + pxor %xmm4,%xmm15 + pxor %xmm5,%xmm12 + movdqa %xmm15,%xmm6 + pslld $12,%xmm15 + psrld $20,%xmm6 + movdqa %xmm12,%xmm7 + pslld $12,%xmm12 + por %xmm6,%xmm15 + psrld $20,%xmm7 + movdqa (%r11),%xmm6 + por %xmm7,%xmm12 + paddd %xmm15,%xmm10 + paddd %xmm12,%xmm11 + pxor %xmm10,%xmm1 + pxor %xmm11,%xmm2 +.byte 102,15,56,0,206 +.byte 102,15,56,0,214 + paddd %xmm1,%xmm4 + paddd %xmm2,%xmm5 + pxor %xmm4,%xmm15 + pxor %xmm5,%xmm12 + movdqa %xmm15,%xmm7 + pslld $7,%xmm15 + psrld $25,%xmm7 + movdqa %xmm12,%xmm6 + pslld $7,%xmm12 + por %xmm7,%xmm15 + psrld $25,%xmm6 + movdqa (%r10),%xmm7 + por %xmm6,%xmm12 + decl %eax + jnz L$oop4x + + paddd 64(%rsp),%xmm8 + paddd 80(%rsp),%xmm9 + paddd 96(%rsp),%xmm10 + paddd 112(%rsp),%xmm11 + + movdqa %xmm8,%xmm6 + punpckldq %xmm9,%xmm8 + movdqa %xmm10,%xmm7 + punpckldq %xmm11,%xmm10 + punpckhdq %xmm9,%xmm6 + punpckhdq %xmm11,%xmm7 + movdqa %xmm8,%xmm9 + punpcklqdq %xmm10,%xmm8 + movdqa %xmm6,%xmm11 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm10,%xmm9 + punpckhqdq %xmm7,%xmm11 + paddd 128-256(%rcx),%xmm12 + paddd 144-256(%rcx),%xmm13 + paddd 160-256(%rcx),%xmm14 + paddd 176-256(%rcx),%xmm15 + + movdqa %xmm8,0(%rsp) + movdqa %xmm9,16(%rsp) + movdqa 32(%rsp),%xmm8 + movdqa 48(%rsp),%xmm9 + + movdqa %xmm12,%xmm10 + punpckldq %xmm13,%xmm12 + movdqa %xmm14,%xmm7 + punpckldq 
%xmm15,%xmm14 + punpckhdq %xmm13,%xmm10 + punpckhdq %xmm15,%xmm7 + movdqa %xmm12,%xmm13 + punpcklqdq %xmm14,%xmm12 + movdqa %xmm10,%xmm15 + punpcklqdq %xmm7,%xmm10 + punpckhqdq %xmm14,%xmm13 + punpckhqdq %xmm7,%xmm15 + paddd 192-256(%rcx),%xmm4 + paddd 208-256(%rcx),%xmm5 + paddd 224-256(%rcx),%xmm8 + paddd 240-256(%rcx),%xmm9 + + movdqa %xmm6,32(%rsp) + movdqa %xmm11,48(%rsp) + + movdqa %xmm4,%xmm14 + punpckldq %xmm5,%xmm4 + movdqa %xmm8,%xmm7 + punpckldq %xmm9,%xmm8 + punpckhdq %xmm5,%xmm14 + punpckhdq %xmm9,%xmm7 + movdqa %xmm4,%xmm5 + punpcklqdq %xmm8,%xmm4 + movdqa %xmm14,%xmm9 + punpcklqdq %xmm7,%xmm14 + punpckhqdq %xmm8,%xmm5 + punpckhqdq %xmm7,%xmm9 + paddd 256-256(%rcx),%xmm0 + paddd 272-256(%rcx),%xmm1 + paddd 288-256(%rcx),%xmm2 + paddd 304-256(%rcx),%xmm3 + + movdqa %xmm0,%xmm8 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm8 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm8,%xmm3 + punpcklqdq %xmm7,%xmm8 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + cmpq $256,%rdx + jb L$tail4x + + movdqu 0(%rsi),%xmm6 + movdqu 16(%rsi),%xmm11 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm7 + pxor 0(%rsp),%xmm6 + pxor %xmm12,%xmm11 + pxor %xmm4,%xmm2 + pxor %xmm0,%xmm7 + + movdqu %xmm6,0(%rdi) + movdqu 64(%rsi),%xmm6 + movdqu %xmm11,16(%rdi) + movdqu 80(%rsi),%xmm11 + movdqu %xmm2,32(%rdi) + movdqu 96(%rsi),%xmm2 + movdqu %xmm7,48(%rdi) + movdqu 112(%rsi),%xmm7 + leaq 128(%rsi),%rsi + pxor 16(%rsp),%xmm6 + pxor %xmm13,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm1,%xmm7 + + movdqu %xmm6,64(%rdi) + movdqu 0(%rsi),%xmm6 + movdqu %xmm11,80(%rdi) + movdqu 16(%rsi),%xmm11 + movdqu %xmm2,96(%rdi) + movdqu 32(%rsi),%xmm2 + movdqu %xmm7,112(%rdi) + leaq 128(%rdi),%rdi + movdqu 48(%rsi),%xmm7 + pxor 32(%rsp),%xmm6 + pxor %xmm10,%xmm11 + pxor %xmm14,%xmm2 + pxor %xmm8,%xmm7 + + movdqu %xmm6,0(%rdi) + movdqu 64(%rsi),%xmm6 + movdqu %xmm11,16(%rdi) + movdqu 80(%rsi),%xmm11 + movdqu %xmm2,32(%rdi) + movdqu 96(%rsi),%xmm2 + movdqu %xmm7,48(%rdi) + movdqu 112(%rsi),%xmm7 + leaq 128(%rsi),%rsi + pxor 48(%rsp),%xmm6 + pxor %xmm15,%xmm11 + pxor %xmm9,%xmm2 + pxor %xmm3,%xmm7 + movdqu %xmm6,64(%rdi) + movdqu %xmm11,80(%rdi) + movdqu %xmm2,96(%rdi) + movdqu %xmm7,112(%rdi) + leaq 128(%rdi),%rdi + + subq $256,%rdx + jnz L$oop_outer4x + + jmp L$done4x + +L$tail4x: + cmpq $192,%rdx + jae L$192_or_more4x + cmpq $128,%rdx + jae L$128_or_more4x + cmpq $64,%rdx + jae L$64_or_more4x + + + xorq %r10,%r10 + + movdqa %xmm12,16(%rsp) + movdqa %xmm4,32(%rsp) + movdqa %xmm0,48(%rsp) + jmp L$oop_tail4x + +.p2align 5 +L$64_or_more4x: + movdqu 0(%rsi),%xmm6 + movdqu 16(%rsi),%xmm11 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm7 + pxor 0(%rsp),%xmm6 + pxor %xmm12,%xmm11 + pxor %xmm4,%xmm2 + pxor %xmm0,%xmm7 + movdqu %xmm6,0(%rdi) + movdqu %xmm11,16(%rdi) + movdqu %xmm2,32(%rdi) + movdqu %xmm7,48(%rdi) + je L$done4x + + movdqa 16(%rsp),%xmm6 + leaq 64(%rsi),%rsi + xorq %r10,%r10 + movdqa %xmm6,0(%rsp) + movdqa %xmm13,16(%rsp) + leaq 64(%rdi),%rdi + movdqa %xmm5,32(%rsp) + subq $64,%rdx + movdqa %xmm1,48(%rsp) + jmp L$oop_tail4x + +.p2align 5 +L$128_or_more4x: + movdqu 0(%rsi),%xmm6 + movdqu 16(%rsi),%xmm11 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm7 + pxor 0(%rsp),%xmm6 + pxor %xmm12,%xmm11 + pxor %xmm4,%xmm2 + pxor %xmm0,%xmm7 + + movdqu %xmm6,0(%rdi) + movdqu 64(%rsi),%xmm6 + movdqu %xmm11,16(%rdi) + movdqu 80(%rsi),%xmm11 + movdqu %xmm2,32(%rdi) + movdqu 96(%rsi),%xmm2 + movdqu %xmm7,48(%rdi) + movdqu 112(%rsi),%xmm7 + pxor 16(%rsp),%xmm6 + pxor 
%xmm13,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm1,%xmm7 + movdqu %xmm6,64(%rdi) + movdqu %xmm11,80(%rdi) + movdqu %xmm2,96(%rdi) + movdqu %xmm7,112(%rdi) + je L$done4x + + movdqa 32(%rsp),%xmm6 + leaq 128(%rsi),%rsi + xorq %r10,%r10 + movdqa %xmm6,0(%rsp) + movdqa %xmm10,16(%rsp) + leaq 128(%rdi),%rdi + movdqa %xmm14,32(%rsp) + subq $128,%rdx + movdqa %xmm8,48(%rsp) + jmp L$oop_tail4x + +.p2align 5 +L$192_or_more4x: + movdqu 0(%rsi),%xmm6 + movdqu 16(%rsi),%xmm11 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm7 + pxor 0(%rsp),%xmm6 + pxor %xmm12,%xmm11 + pxor %xmm4,%xmm2 + pxor %xmm0,%xmm7 + + movdqu %xmm6,0(%rdi) + movdqu 64(%rsi),%xmm6 + movdqu %xmm11,16(%rdi) + movdqu 80(%rsi),%xmm11 + movdqu %xmm2,32(%rdi) + movdqu 96(%rsi),%xmm2 + movdqu %xmm7,48(%rdi) + movdqu 112(%rsi),%xmm7 + leaq 128(%rsi),%rsi + pxor 16(%rsp),%xmm6 + pxor %xmm13,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm1,%xmm7 + + movdqu %xmm6,64(%rdi) + movdqu 0(%rsi),%xmm6 + movdqu %xmm11,80(%rdi) + movdqu 16(%rsi),%xmm11 + movdqu %xmm2,96(%rdi) + movdqu 32(%rsi),%xmm2 + movdqu %xmm7,112(%rdi) + leaq 128(%rdi),%rdi + movdqu 48(%rsi),%xmm7 + pxor 32(%rsp),%xmm6 + pxor %xmm10,%xmm11 + pxor %xmm14,%xmm2 + pxor %xmm8,%xmm7 + movdqu %xmm6,0(%rdi) + movdqu %xmm11,16(%rdi) + movdqu %xmm2,32(%rdi) + movdqu %xmm7,48(%rdi) + je L$done4x + + movdqa 48(%rsp),%xmm6 + leaq 64(%rsi),%rsi + xorq %r10,%r10 + movdqa %xmm6,0(%rsp) + movdqa %xmm15,16(%rsp) + leaq 64(%rdi),%rdi + movdqa %xmm9,32(%rsp) + subq $192,%rdx + movdqa %xmm3,48(%rsp) + +L$oop_tail4x: + movzbl (%rsi,%r10,1),%eax + movzbl (%rsp,%r10,1),%ecx + leaq 1(%r10),%r10 + xorl %ecx,%eax + movb %al,-1(%rdi,%r10,1) + decq %rdx + jnz L$oop_tail4x + +L$done4x: + addq $0x148+0,%rsp + .byte 0xf3,0xc3 + + +.p2align 5 +ChaCha20_8x: +L$ChaCha20_8x: + movq %rsp,%r10 + subq $0x280+8,%rsp + andq $-32,%rsp + vzeroupper + movq %r10,640(%rsp) + + + + + + + + + + + vbroadcasti128 L$sigma(%rip),%ymm11 + vbroadcasti128 (%rcx),%ymm3 + vbroadcasti128 16(%rcx),%ymm15 + vbroadcasti128 (%r8),%ymm7 + leaq 256(%rsp),%rcx + leaq 512(%rsp),%rax + leaq L$rot16(%rip),%r10 + leaq L$rot24(%rip),%r11 + + vpshufd $0x00,%ymm11,%ymm8 + vpshufd $0x55,%ymm11,%ymm9 + vmovdqa %ymm8,128-256(%rcx) + vpshufd $0xaa,%ymm11,%ymm10 + vmovdqa %ymm9,160-256(%rcx) + vpshufd $0xff,%ymm11,%ymm11 + vmovdqa %ymm10,192-256(%rcx) + vmovdqa %ymm11,224-256(%rcx) + + vpshufd $0x00,%ymm3,%ymm0 + vpshufd $0x55,%ymm3,%ymm1 + vmovdqa %ymm0,256-256(%rcx) + vpshufd $0xaa,%ymm3,%ymm2 + vmovdqa %ymm1,288-256(%rcx) + vpshufd $0xff,%ymm3,%ymm3 + vmovdqa %ymm2,320-256(%rcx) + vmovdqa %ymm3,352-256(%rcx) + + vpshufd $0x00,%ymm15,%ymm12 + vpshufd $0x55,%ymm15,%ymm13 + vmovdqa %ymm12,384-512(%rax) + vpshufd $0xaa,%ymm15,%ymm14 + vmovdqa %ymm13,416-512(%rax) + vpshufd $0xff,%ymm15,%ymm15 + vmovdqa %ymm14,448-512(%rax) + vmovdqa %ymm15,480-512(%rax) + + vpshufd $0x00,%ymm7,%ymm4 + vpshufd $0x55,%ymm7,%ymm5 + vpaddd L$incy(%rip),%ymm4,%ymm4 + vpshufd $0xaa,%ymm7,%ymm6 + vmovdqa %ymm5,544-512(%rax) + vpshufd $0xff,%ymm7,%ymm7 + vmovdqa %ymm6,576-512(%rax) + vmovdqa %ymm7,608-512(%rax) + + jmp L$oop_enter8x + +.p2align 5 +L$oop_outer8x: + vmovdqa 128-256(%rcx),%ymm8 + vmovdqa 160-256(%rcx),%ymm9 + vmovdqa 192-256(%rcx),%ymm10 + vmovdqa 224-256(%rcx),%ymm11 + vmovdqa 256-256(%rcx),%ymm0 + vmovdqa 288-256(%rcx),%ymm1 + vmovdqa 320-256(%rcx),%ymm2 + vmovdqa 352-256(%rcx),%ymm3 + vmovdqa 384-512(%rax),%ymm12 + vmovdqa 416-512(%rax),%ymm13 + vmovdqa 448-512(%rax),%ymm14 + vmovdqa 480-512(%rax),%ymm15 + vmovdqa 512-512(%rax),%ymm4 + vmovdqa 544-512(%rax),%ymm5 + vmovdqa 
576-512(%rax),%ymm6 + vmovdqa 608-512(%rax),%ymm7 + vpaddd L$eight(%rip),%ymm4,%ymm4 + +L$oop_enter8x: + vmovdqa %ymm14,64(%rsp) + vmovdqa %ymm15,96(%rsp) + vbroadcasti128 (%r10),%ymm15 + vmovdqa %ymm4,512-512(%rax) + movl $10,%eax + jmp L$oop8x + +.p2align 5 +L$oop8x: + vpaddd %ymm0,%ymm8,%ymm8 + vpxor %ymm4,%ymm8,%ymm4 + vpshufb %ymm15,%ymm4,%ymm4 + vpaddd %ymm1,%ymm9,%ymm9 + vpxor %ymm5,%ymm9,%ymm5 + vpshufb %ymm15,%ymm5,%ymm5 + vpaddd %ymm4,%ymm12,%ymm12 + vpxor %ymm0,%ymm12,%ymm0 + vpslld $12,%ymm0,%ymm14 + vpsrld $20,%ymm0,%ymm0 + vpor %ymm0,%ymm14,%ymm0 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm5,%ymm13,%ymm13 + vpxor %ymm1,%ymm13,%ymm1 + vpslld $12,%ymm1,%ymm15 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm1,%ymm15,%ymm1 + vpaddd %ymm0,%ymm8,%ymm8 + vpxor %ymm4,%ymm8,%ymm4 + vpshufb %ymm14,%ymm4,%ymm4 + vpaddd %ymm1,%ymm9,%ymm9 + vpxor %ymm5,%ymm9,%ymm5 + vpshufb %ymm14,%ymm5,%ymm5 + vpaddd %ymm4,%ymm12,%ymm12 + vpxor %ymm0,%ymm12,%ymm0 + vpslld $7,%ymm0,%ymm15 + vpsrld $25,%ymm0,%ymm0 + vpor %ymm0,%ymm15,%ymm0 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm5,%ymm13,%ymm13 + vpxor %ymm1,%ymm13,%ymm1 + vpslld $7,%ymm1,%ymm14 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm1,%ymm14,%ymm1 + vmovdqa %ymm12,0(%rsp) + vmovdqa %ymm13,32(%rsp) + vmovdqa 64(%rsp),%ymm12 + vmovdqa 96(%rsp),%ymm13 + vpaddd %ymm2,%ymm10,%ymm10 + vpxor %ymm6,%ymm10,%ymm6 + vpshufb %ymm15,%ymm6,%ymm6 + vpaddd %ymm3,%ymm11,%ymm11 + vpxor %ymm7,%ymm11,%ymm7 + vpshufb %ymm15,%ymm7,%ymm7 + vpaddd %ymm6,%ymm12,%ymm12 + vpxor %ymm2,%ymm12,%ymm2 + vpslld $12,%ymm2,%ymm14 + vpsrld $20,%ymm2,%ymm2 + vpor %ymm2,%ymm14,%ymm2 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm7,%ymm13,%ymm13 + vpxor %ymm3,%ymm13,%ymm3 + vpslld $12,%ymm3,%ymm15 + vpsrld $20,%ymm3,%ymm3 + vpor %ymm3,%ymm15,%ymm3 + vpaddd %ymm2,%ymm10,%ymm10 + vpxor %ymm6,%ymm10,%ymm6 + vpshufb %ymm14,%ymm6,%ymm6 + vpaddd %ymm3,%ymm11,%ymm11 + vpxor %ymm7,%ymm11,%ymm7 + vpshufb %ymm14,%ymm7,%ymm7 + vpaddd %ymm6,%ymm12,%ymm12 + vpxor %ymm2,%ymm12,%ymm2 + vpslld $7,%ymm2,%ymm15 + vpsrld $25,%ymm2,%ymm2 + vpor %ymm2,%ymm15,%ymm2 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm7,%ymm13,%ymm13 + vpxor %ymm3,%ymm13,%ymm3 + vpslld $7,%ymm3,%ymm14 + vpsrld $25,%ymm3,%ymm3 + vpor %ymm3,%ymm14,%ymm3 + vpaddd %ymm1,%ymm8,%ymm8 + vpxor %ymm7,%ymm8,%ymm7 + vpshufb %ymm15,%ymm7,%ymm7 + vpaddd %ymm2,%ymm9,%ymm9 + vpxor %ymm4,%ymm9,%ymm4 + vpshufb %ymm15,%ymm4,%ymm4 + vpaddd %ymm7,%ymm12,%ymm12 + vpxor %ymm1,%ymm12,%ymm1 + vpslld $12,%ymm1,%ymm14 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm1,%ymm14,%ymm1 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm4,%ymm13,%ymm13 + vpxor %ymm2,%ymm13,%ymm2 + vpslld $12,%ymm2,%ymm15 + vpsrld $20,%ymm2,%ymm2 + vpor %ymm2,%ymm15,%ymm2 + vpaddd %ymm1,%ymm8,%ymm8 + vpxor %ymm7,%ymm8,%ymm7 + vpshufb %ymm14,%ymm7,%ymm7 + vpaddd %ymm2,%ymm9,%ymm9 + vpxor %ymm4,%ymm9,%ymm4 + vpshufb %ymm14,%ymm4,%ymm4 + vpaddd %ymm7,%ymm12,%ymm12 + vpxor %ymm1,%ymm12,%ymm1 + vpslld $7,%ymm1,%ymm15 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm1,%ymm15,%ymm1 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm4,%ymm13,%ymm13 + vpxor %ymm2,%ymm13,%ymm2 + vpslld $7,%ymm2,%ymm14 + vpsrld $25,%ymm2,%ymm2 + vpor %ymm2,%ymm14,%ymm2 + vmovdqa %ymm12,64(%rsp) + vmovdqa %ymm13,96(%rsp) + vmovdqa 0(%rsp),%ymm12 + vmovdqa 32(%rsp),%ymm13 + vpaddd %ymm3,%ymm10,%ymm10 + vpxor %ymm5,%ymm10,%ymm5 + vpshufb %ymm15,%ymm5,%ymm5 + vpaddd %ymm0,%ymm11,%ymm11 + vpxor %ymm6,%ymm11,%ymm6 + vpshufb %ymm15,%ymm6,%ymm6 + vpaddd %ymm5,%ymm12,%ymm12 + vpxor %ymm3,%ymm12,%ymm3 + vpslld $12,%ymm3,%ymm14 + vpsrld $20,%ymm3,%ymm3 + vpor %ymm3,%ymm14,%ymm3 + 
vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm6,%ymm13,%ymm13 + vpxor %ymm0,%ymm13,%ymm0 + vpslld $12,%ymm0,%ymm15 + vpsrld $20,%ymm0,%ymm0 + vpor %ymm0,%ymm15,%ymm0 + vpaddd %ymm3,%ymm10,%ymm10 + vpxor %ymm5,%ymm10,%ymm5 + vpshufb %ymm14,%ymm5,%ymm5 + vpaddd %ymm0,%ymm11,%ymm11 + vpxor %ymm6,%ymm11,%ymm6 + vpshufb %ymm14,%ymm6,%ymm6 + vpaddd %ymm5,%ymm12,%ymm12 + vpxor %ymm3,%ymm12,%ymm3 + vpslld $7,%ymm3,%ymm15 + vpsrld $25,%ymm3,%ymm3 + vpor %ymm3,%ymm15,%ymm3 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm6,%ymm13,%ymm13 + vpxor %ymm0,%ymm13,%ymm0 + vpslld $7,%ymm0,%ymm14 + vpsrld $25,%ymm0,%ymm0 + vpor %ymm0,%ymm14,%ymm0 + decl %eax + jnz L$oop8x + + leaq 512(%rsp),%rax + vpaddd 128-256(%rcx),%ymm8,%ymm8 + vpaddd 160-256(%rcx),%ymm9,%ymm9 + vpaddd 192-256(%rcx),%ymm10,%ymm10 + vpaddd 224-256(%rcx),%ymm11,%ymm11 + + vpunpckldq %ymm9,%ymm8,%ymm14 + vpunpckldq %ymm11,%ymm10,%ymm15 + vpunpckhdq %ymm9,%ymm8,%ymm8 + vpunpckhdq %ymm11,%ymm10,%ymm10 + vpunpcklqdq %ymm15,%ymm14,%ymm9 + vpunpckhqdq %ymm15,%ymm14,%ymm14 + vpunpcklqdq %ymm10,%ymm8,%ymm11 + vpunpckhqdq %ymm10,%ymm8,%ymm8 + vpaddd 256-256(%rcx),%ymm0,%ymm0 + vpaddd 288-256(%rcx),%ymm1,%ymm1 + vpaddd 320-256(%rcx),%ymm2,%ymm2 + vpaddd 352-256(%rcx),%ymm3,%ymm3 + + vpunpckldq %ymm1,%ymm0,%ymm10 + vpunpckldq %ymm3,%ymm2,%ymm15 + vpunpckhdq %ymm1,%ymm0,%ymm0 + vpunpckhdq %ymm3,%ymm2,%ymm2 + vpunpcklqdq %ymm15,%ymm10,%ymm1 + vpunpckhqdq %ymm15,%ymm10,%ymm10 + vpunpcklqdq %ymm2,%ymm0,%ymm3 + vpunpckhqdq %ymm2,%ymm0,%ymm0 + vperm2i128 $0x20,%ymm1,%ymm9,%ymm15 + vperm2i128 $0x31,%ymm1,%ymm9,%ymm1 + vperm2i128 $0x20,%ymm10,%ymm14,%ymm9 + vperm2i128 $0x31,%ymm10,%ymm14,%ymm10 + vperm2i128 $0x20,%ymm3,%ymm11,%ymm14 + vperm2i128 $0x31,%ymm3,%ymm11,%ymm3 + vperm2i128 $0x20,%ymm0,%ymm8,%ymm11 + vperm2i128 $0x31,%ymm0,%ymm8,%ymm0 + vmovdqa %ymm15,0(%rsp) + vmovdqa %ymm9,32(%rsp) + vmovdqa 64(%rsp),%ymm15 + vmovdqa 96(%rsp),%ymm9 + + vpaddd 384-512(%rax),%ymm12,%ymm12 + vpaddd 416-512(%rax),%ymm13,%ymm13 + vpaddd 448-512(%rax),%ymm15,%ymm15 + vpaddd 480-512(%rax),%ymm9,%ymm9 + + vpunpckldq %ymm13,%ymm12,%ymm2 + vpunpckldq %ymm9,%ymm15,%ymm8 + vpunpckhdq %ymm13,%ymm12,%ymm12 + vpunpckhdq %ymm9,%ymm15,%ymm15 + vpunpcklqdq %ymm8,%ymm2,%ymm13 + vpunpckhqdq %ymm8,%ymm2,%ymm2 + vpunpcklqdq %ymm15,%ymm12,%ymm9 + vpunpckhqdq %ymm15,%ymm12,%ymm12 + vpaddd 512-512(%rax),%ymm4,%ymm4 + vpaddd 544-512(%rax),%ymm5,%ymm5 + vpaddd 576-512(%rax),%ymm6,%ymm6 + vpaddd 608-512(%rax),%ymm7,%ymm7 + + vpunpckldq %ymm5,%ymm4,%ymm15 + vpunpckldq %ymm7,%ymm6,%ymm8 + vpunpckhdq %ymm5,%ymm4,%ymm4 + vpunpckhdq %ymm7,%ymm6,%ymm6 + vpunpcklqdq %ymm8,%ymm15,%ymm5 + vpunpckhqdq %ymm8,%ymm15,%ymm15 + vpunpcklqdq %ymm6,%ymm4,%ymm7 + vpunpckhqdq %ymm6,%ymm4,%ymm4 + vperm2i128 $0x20,%ymm5,%ymm13,%ymm8 + vperm2i128 $0x31,%ymm5,%ymm13,%ymm5 + vperm2i128 $0x20,%ymm15,%ymm2,%ymm13 + vperm2i128 $0x31,%ymm15,%ymm2,%ymm15 + vperm2i128 $0x20,%ymm7,%ymm9,%ymm2 + vperm2i128 $0x31,%ymm7,%ymm9,%ymm7 + vperm2i128 $0x20,%ymm4,%ymm12,%ymm9 + vperm2i128 $0x31,%ymm4,%ymm12,%ymm4 + vmovdqa 0(%rsp),%ymm6 + vmovdqa 32(%rsp),%ymm12 + + cmpq $512,%rdx + jb L$tail8x + + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + leaq 128(%rsi),%rsi + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + leaq 128(%rdi),%rdi + + vpxor 0(%rsi),%ymm12,%ymm12 + vpxor 32(%rsi),%ymm13,%ymm13 + vpxor 64(%rsi),%ymm10,%ymm10 + vpxor 96(%rsi),%ymm15,%ymm15 + leaq 128(%rsi),%rsi + vmovdqu %ymm12,0(%rdi) + vmovdqu 
%ymm13,32(%rdi) + vmovdqu %ymm10,64(%rdi) + vmovdqu %ymm15,96(%rdi) + leaq 128(%rdi),%rdi + + vpxor 0(%rsi),%ymm14,%ymm14 + vpxor 32(%rsi),%ymm2,%ymm2 + vpxor 64(%rsi),%ymm3,%ymm3 + vpxor 96(%rsi),%ymm7,%ymm7 + leaq 128(%rsi),%rsi + vmovdqu %ymm14,0(%rdi) + vmovdqu %ymm2,32(%rdi) + vmovdqu %ymm3,64(%rdi) + vmovdqu %ymm7,96(%rdi) + leaq 128(%rdi),%rdi + + vpxor 0(%rsi),%ymm11,%ymm11 + vpxor 32(%rsi),%ymm9,%ymm9 + vpxor 64(%rsi),%ymm0,%ymm0 + vpxor 96(%rsi),%ymm4,%ymm4 + leaq 128(%rsi),%rsi + vmovdqu %ymm11,0(%rdi) + vmovdqu %ymm9,32(%rdi) + vmovdqu %ymm0,64(%rdi) + vmovdqu %ymm4,96(%rdi) + leaq 128(%rdi),%rdi + + subq $512,%rdx + jnz L$oop_outer8x + + jmp L$done8x + +L$tail8x: + cmpq $448,%rdx + jae L$448_or_more8x + cmpq $384,%rdx + jae L$384_or_more8x + cmpq $320,%rdx + jae L$320_or_more8x + cmpq $256,%rdx + jae L$256_or_more8x + cmpq $192,%rdx + jae L$192_or_more8x + cmpq $128,%rdx + jae L$128_or_more8x + cmpq $64,%rdx + jae L$64_or_more8x + + xorq %r10,%r10 + vmovdqa %ymm6,0(%rsp) + vmovdqa %ymm8,32(%rsp) + jmp L$oop_tail8x + +.p2align 5 +L$64_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + je L$done8x + + leaq 64(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm1,0(%rsp) + leaq 64(%rdi),%rdi + subq $64,%rdx + vmovdqa %ymm5,32(%rsp) + jmp L$oop_tail8x + +.p2align 5 +L$128_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + je L$done8x + + leaq 128(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm12,0(%rsp) + leaq 128(%rdi),%rdi + subq $128,%rdx + vmovdqa %ymm13,32(%rsp) + jmp L$oop_tail8x + +.p2align 5 +L$192_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + je L$done8x + + leaq 192(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm10,0(%rsp) + leaq 192(%rdi),%rdi + subq $192,%rdx + vmovdqa %ymm15,32(%rsp) + jmp L$oop_tail8x + +.p2align 5 +L$256_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + je L$done8x + + leaq 256(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm14,0(%rsp) + leaq 256(%rdi),%rdi + subq $256,%rdx + vmovdqa %ymm2,32(%rsp) + jmp L$oop_tail8x + +.p2align 5 +L$320_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vpxor 256(%rsi),%ymm14,%ymm14 + vpxor 288(%rsi),%ymm2,%ymm2 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + vmovdqu %ymm14,256(%rdi) + vmovdqu %ymm2,288(%rdi) + je L$done8x + + leaq 
320(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm3,0(%rsp) + leaq 320(%rdi),%rdi + subq $320,%rdx + vmovdqa %ymm7,32(%rsp) + jmp L$oop_tail8x + +.p2align 5 +L$384_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vpxor 256(%rsi),%ymm14,%ymm14 + vpxor 288(%rsi),%ymm2,%ymm2 + vpxor 320(%rsi),%ymm3,%ymm3 + vpxor 352(%rsi),%ymm7,%ymm7 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + vmovdqu %ymm14,256(%rdi) + vmovdqu %ymm2,288(%rdi) + vmovdqu %ymm3,320(%rdi) + vmovdqu %ymm7,352(%rdi) + je L$done8x + + leaq 384(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm11,0(%rsp) + leaq 384(%rdi),%rdi + subq $384,%rdx + vmovdqa %ymm9,32(%rsp) + jmp L$oop_tail8x + +.p2align 5 +L$448_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vpxor 256(%rsi),%ymm14,%ymm14 + vpxor 288(%rsi),%ymm2,%ymm2 + vpxor 320(%rsi),%ymm3,%ymm3 + vpxor 352(%rsi),%ymm7,%ymm7 + vpxor 384(%rsi),%ymm11,%ymm11 + vpxor 416(%rsi),%ymm9,%ymm9 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + vmovdqu %ymm14,256(%rdi) + vmovdqu %ymm2,288(%rdi) + vmovdqu %ymm3,320(%rdi) + vmovdqu %ymm7,352(%rdi) + vmovdqu %ymm11,384(%rdi) + vmovdqu %ymm9,416(%rdi) + je L$done8x + + leaq 448(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm0,0(%rsp) + leaq 448(%rdi),%rdi + subq $448,%rdx + vmovdqa %ymm4,32(%rsp) + +L$oop_tail8x: + movzbl (%rsi,%r10,1),%eax + movzbl (%rsp,%r10,1),%ecx + leaq 1(%r10),%r10 + xorl %ecx,%eax + movb %al,-1(%rdi,%r10,1) + decq %rdx + jnz L$oop_tail8x + +L$done8x: + vzeroall + movq 640(%rsp),%rsp + .byte 0xf3,0xc3 + +#endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/ec/p256-x86_64-asm.S b/packager/third_party/boringssl/mac-x86_64/crypto/ec/p256-x86_64-asm.S new file mode 100644 index 0000000000..1cd0cc3f5c --- /dev/null +++ b/packager/third_party/boringssl/mac-x86_64/crypto/ec/p256-x86_64-asm.S @@ -0,0 +1,1788 @@ +#if defined(__x86_64__) +.text + + + +.p2align 6 +L$poly: +.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001 + +L$One: +.long 1,1,1,1,1,1,1,1 +L$Two: +.long 2,2,2,2,2,2,2,2 +L$Three: +.long 3,3,3,3,3,3,3,3 +L$ONE_mont: +.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe + + +.p2align 6 +ecp_nistz256_mul_by_2: + pushq %r12 + pushq %r13 + + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + addq %r8,%r8 + movq 16(%rsi),%r10 + adcq %r9,%r9 + movq 24(%rsi),%r11 + leaq L$poly(%rip),%rsi + movq %r8,%rax + adcq %r10,%r10 + adcq %r11,%r11 + movq %r9,%rdx + sbbq %r13,%r13 + + subq 0(%rsi),%r8 + movq %r10,%rcx + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r11,%r12 + sbbq 24(%rsi),%r11 + testq %r13,%r13 + + cmovzq %rax,%r8 + cmovzq %rdx,%r9 + movq %r8,0(%rdi) + cmovzq %rcx,%r10 + movq %r9,8(%rdi) + cmovzq %r12,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + popq %r13 + popq %r12 + .byte 0xf3,0xc3 + + + + +.globl _ecp_nistz256_neg +.private_extern 
_ecp_nistz256_neg + +.p2align 5 +_ecp_nistz256_neg: + pushq %r12 + pushq %r13 + + xorq %r8,%r8 + xorq %r9,%r9 + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r13,%r13 + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r8,%rax + sbbq 24(%rsi),%r11 + leaq L$poly(%rip),%rsi + movq %r9,%rdx + sbbq $0,%r13 + + addq 0(%rsi),%r8 + movq %r10,%rcx + adcq 8(%rsi),%r9 + adcq 16(%rsi),%r10 + movq %r11,%r12 + adcq 24(%rsi),%r11 + testq %r13,%r13 + + cmovzq %rax,%r8 + cmovzq %rdx,%r9 + movq %r8,0(%rdi) + cmovzq %rcx,%r10 + movq %r9,8(%rdi) + cmovzq %r12,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + popq %r13 + popq %r12 + .byte 0xf3,0xc3 + + + + + + + +.globl _ecp_nistz256_mul_mont +.private_extern _ecp_nistz256_mul_mont + +.p2align 5 +_ecp_nistz256_mul_mont: +L$mul_mont: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rdx,%rbx + movq 0(%rdx),%rax + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + + call __ecp_nistz256_mul_montq +L$mul_mont_done: + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + .byte 0xf3,0xc3 + + + +.p2align 5 +__ecp_nistz256_mul_montq: + + + movq %rax,%rbp + mulq %r9 + movq L$poly+8(%rip),%r14 + movq %rax,%r8 + movq %rbp,%rax + movq %rdx,%r9 + + mulq %r10 + movq L$poly+24(%rip),%r15 + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r11 + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r12 + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + xorq %r13,%r13 + movq %rdx,%r12 + + + + + + + + + + + movq %r8,%rbp + shlq $32,%r8 + mulq %r15 + shrq $32,%rbp + addq %r8,%r9 + adcq %rbp,%r10 + adcq %rax,%r11 + movq 8(%rbx),%rax + adcq %rdx,%r12 + adcq $0,%r13 + xorq %r8,%r8 + + + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq %rdx,%r13 + adcq $0,%r8 + + + + movq %r9,%rbp + shlq $32,%r9 + mulq %r15 + shrq $32,%rbp + addq %r9,%r10 + adcq %rbp,%r11 + adcq %rax,%r12 + movq 16(%rbx),%rax + adcq %rdx,%r13 + adcq $0,%r8 + xorq %r9,%r9 + + + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + + + movq %r10,%rbp + shlq $32,%r10 + mulq %r15 + shrq $32,%rbp + addq %r10,%r11 + adcq %rbp,%r12 + adcq %rax,%r13 + movq 24(%rbx),%rax + adcq %rdx,%r8 + adcq $0,%r9 + xorq %r10,%r10 + + + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + + + movq %r11,%rbp + shlq $32,%r11 + mulq %r15 + shrq $32,%rbp + addq %r11,%r12 + 
adcq %rbp,%r13 + movq %r12,%rcx + adcq %rax,%r8 + adcq %rdx,%r9 + movq %r13,%rbp + adcq $0,%r10 + + + + subq $-1,%r12 + movq %r8,%rbx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%rdx + sbbq %r15,%r9 + sbbq $0,%r10 + + cmovcq %rcx,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rbx,%r8 + movq %r13,8(%rdi) + cmovcq %rdx,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 + + + + + + + + + +.globl _ecp_nistz256_sqr_mont +.private_extern _ecp_nistz256_sqr_mont + +.p2align 5 +_ecp_nistz256_sqr_mont: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq 0(%rsi),%rax + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + + call __ecp_nistz256_sqr_montq +L$sqr_mont_done: + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + .byte 0xf3,0xc3 + + + +.p2align 5 +__ecp_nistz256_sqr_montq: + movq %rax,%r13 + mulq %r14 + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + + mulq %r13 + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r13 + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r12 + + + mulq %r14 + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq %r14 + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + + mulq %r15 + xorq %r15,%r15 + addq %rax,%r13 + movq 0(%rsi),%rax + movq %rdx,%r14 + adcq $0,%r14 + + addq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + adcq %r14,%r14 + adcq $0,%r15 + + mulq %rax + movq %rax,%r8 + movq 8(%rsi),%rax + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r9 + adcq %rax,%r10 + movq 16(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r11 + adcq %rax,%r12 + movq 24(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r13 + adcq %rax,%r14 + movq %r8,%rax + adcq %rdx,%r15 + + movq L$poly+8(%rip),%rsi + movq L$poly+24(%rip),%rbp + + + + + movq %r8,%rcx + shlq $32,%r8 + mulq %rbp + shrq $32,%rcx + addq %r8,%r9 + adcq %rcx,%r10 + adcq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + + + + movq %r9,%rcx + shlq $32,%r9 + movq %rdx,%r8 + mulq %rbp + shrq $32,%rcx + addq %r9,%r10 + adcq %rcx,%r11 + adcq %rax,%r8 + movq %r10,%rax + adcq $0,%rdx + + + + movq %r10,%rcx + shlq $32,%r10 + movq %rdx,%r9 + mulq %rbp + shrq $32,%rcx + addq %r10,%r11 + adcq %rcx,%r8 + adcq %rax,%r9 + movq %r11,%rax + adcq $0,%rdx + + + + movq %r11,%rcx + shlq $32,%r11 + movq %rdx,%r10 + mulq %rbp + shrq $32,%rcx + addq %r11,%r8 + adcq %rcx,%r9 + adcq %rax,%r10 + adcq $0,%rdx + xorq %r11,%r11 + + + + addq %r8,%r12 + adcq %r9,%r13 + movq %r12,%r8 + adcq %r10,%r14 + adcq %rdx,%r15 + movq %r13,%r9 + adcq $0,%r11 + + subq $-1,%r12 + movq %r14,%r10 + sbbq %rsi,%r13 + sbbq $0,%r14 + movq %r15,%rcx + sbbq %rbp,%r15 + sbbq $0,%r11 + + cmovcq %r8,%r12 + cmovcq %r9,%r13 + movq %r12,0(%rdi) + cmovcq %r10,%r14 + movq %r13,8(%rdi) + cmovcq %rcx,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + + .byte 0xf3,0xc3 + + + + + + + +.globl _ecp_nistz256_from_mont +.private_extern _ecp_nistz256_from_mont + +.p2align 5 +_ecp_nistz256_from_mont: + pushq %r12 + pushq %r13 + + movq 0(%rsi),%rax + movq L$poly+24(%rip),%r13 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + movq %rax,%r8 + movq L$poly+8(%rip),%r12 + + + + movq %rax,%rcx + shlq $32,%r8 + mulq %r13 + shrq $32,%rcx + addq %r8,%r9 + adcq %rcx,%r10 + adcq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + + + + movq %r9,%rcx + shlq $32,%r9 + movq %rdx,%r8 + mulq %r13 + shrq $32,%rcx + addq %r9,%r10 + adcq %rcx,%r11 + adcq %rax,%r8 + 
movq %r10,%rax + adcq $0,%rdx + + + + movq %r10,%rcx + shlq $32,%r10 + movq %rdx,%r9 + mulq %r13 + shrq $32,%rcx + addq %r10,%r11 + adcq %rcx,%r8 + adcq %rax,%r9 + movq %r11,%rax + adcq $0,%rdx + + + + movq %r11,%rcx + shlq $32,%r11 + movq %rdx,%r10 + mulq %r13 + shrq $32,%rcx + addq %r11,%r8 + adcq %rcx,%r9 + movq %r8,%rcx + adcq %rax,%r10 + movq %r9,%rsi + adcq $0,%rdx + + subq $-1,%r8 + movq %r10,%rax + sbbq %r12,%r9 + sbbq $0,%r10 + movq %rdx,%r11 + sbbq %r13,%rdx + sbbq %r13,%r13 + + cmovnzq %rcx,%r8 + cmovnzq %rsi,%r9 + movq %r8,0(%rdi) + cmovnzq %rax,%r10 + movq %r9,8(%rdi) + cmovzq %rdx,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + popq %r13 + popq %r12 + .byte 0xf3,0xc3 + + + +.globl _ecp_nistz256_select_w5 +.private_extern _ecp_nistz256_select_w5 + +.p2align 5 +_ecp_nistz256_select_w5: + movdqa L$One(%rip),%xmm0 + movd %edx,%xmm1 + + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + + movdqa %xmm0,%xmm8 + pshufd $0,%xmm1,%xmm1 + + movq $16,%rax +L$select_loop_sse_w5: + + movdqa %xmm8,%xmm15 + paddd %xmm0,%xmm8 + pcmpeqd %xmm1,%xmm15 + + movdqa 0(%rsi),%xmm9 + movdqa 16(%rsi),%xmm10 + movdqa 32(%rsi),%xmm11 + movdqa 48(%rsi),%xmm12 + movdqa 64(%rsi),%xmm13 + movdqa 80(%rsi),%xmm14 + leaq 96(%rsi),%rsi + + pand %xmm15,%xmm9 + pand %xmm15,%xmm10 + por %xmm9,%xmm2 + pand %xmm15,%xmm11 + por %xmm10,%xmm3 + pand %xmm15,%xmm12 + por %xmm11,%xmm4 + pand %xmm15,%xmm13 + por %xmm12,%xmm5 + pand %xmm15,%xmm14 + por %xmm13,%xmm6 + por %xmm14,%xmm7 + + decq %rax + jnz L$select_loop_sse_w5 + + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + movdqu %xmm4,32(%rdi) + movdqu %xmm5,48(%rdi) + movdqu %xmm6,64(%rdi) + movdqu %xmm7,80(%rdi) + .byte 0xf3,0xc3 + + + + +.globl _ecp_nistz256_select_w7 +.private_extern _ecp_nistz256_select_w7 + +.p2align 5 +_ecp_nistz256_select_w7: + movdqa L$One(%rip),%xmm8 + movd %edx,%xmm1 + + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + + movdqa %xmm8,%xmm0 + pshufd $0,%xmm1,%xmm1 + movq $64,%rax + +L$select_loop_sse_w7: + movdqa %xmm8,%xmm15 + paddd %xmm0,%xmm8 + movdqa 0(%rsi),%xmm9 + movdqa 16(%rsi),%xmm10 + pcmpeqd %xmm1,%xmm15 + movdqa 32(%rsi),%xmm11 + movdqa 48(%rsi),%xmm12 + leaq 64(%rsi),%rsi + + pand %xmm15,%xmm9 + pand %xmm15,%xmm10 + por %xmm9,%xmm2 + pand %xmm15,%xmm11 + por %xmm10,%xmm3 + pand %xmm15,%xmm12 + por %xmm11,%xmm4 + prefetcht0 255(%rsi) + por %xmm12,%xmm5 + + decq %rax + jnz L$select_loop_sse_w7 + + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + movdqu %xmm4,32(%rdi) + movdqu %xmm5,48(%rdi) + .byte 0xf3,0xc3 + +.globl _ecp_nistz256_avx2_select_w7 +.private_extern _ecp_nistz256_avx2_select_w7 + +.p2align 5 +_ecp_nistz256_avx2_select_w7: +.byte 0x0f,0x0b + .byte 0xf3,0xc3 + + +.p2align 5 +__ecp_nistz256_add_toq: + addq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + movq %r12,%rax + adcq 16(%rbx),%r8 + adcq 24(%rbx),%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + cmovzq %rbp,%r13 + movq %r12,0(%rdi) + cmovzq %rcx,%r8 + movq %r13,8(%rdi) + cmovzq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 + + + +.p2align 5 +__ecp_nistz256_sub_fromq: + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r13 + movq %r12,%rax + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + addq $-1,%r12 + movq %r8,%rcx + adcq %r14,%r13 + adcq $0,%r8 + movq %r9,%r10 + adcq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + cmovzq 
%rbp,%r13 + movq %r12,0(%rdi) + cmovzq %rcx,%r8 + movq %r13,8(%rdi) + cmovzq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 + + + +.p2align 5 +__ecp_nistz256_subq: + subq %r12,%rax + sbbq %r13,%rbp + movq %rax,%r12 + sbbq %r8,%rcx + sbbq %r9,%r10 + movq %rbp,%r13 + sbbq %r11,%r11 + + addq $-1,%rax + movq %rcx,%r8 + adcq %r14,%rbp + adcq $0,%rcx + movq %r10,%r9 + adcq %r15,%r10 + testq %r11,%r11 + + cmovnzq %rax,%r12 + cmovnzq %rbp,%r13 + cmovnzq %rcx,%r8 + cmovnzq %r10,%r9 + + .byte 0xf3,0xc3 + + + +.p2align 5 +__ecp_nistz256_mul_by_2q: + addq %r12,%r12 + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + cmovzq %rbp,%r13 + movq %r12,0(%rdi) + cmovzq %rcx,%r8 + movq %r13,8(%rdi) + cmovzq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 + +.globl _ecp_nistz256_point_double +.private_extern _ecp_nistz256_point_double + +.p2align 5 +_ecp_nistz256_point_double: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $160+8,%rsp + +L$point_double_shortcutq: + movdqu 0(%rsi),%xmm0 + movq %rsi,%rbx + movdqu 16(%rsi),%xmm1 + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r8 + movq 32+24(%rsi),%r9 + movq L$poly+8(%rip),%r14 + movq L$poly+24(%rip),%r15 + movdqa %xmm0,96(%rsp) + movdqa %xmm1,96+16(%rsp) + leaq 32(%rdi),%r10 + leaq 64(%rdi),%r11 +.byte 102,72,15,110,199 +.byte 102,73,15,110,202 +.byte 102,73,15,110,211 + + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + leaq 64-0(%rsi),%rsi + leaq 64(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 0(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 32(%rbx),%rax + movq 64+0(%rbx),%r9 + movq 64+8(%rbx),%r10 + movq 64+16(%rbx),%r11 + movq 64+24(%rbx),%r12 + leaq 64-0(%rbx),%rsi + leaq 32(%rbx),%rbx +.byte 102,72,15,126,215 + call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_by_2q + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_toq + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 +.byte 102,72,15,126,207 + call __ecp_nistz256_sqr_montq + xorq %r9,%r9 + movq %r12,%rax + addq $-1,%r12 + movq %r13,%r10 + adcq %rsi,%r13 + movq %r14,%rcx + adcq $0,%r14 + movq %r15,%r8 + adcq %rbp,%r15 + adcq $0,%r9 + xorq %rsi,%rsi + testq $1,%rax + + cmovzq %rax,%r12 + cmovzq %r10,%r13 + cmovzq %rcx,%r14 + cmovzq %r8,%r15 + cmovzq %rsi,%r9 + + movq %r13,%rax + shrq $1,%r12 + shlq $63,%rax + movq %r14,%r10 + shrq $1,%r13 + orq %rax,%r12 + shlq $63,%r10 + movq %r15,%rcx + shrq $1,%r14 + orq %r10,%r13 + shlq $63,%rcx + movq %r12,0(%rdi) + shrq $1,%r15 + movq %r13,8(%rdi) + shlq $63,%r9 + orq %rcx,%r14 + orq %r9,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + movq 64(%rsp),%rax + leaq 64(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call 
__ecp_nistz256_mul_montq + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + leaq 32(%rsp),%rbx + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_toq + + movq 96(%rsp),%rax + leaq 96(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + movq 0+32(%rsp),%rax + movq 8+32(%rsp),%r14 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r15 + movq 24+32(%rsp),%r8 +.byte 102,72,15,126,199 + call __ecp_nistz256_sqr_montq + + leaq 128(%rsp),%rbx + movq %r14,%r8 + movq %r15,%r9 + movq %rsi,%r14 + movq %rbp,%r15 + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 0(%rsp),%rdi + call __ecp_nistz256_subq + + movq 32(%rsp),%rax + leaq 32(%rsp),%rbx + movq %r12,%r14 + xorl %ecx,%ecx + movq %r12,0+0(%rsp) + movq %r13,%r10 + movq %r13,0+8(%rsp) + cmovzq %r8,%r11 + movq %r8,0+16(%rsp) + leaq 0-0(%rsp),%rsi + cmovzq %r9,%r12 + movq %r9,0+24(%rsp) + movq %r14,%r9 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + +.byte 102,72,15,126,203 +.byte 102,72,15,126,207 + call __ecp_nistz256_sub_fromq + + addq $160+8,%rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + .byte 0xf3,0xc3 + +.globl _ecp_nistz256_point_add +.private_extern _ecp_nistz256_point_add + +.p2align 5 +_ecp_nistz256_point_add: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $576+8,%rsp + + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq %rsi,%rbx + movq %rdx,%rsi + movdqa %xmm0,384(%rsp) + movdqa %xmm1,384+16(%rsp) + por %xmm0,%xmm1 + movdqa %xmm2,416(%rsp) + movdqa %xmm3,416+16(%rsp) + por %xmm2,%xmm3 + movdqa %xmm4,448(%rsp) + movdqa %xmm5,448+16(%rsp) + por %xmm1,%xmm3 + + movdqu 0(%rsi),%xmm0 + pshufd $0xb1,%xmm3,%xmm5 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rsi),%xmm3 + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,480(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,480+16(%rsp) + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + movdqa %xmm2,512(%rsp) + movdqa %xmm3,512+16(%rsp) + por %xmm2,%xmm3 + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm1,%xmm3 + + leaq 64-0(%rsi),%rsi + movq %rax,544+0(%rsp) + movq %r14,544+8(%rsp) + movq %r15,544+16(%rsp) + movq %r8,544+24(%rsp) + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm3,%xmm4 + por %xmm3,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + movq 64+0(%rbx),%rax + movq 64+8(%rbx),%r14 + movq 64+16(%rbx),%r15 + movq 64+24(%rbx),%r8 +.byte 102,72,15,110,203 + + leaq 64-0(%rbx),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 544(%rsp),%rax + leaq 544(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 416(%rsp),%rax + leaq 416(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq 
0+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 512(%rsp),%rax + leaq 512(%rsp),%rbx + movq 0+256(%rsp),%r9 + movq 8+256(%rsp),%r10 + leaq 0+256(%rsp),%rsi + movq 16+256(%rsp),%r11 + movq 24+256(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 224(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + orq %r13,%r12 + movdqa %xmm4,%xmm2 + orq %r8,%r12 + orq %r9,%r12 + por %xmm5,%xmm2 +.byte 102,73,15,110,220 + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 480(%rsp),%rax + leaq 480(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 160(%rsp),%rbx + leaq 0(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + orq %r13,%r12 + orq %r8,%r12 + orq %r9,%r12 + +.byte 0x3e + jnz L$add_proceedq +.byte 102,73,15,126,208 +.byte 102,73,15,126,217 + testq %r8,%r8 + jnz L$add_proceedq + testq %r9,%r9 + jz L$add_doubleq + +.byte 102,72,15,126,199 + pxor %xmm0,%xmm0 + movdqu %xmm0,0(%rdi) + movdqu %xmm0,16(%rdi) + movdqu %xmm0,32(%rdi) + movdqu %xmm0,48(%rdi) + movdqu %xmm0,64(%rdi) + movdqu %xmm0,80(%rdi) + jmp L$add_doneq + +.p2align 5 +L$add_doubleq: +.byte 102,72,15,126,206 +.byte 102,72,15,126,199 + addq $416,%rsp + jmp L$point_double_shortcutq + +.p2align 5 +L$add_proceedq: + movq 0+64(%rsp),%rax + movq 8+64(%rsp),%r14 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 544(%rsp),%rax + leaq 544(%rsp),%rbx + movq 0+352(%rsp),%r9 + movq 8+352(%rsp),%r10 + leaq 0+352(%rsp),%rsi + movq 16+352(%rsp),%r11 + movq 24+352(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 0(%rsp),%rax + leaq 0(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 160(%rsp),%rax + leaq 160(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montq + + + + + addq %r12,%r12 + leaq 96(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + movq 0(%rsi),%rax + cmovzq %rbp,%r13 + movq 8(%rsi),%rbp + cmovzq %rcx,%r8 + movq 16(%rsi),%rcx + cmovzq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subq + + leaq 128(%rsp),%rbx + leaq 288(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 192+0(%rsp),%rax + movq 192+8(%rsp),%rbp + movq 192+16(%rsp),%rcx + movq 192+24(%rsp),%r10 + leaq 320(%rsp),%rdi + + call __ecp_nistz256_subq + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq 
%r9,24(%rdi) + movq 128(%rsp),%rax + leaq 128(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq 0+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 320(%rsp),%rax + leaq 320(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 320(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 256(%rsp),%rbx + leaq 320(%rsp),%rdi + call __ecp_nistz256_sub_fromq + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 352(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 352+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 544(%rsp),%xmm2 + pand 544+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 480(%rsp),%xmm2 + pand 480+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 320(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 320+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 512(%rsp),%xmm2 + pand 512+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + +L$add_doneq: + addq $576+8,%rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + .byte 0xf3,0xc3 + +.globl _ecp_nistz256_point_add_affine +.private_extern _ecp_nistz256_point_add_affine + +.p2align 5 +_ecp_nistz256_point_add_affine: + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $480+8,%rsp + + movdqu 0(%rsi),%xmm0 + movq %rdx,%rbx + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,320(%rsp) + movdqa %xmm1,320+16(%rsp) + por %xmm0,%xmm1 + movdqa %xmm2,352(%rsp) + movdqa %xmm3,352+16(%rsp) + por %xmm2,%xmm3 + movdqa %xmm4,384(%rsp) + movdqa %xmm5,384+16(%rsp) + por %xmm1,%xmm3 + + movdqu 0(%rbx),%xmm0 + pshufd $0xb1,%xmm3,%xmm5 + movdqu 16(%rbx),%xmm1 + movdqu 32(%rbx),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rbx),%xmm3 + movdqa %xmm0,416(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,416+16(%rsp) + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + movdqa %xmm2,448(%rsp) + movdqa %xmm3,448+16(%rsp) + por %xmm2,%xmm3 + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm1,%xmm3 + + leaq 64-0(%rsi),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm3,%xmm4 + movq 0(%rbx),%rax + + movq %r12,%r9 + por %xmm3,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + movq %r13,%r10 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + movq %r14,%r11 + pcmpeqd %xmm3,%xmm4 + pshufd 
$0,%xmm4,%xmm4 + + leaq 32-0(%rsp),%rsi + movq %r15,%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 320(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 288(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 352(%rsp),%rbx + leaq 96(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+64(%rsp),%rax + movq 8+64(%rsp),%r14 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 128(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 0+96(%rsp),%rax + movq 8+96(%rsp),%r14 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r15 + movq 24+96(%rsp),%r8 + leaq 192(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 128(%rsp),%rax + leaq 128(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 320(%rsp),%rax + leaq 320(%rsp),%rbx + movq 0+128(%rsp),%r9 + movq 8+128(%rsp),%r10 + leaq 0+128(%rsp),%rsi + movq 16+128(%rsp),%r11 + movq 24+128(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + + + + addq %r12,%r12 + leaq 192(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + movq 0(%rsi),%rax + cmovzq %rbp,%r13 + movq 8(%rsi),%rbp + cmovzq %rcx,%r8 + movq 16(%rsi),%rcx + cmovzq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subq + + leaq 160(%rsp),%rbx + leaq 224(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 64(%rsp),%rdi + + call __ecp_nistz256_subq + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 352(%rsp),%rax + leaq 352(%rsp),%rbx + movq 0+160(%rsp),%r9 + movq 8+160(%rsp),%r10 + leaq 0+160(%rsp),%rsi + movq 16+160(%rsp),%r11 + movq 24+160(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 96(%rsp),%rax + leaq 96(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 64(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 32(%rsp),%rbx + leaq 256(%rsp),%rdi + call __ecp_nistz256_sub_fromq + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand L$ONE_mont(%rip),%xmm2 + pand L$ONE_mont+16(%rip),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 224(%rsp),%xmm0 + movdqa 
%xmm5,%xmm2 + pandn 224+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 320(%rsp),%xmm2 + pand 320+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 256(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 256+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 352(%rsp),%xmm2 + pand 352+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + + addq $480+8,%rsp + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + .byte 0xf3,0xc3 + +#endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/md5/md5-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/md5/md5-x86_64.S index 1e61479bc4..16fd2ccef8 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/md5/md5-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/md5/md5-x86_64.S @@ -495,14 +495,14 @@ L$loop: movl %ecx,%r11d addl %ecx,%ebx movl 0(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d xorl %edx,%r11d leal -198630844(%rax,%r10,1),%eax orl %ebx,%r11d xorl %ecx,%r11d addl %r11d,%eax movl 28(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $6,%eax xorl %ecx,%r11d addl %ebx,%eax @@ -511,7 +511,7 @@ L$loop: xorl %ebx,%r11d addl %r11d,%edx movl 56(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $10,%edx xorl %ebx,%r11d addl %eax,%edx @@ -520,7 +520,7 @@ L$loop: xorl %eax,%r11d addl %r11d,%ecx movl 20(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $15,%ecx xorl %eax,%r11d addl %edx,%ecx @@ -529,7 +529,7 @@ L$loop: xorl %edx,%r11d addl %r11d,%ebx movl 48(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $21,%ebx xorl %edx,%r11d addl %ecx,%ebx @@ -538,7 +538,7 @@ L$loop: xorl %ecx,%r11d addl %r11d,%eax movl 12(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $6,%eax xorl %ecx,%r11d addl %ebx,%eax @@ -547,7 +547,7 @@ L$loop: xorl %ebx,%r11d addl %r11d,%edx movl 40(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $10,%edx xorl %ebx,%r11d addl %eax,%edx @@ -556,7 +556,7 @@ L$loop: xorl %eax,%r11d addl %r11d,%ecx movl 4(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $15,%ecx xorl %eax,%r11d addl %edx,%ecx @@ -565,7 +565,7 @@ L$loop: xorl %edx,%r11d addl %r11d,%ebx movl 32(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $21,%ebx xorl %edx,%r11d addl %ecx,%ebx @@ -574,7 +574,7 @@ L$loop: xorl %ecx,%r11d addl %r11d,%eax movl 60(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $6,%eax xorl %ecx,%r11d addl %ebx,%eax @@ -583,7 +583,7 @@ L$loop: xorl %ebx,%r11d addl %r11d,%edx movl 24(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $10,%edx xorl %ebx,%r11d addl %eax,%edx @@ -592,7 +592,7 @@ L$loop: xorl %eax,%r11d addl %r11d,%ecx movl 52(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $15,%ecx xorl %eax,%r11d addl %edx,%ecx @@ -601,7 +601,7 @@ L$loop: xorl %edx,%r11d addl %r11d,%ebx movl 16(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll 
$21,%ebx xorl %edx,%r11d addl %ecx,%ebx @@ -610,7 +610,7 @@ L$loop: xorl %ecx,%r11d addl %r11d,%eax movl 44(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $6,%eax xorl %ecx,%r11d addl %ebx,%eax @@ -619,7 +619,7 @@ L$loop: xorl %ebx,%r11d addl %r11d,%edx movl 8(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $10,%edx xorl %ebx,%r11d addl %eax,%edx @@ -628,7 +628,7 @@ L$loop: xorl %eax,%r11d addl %r11d,%ecx movl 36(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $15,%ecx xorl %eax,%r11d addl %edx,%ecx @@ -637,7 +637,7 @@ L$loop: xorl %edx,%r11d addl %r11d,%ebx movl 0(%rsi),%r10d - movl $4294967295,%r11d + movl $0xffffffff,%r11d roll $21,%ebx xorl %edx,%r11d addl %ecx,%ebx diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/modes/ghash-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/modes/ghash-x86_64.S index 305a91cb60..1072c7fcd3 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/modes/ghash-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/modes/ghash-x86_64.S @@ -22,14 +22,14 @@ L$gmult_prologue: movq $14,%rcx movq 8(%rsi,%rax,1),%r8 movq (%rsi,%rax,1),%r9 - andb $240,%bl + andb $0xf0,%bl movq %r8,%rdx jmp L$oop1 .p2align 4 L$oop1: shrq $4,%r8 - andq $15,%rdx + andq $0xf,%rdx movq %r9,%r10 movb (%rdi,%rcx,1),%al shrq $4,%r9 @@ -45,13 +45,13 @@ L$oop1: js L$break1 shrq $4,%r8 - andq $15,%rdx + andq $0xf,%rdx movq %r9,%r10 shrq $4,%r9 xorq 8(%rsi,%rax,1),%r8 shlq $60,%r10 xorq (%rsi,%rax,1),%r9 - andb $240,%bl + andb $0xf0,%bl xorq (%r11,%rdx,8),%r9 movq %r8,%rdx xorq %r10,%r8 @@ -60,19 +60,19 @@ L$oop1: .p2align 4 L$break1: shrq $4,%r8 - andq $15,%rdx + andq $0xf,%rdx movq %r9,%r10 shrq $4,%r9 xorq 8(%rsi,%rax,1),%r8 shlq $60,%r10 xorq (%rsi,%rax,1),%r9 - andb $240,%bl + andb $0xf0,%bl xorq (%r11,%rdx,8),%r9 movq %r8,%rdx xorq %r10,%r8 shrq $4,%r8 - andq $15,%rdx + andq $0xf,%rdx movq %r9,%r10 shrq $4,%r9 xorq 8(%rsi,%rbx,1),%r8 @@ -880,20 +880,20 @@ L$_ghash_clmul: movdqu 32(%rsi),%xmm7 .byte 102,65,15,56,0,194 - subq $16,%rcx + subq $0x10,%rcx jz L$odd_tail movdqu 16(%rsi),%xmm6 movl _OPENSSL_ia32cap_P+4(%rip),%eax - cmpq $48,%rcx + cmpq $0x30,%rcx jb L$skip4x andl $71303168,%eax cmpl $4194304,%eax je L$skip4x - subq $48,%rcx - movq $11547335547999543296,%rax + subq $0x30,%rcx + movq $0xA040608020C0E000,%rax movdqu 48(%rsi),%xmm14 movdqu 64(%rsi),%xmm15 @@ -940,7 +940,7 @@ L$_ghash_clmul: xorps %xmm13,%xmm5 leaq 64(%rdx),%rdx - subq $64,%rcx + subq $0x40,%rcx jc L$tail4x jmp L$mod4_loop @@ -1023,7 +1023,7 @@ L$mod4_loop: xorps %xmm13,%xmm5 leaq 64(%rdx),%rdx - subq $64,%rcx + subq $0x40,%rcx jnc L$mod4_loop L$tail4x: @@ -1067,10 +1067,10 @@ L$tail4x: pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 - addq $64,%rcx + addq $0x40,%rcx jz L$done movdqu 32(%rsi),%xmm7 - subq $16,%rcx + subq $0x10,%rcx jz L$odd_tail L$skip4x: @@ -1093,7 +1093,7 @@ L$skip4x: leaq 32(%rdx),%rdx nop - subq $32,%rcx + subq $0x20,%rcx jbe L$even_tail nop jmp L$mod_loop @@ -1156,7 +1156,7 @@ L$mod_loop: .byte 102,15,58,68,231,0 pxor %xmm1,%xmm0 - subq $32,%rcx + subq $0x20,%rcx ja L$mod_loop L$even_tail: diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/sha/sha1-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/sha/sha1-x86_64.S index 044dc5b7cd..0509d45163 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/sha/sha1-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/sha/sha1-x86_64.S @@ -12,6 +12,11 @@ _sha1_block_data_order: movl _OPENSSL_ia32cap_P+8(%rip),%r10d 
testl $512,%r8d jz L$ialu + andl $268435456,%r8d + andl $1073741824,%r9d + orl %r9d,%r8d + cmpl $1342177280,%r8d + je _avx_shortcut jmp _ssse3_shortcut .p2align 4 @@ -2407,6 +2412,1122 @@ L$done_ssse3: L$epilogue_ssse3: .byte 0xf3,0xc3 + +.p2align 4 +sha1_block_data_order_avx: +_avx_shortcut: + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + leaq -64(%rsp),%rsp + vzeroupper + movq %rax,%r14 + andq $-64,%rsp + movq %rdi,%r8 + movq %rsi,%r9 + movq %rdx,%r10 + + shlq $6,%r10 + addq %r9,%r10 + leaq K_XX_XX+64(%rip),%r11 + + movl 0(%r8),%eax + movl 4(%r8),%ebx + movl 8(%r8),%ecx + movl 12(%r8),%edx + movl %ebx,%esi + movl 16(%r8),%ebp + movl %ecx,%edi + xorl %edx,%edi + andl %edi,%esi + + vmovdqa 64(%r11),%xmm6 + vmovdqa -64(%r11),%xmm11 + vmovdqu 0(%r9),%xmm0 + vmovdqu 16(%r9),%xmm1 + vmovdqu 32(%r9),%xmm2 + vmovdqu 48(%r9),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + addq $64,%r9 + vpshufb %xmm6,%xmm1,%xmm1 + vpshufb %xmm6,%xmm2,%xmm2 + vpshufb %xmm6,%xmm3,%xmm3 + vpaddd %xmm11,%xmm0,%xmm4 + vpaddd %xmm11,%xmm1,%xmm5 + vpaddd %xmm11,%xmm2,%xmm6 + vmovdqa %xmm4,0(%rsp) + vmovdqa %xmm5,16(%rsp) + vmovdqa %xmm6,32(%rsp) + jmp L$oop_avx +.p2align 4 +L$oop_avx: + shrdl $2,%ebx,%ebx + xorl %edx,%esi + vpalignr $8,%xmm0,%xmm1,%xmm4 + movl %eax,%edi + addl 0(%rsp),%ebp + vpaddd %xmm3,%xmm11,%xmm9 + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrldq $4,%xmm3,%xmm8 + addl %esi,%ebp + andl %ebx,%edi + vpxor %xmm0,%xmm4,%xmm4 + xorl %ecx,%ebx + addl %eax,%ebp + vpxor %xmm2,%xmm8,%xmm8 + shrdl $7,%eax,%eax + xorl %ecx,%edi + movl %ebp,%esi + addl 4(%rsp),%edx + vpxor %xmm8,%xmm4,%xmm4 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vmovdqa %xmm9,48(%rsp) + addl %edi,%edx + andl %eax,%esi + vpsrld $31,%xmm4,%xmm8 + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%esi + vpslldq $12,%xmm4,%xmm10 + vpaddd %xmm4,%xmm4,%xmm4 + movl %edx,%edi + addl 8(%rsp),%ecx + xorl %eax,%ebp + shldl $5,%edx,%edx + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm4,%xmm4 + addl %esi,%ecx + andl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm4,%xmm4 + shrdl $7,%edx,%edx + xorl %eax,%edi + movl %ecx,%esi + addl 12(%rsp),%ebx + vpxor %xmm10,%xmm4,%xmm4 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + addl %edi,%ebx + andl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %ebp,%esi + vpalignr $8,%xmm1,%xmm2,%xmm5 + movl %ebx,%edi + addl 16(%rsp),%eax + vpaddd %xmm4,%xmm11,%xmm9 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrldq $4,%xmm4,%xmm8 + addl %esi,%eax + andl %ecx,%edi + vpxor %xmm1,%xmm5,%xmm5 + xorl %edx,%ecx + addl %ebx,%eax + vpxor %xmm3,%xmm8,%xmm8 + shrdl $7,%ebx,%ebx + xorl %edx,%edi + movl %eax,%esi + addl 20(%rsp),%ebp + vpxor %xmm8,%xmm5,%xmm5 + xorl %ecx,%ebx + shldl $5,%eax,%eax + vmovdqa %xmm9,0(%rsp) + addl %edi,%ebp + andl %ebx,%esi + vpsrld $31,%xmm5,%xmm8 + xorl %ecx,%ebx + addl %eax,%ebp + shrdl $7,%eax,%eax + xorl %ecx,%esi + vpslldq $12,%xmm5,%xmm10 + vpaddd %xmm5,%xmm5,%xmm5 + movl %ebp,%edi + addl 24(%rsp),%edx + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm5,%xmm5 + addl %esi,%edx + andl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm5,%xmm5 + shrdl $7,%ebp,%ebp + xorl %ebx,%edi + movl %edx,%esi + addl 28(%rsp),%ecx + vpxor %xmm10,%xmm5,%xmm5 + xorl %eax,%ebp + shldl $5,%edx,%edx + vmovdqa -32(%r11),%xmm11 + addl %edi,%ecx + andl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + vpalignr $8,%xmm2,%xmm3,%xmm6 + movl 
%ecx,%edi + addl 32(%rsp),%ebx + vpaddd %xmm5,%xmm11,%xmm9 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + vpsrldq $4,%xmm5,%xmm8 + addl %esi,%ebx + andl %edx,%edi + vpxor %xmm2,%xmm6,%xmm6 + xorl %ebp,%edx + addl %ecx,%ebx + vpxor %xmm4,%xmm8,%xmm8 + shrdl $7,%ecx,%ecx + xorl %ebp,%edi + movl %ebx,%esi + addl 36(%rsp),%eax + vpxor %xmm8,%xmm6,%xmm6 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vmovdqa %xmm9,16(%rsp) + addl %edi,%eax + andl %ecx,%esi + vpsrld $31,%xmm6,%xmm8 + xorl %edx,%ecx + addl %ebx,%eax + shrdl $7,%ebx,%ebx + xorl %edx,%esi + vpslldq $12,%xmm6,%xmm10 + vpaddd %xmm6,%xmm6,%xmm6 + movl %eax,%edi + addl 40(%rsp),%ebp + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm6,%xmm6 + addl %esi,%ebp + andl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm6,%xmm6 + shrdl $7,%eax,%eax + xorl %ecx,%edi + movl %ebp,%esi + addl 44(%rsp),%edx + vpxor %xmm10,%xmm6,%xmm6 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + addl %edi,%edx + andl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%esi + vpalignr $8,%xmm3,%xmm4,%xmm7 + movl %edx,%edi + addl 48(%rsp),%ecx + vpaddd %xmm6,%xmm11,%xmm9 + xorl %eax,%ebp + shldl $5,%edx,%edx + vpsrldq $4,%xmm6,%xmm8 + addl %esi,%ecx + andl %ebp,%edi + vpxor %xmm3,%xmm7,%xmm7 + xorl %eax,%ebp + addl %edx,%ecx + vpxor %xmm5,%xmm8,%xmm8 + shrdl $7,%edx,%edx + xorl %eax,%edi + movl %ecx,%esi + addl 52(%rsp),%ebx + vpxor %xmm8,%xmm7,%xmm7 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + vmovdqa %xmm9,32(%rsp) + addl %edi,%ebx + andl %edx,%esi + vpsrld $31,%xmm7,%xmm8 + xorl %ebp,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %ebp,%esi + vpslldq $12,%xmm7,%xmm10 + vpaddd %xmm7,%xmm7,%xmm7 + movl %ebx,%edi + addl 56(%rsp),%eax + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm7,%xmm7 + addl %esi,%eax + andl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm7,%xmm7 + shrdl $7,%ebx,%ebx + xorl %edx,%edi + movl %eax,%esi + addl 60(%rsp),%ebp + vpxor %xmm10,%xmm7,%xmm7 + xorl %ecx,%ebx + shldl $5,%eax,%eax + addl %edi,%ebp + andl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm6,%xmm7,%xmm8 + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + xorl %ecx,%esi + movl %ebp,%edi + addl 0(%rsp),%edx + vpxor %xmm1,%xmm0,%xmm0 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vpaddd %xmm7,%xmm11,%xmm9 + addl %esi,%edx + andl %eax,%edi + vpxor %xmm8,%xmm0,%xmm0 + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%edi + vpsrld $30,%xmm0,%xmm8 + vmovdqa %xmm9,48(%rsp) + movl %edx,%esi + addl 4(%rsp),%ecx + xorl %eax,%ebp + shldl $5,%edx,%edx + vpslld $2,%xmm0,%xmm0 + addl %edi,%ecx + andl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + movl %ecx,%edi + addl 8(%rsp),%ebx + vpor %xmm8,%xmm0,%xmm0 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + addl %esi,%ebx + andl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 12(%rsp),%eax + xorl %ebp,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm7,%xmm0,%xmm8 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + vpxor %xmm2,%xmm1,%xmm1 + addl %esi,%ebp + xorl %ecx,%edi + vpaddd %xmm0,%xmm11,%xmm9 + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpxor %xmm8,%xmm1,%xmm1 + addl 20(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + vpsrld $30,%xmm1,%xmm8 + vmovdqa %xmm9,0(%rsp) + addl %edi,%edx + xorl %ebx,%esi + shrdl 
$7,%eax,%eax + addl %ebp,%edx + vpslld $2,%xmm1,%xmm1 + addl 24(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpor %xmm8,%xmm1,%xmm1 + addl 28(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpalignr $8,%xmm0,%xmm1,%xmm8 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + vpxor %xmm3,%xmm2,%xmm2 + addl %esi,%eax + xorl %edx,%edi + vpaddd %xmm1,%xmm11,%xmm9 + vmovdqa 0(%r11),%xmm11 + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpxor %xmm8,%xmm2,%xmm2 + addl 36(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm8 + vmovdqa %xmm9,16(%rsp) + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpslld $2,%xmm2,%xmm2 + addl 40(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpor %xmm8,%xmm2,%xmm2 + addl 44(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpalignr $8,%xmm1,%xmm2,%xmm8 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + vpxor %xmm4,%xmm3,%xmm3 + addl %esi,%ebx + xorl %ebp,%edi + vpaddd %xmm2,%xmm11,%xmm9 + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpxor %xmm8,%xmm3,%xmm3 + addl 52(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm8 + vmovdqa %xmm9,32(%rsp) + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpslld $2,%xmm3,%xmm3 + addl 56(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpor %xmm8,%xmm3,%xmm3 + addl 60(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpalignr $8,%xmm2,%xmm3,%xmm8 + vpxor %xmm0,%xmm4,%xmm4 + addl 0(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + vpxor %xmm5,%xmm4,%xmm4 + addl %esi,%ecx + xorl %eax,%edi + vpaddd %xmm3,%xmm11,%xmm9 + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpxor %xmm8,%xmm4,%xmm4 + addl 4(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + vpsrld $30,%xmm4,%xmm8 + vmovdqa %xmm9,48(%rsp) + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpslld $2,%xmm4,%xmm4 + addl 8(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpor %xmm8,%xmm4,%xmm4 + addl 12(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm3,%xmm4,%xmm8 + vpxor %xmm1,%xmm5,%xmm5 + addl 16(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + vpxor %xmm6,%xmm5,%xmm5 + addl %esi,%edx + xorl %ebx,%edi + vpaddd %xmm4,%xmm11,%xmm9 + shrdl $7,%eax,%eax + addl %ebp,%edx + vpxor %xmm8,%xmm5,%xmm5 + addl 20(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + vpsrld $30,%xmm5,%xmm8 + vmovdqa %xmm9,0(%rsp) + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpslld $2,%xmm5,%xmm5 + addl 24(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpor 
%xmm8,%xmm5,%xmm5 + addl 28(%rsp),%eax + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm4,%xmm5,%xmm8 + vpxor %xmm2,%xmm6,%xmm6 + addl 32(%rsp),%ebp + andl %ecx,%esi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vpxor %xmm7,%xmm6,%xmm6 + movl %eax,%edi + xorl %ecx,%esi + vpaddd %xmm5,%xmm11,%xmm9 + shldl $5,%eax,%eax + addl %esi,%ebp + vpxor %xmm8,%xmm6,%xmm6 + xorl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + addl 36(%rsp),%edx + vpsrld $30,%xmm6,%xmm8 + vmovdqa %xmm9,16(%rsp) + andl %ebx,%edi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%esi + vpslld $2,%xmm6,%xmm6 + xorl %ebx,%edi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + addl 40(%rsp),%ecx + andl %eax,%esi + vpor %xmm8,%xmm6,%xmm6 + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%edi + xorl %eax,%esi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + addl 44(%rsp),%ebx + andl %ebp,%edi + xorl %eax,%ebp + shrdl $7,%edx,%edx + movl %ecx,%esi + xorl %ebp,%edi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + vpalignr $8,%xmm5,%xmm6,%xmm8 + vpxor %xmm3,%xmm7,%xmm7 + addl 48(%rsp),%eax + andl %edx,%esi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + vpxor %xmm0,%xmm7,%xmm7 + movl %ebx,%edi + xorl %edx,%esi + vpaddd %xmm6,%xmm11,%xmm9 + vmovdqa 32(%r11),%xmm11 + shldl $5,%ebx,%ebx + addl %esi,%eax + vpxor %xmm8,%xmm7,%xmm7 + xorl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + addl 52(%rsp),%ebp + vpsrld $30,%xmm7,%xmm8 + vmovdqa %xmm9,32(%rsp) + andl %ecx,%edi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + vpslld $2,%xmm7,%xmm7 + xorl %ecx,%edi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + addl 56(%rsp),%edx + andl %ebx,%esi + vpor %xmm8,%xmm7,%xmm7 + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%edi + xorl %ebx,%esi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + addl 60(%rsp),%ecx + andl %eax,%edi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%esi + xorl %eax,%edi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + vpalignr $8,%xmm6,%xmm7,%xmm8 + vpxor %xmm4,%xmm0,%xmm0 + addl 0(%rsp),%ebx + andl %ebp,%esi + xorl %eax,%ebp + shrdl $7,%edx,%edx + vpxor %xmm1,%xmm0,%xmm0 + movl %ecx,%edi + xorl %ebp,%esi + vpaddd %xmm7,%xmm11,%xmm9 + shldl $5,%ecx,%ecx + addl %esi,%ebx + vpxor %xmm8,%xmm0,%xmm0 + xorl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 4(%rsp),%eax + vpsrld $30,%xmm0,%xmm8 + vmovdqa %xmm9,48(%rsp) + andl %edx,%edi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + vpslld $2,%xmm0,%xmm0 + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + addl 8(%rsp),%ebp + andl %ecx,%esi + vpor %xmm8,%xmm0,%xmm0 + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%edi + xorl %ecx,%esi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + addl 12(%rsp),%edx + andl %ebx,%edi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%esi + xorl %ebx,%edi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + vpalignr $8,%xmm7,%xmm0,%xmm8 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%rsp),%ecx + andl %eax,%esi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + vpxor %xmm2,%xmm1,%xmm1 + movl %edx,%edi + xorl %eax,%esi + vpaddd %xmm0,%xmm11,%xmm9 + shldl $5,%edx,%edx + addl 
%esi,%ecx + vpxor %xmm8,%xmm1,%xmm1 + xorl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + addl 20(%rsp),%ebx + vpsrld $30,%xmm1,%xmm8 + vmovdqa %xmm9,0(%rsp) + andl %ebp,%edi + xorl %eax,%ebp + shrdl $7,%edx,%edx + movl %ecx,%esi + vpslld $2,%xmm1,%xmm1 + xorl %ebp,%edi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + addl 24(%rsp),%eax + andl %edx,%esi + vpor %xmm8,%xmm1,%xmm1 + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%edi + xorl %edx,%esi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + addl 28(%rsp),%ebp + andl %ecx,%edi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + xorl %ecx,%edi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm0,%xmm1,%xmm8 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%rsp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vpxor %xmm3,%xmm2,%xmm2 + movl %ebp,%edi + xorl %ebx,%esi + vpaddd %xmm1,%xmm11,%xmm9 + shldl $5,%ebp,%ebp + addl %esi,%edx + vpxor %xmm8,%xmm2,%xmm2 + xorl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + addl 36(%rsp),%ecx + vpsrld $30,%xmm2,%xmm8 + vmovdqa %xmm9,16(%rsp) + andl %eax,%edi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%esi + vpslld $2,%xmm2,%xmm2 + xorl %eax,%edi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + addl 40(%rsp),%ebx + andl %ebp,%esi + vpor %xmm8,%xmm2,%xmm2 + xorl %eax,%ebp + shrdl $7,%edx,%edx + movl %ecx,%edi + xorl %ebp,%esi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 44(%rsp),%eax + andl %edx,%edi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + addl %ebx,%eax + vpalignr $8,%xmm1,%xmm2,%xmm8 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + vpxor %xmm4,%xmm3,%xmm3 + addl %esi,%ebp + xorl %ecx,%edi + vpaddd %xmm2,%xmm11,%xmm9 + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpxor %xmm8,%xmm3,%xmm3 + addl 52(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + vpsrld $30,%xmm3,%xmm8 + vmovdqa %xmm9,32(%rsp) + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpslld $2,%xmm3,%xmm3 + addl 56(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpor %xmm8,%xmm3,%xmm3 + addl 60(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 0(%rsp),%eax + vpaddd %xmm3,%xmm11,%xmm9 + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + vmovdqa %xmm9,48(%rsp) + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 4(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 8(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 12(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + cmpq %r10,%r9 + je L$done_avx + vmovdqa 64(%r11),%xmm6 + vmovdqa -64(%r11),%xmm11 + vmovdqu 0(%r9),%xmm0 + vmovdqu 16(%r9),%xmm1 + vmovdqu 32(%r9),%xmm2 + vmovdqu 48(%r9),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + addq $64,%r9 + addl 16(%rsp),%ebx + xorl %ebp,%esi + vpshufb 
%xmm6,%xmm1,%xmm1 + movl %ecx,%edi + shldl $5,%ecx,%ecx + vpaddd %xmm11,%xmm0,%xmm4 + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vmovdqa %xmm4,0(%rsp) + addl 20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi + vpshufb %xmm6,%xmm2,%xmm2 + movl %edx,%edi + shldl $5,%edx,%edx + vpaddd %xmm11,%xmm1,%xmm5 + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vmovdqa %xmm5,16(%rsp) + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + vpshufb %xmm6,%xmm3,%xmm3 + movl %ebp,%edi + shldl $5,%ebp,%ebp + vpaddd %xmm11,%xmm2,%xmm6 + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + vmovdqa %xmm6,32(%rsp) + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 0(%r8),%eax + addl 4(%r8),%esi + addl 8(%r8),%ecx + addl 12(%r8),%edx + movl %eax,0(%r8) + addl 16(%r8),%ebp + movl %esi,4(%r8) + movl %esi,%ebx + movl %ecx,8(%r8) + movl %ecx,%edi + movl %edx,12(%r8) + xorl %edx,%edi + movl %ebp,16(%r8) + andl %edi,%esi + jmp L$oop_avx + +.p2align 4 +L$done_avx: + addl 16(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + 
shrdl $7,%eax,%eax + addl %ebp,%edx + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vzeroupper + + addl 0(%r8),%eax + addl 4(%r8),%esi + addl 8(%r8),%ecx + movl %eax,0(%r8) + addl 12(%r8),%edx + movl %esi,4(%r8) + addl 16(%r8),%ebp + movl %ecx,8(%r8) + movl %edx,12(%r8) + movl %ebp,16(%r8) + leaq (%r14),%rsi + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp +L$epilogue_avx: + .byte 0xf3,0xc3 + .p2align 6 K_XX_XX: .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/sha/sha256-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/sha/sha256-x86_64.S index da02d4c2dc..0146ff5cfc 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/sha/sha256-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/sha/sha256-x86_64.S @@ -11,6 +11,11 @@ _sha256_block_data_order: movl 0(%r11),%r9d movl 4(%r11),%r10d movl 8(%r11),%r11d + andl $1073741824,%r9d + andl $268435968,%r10d + orl %r9d,%r10d + cmpl $1342177792,%r10d + je L$avx_shortcut testl $512,%r10d jnz L$ssse3_shortcut pushq %rbx @@ -2840,4 +2845,1061 @@ L$ssse3_00_47: L$epilogue_ssse3: .byte 0xf3,0xc3 + +.p2align 6 +sha256_block_data_order_avx: +L$avx_shortcut: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rsp,%r11 + shlq $4,%rdx + subq $96,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %r11,64+24(%rsp) +L$prologue_avx: + + vzeroupper + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + vmovdqa K256+512+32(%rip),%xmm8 + vmovdqa K256+512+64(%rip),%xmm9 + jmp L$loop_avx +.p2align 4 +L$loop_avx: + vmovdqa K256+512(%rip),%xmm7 + vmovdqu 0(%rsi),%xmm0 + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm7,%xmm0,%xmm0 + leaq K256(%rip),%rbp + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd 0(%rbp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 32(%rbp),%xmm1,%xmm5 + vpaddd 64(%rbp),%xmm2,%xmm6 + vpaddd 96(%rbp),%xmm3,%xmm7 + vmovdqa %xmm4,0(%rsp) + movl %eax,%r14d + vmovdqa %xmm5,16(%rsp) + movl %ebx,%edi + vmovdqa %xmm6,32(%rsp) + xorl %ecx,%edi + vmovdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp L$avx_00_47 + +.p2align 4 +L$avx_00_47: + subq $-128,%rbp + vpalignr $4,%xmm0,%xmm1,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm2,%xmm3,%xmm7 + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm0,%xmm0 + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + vpshufd $250,%xmm3,%xmm7 + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + vpsrld 
$11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm0,%xmm0 + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpaddd %xmm6,%xmm0,%xmm0 + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + vpshufd $80,%xmm0,%xmm7 + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + vpaddd %xmm6,%xmm0,%xmm0 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpaddd 0(%rbp),%xmm0,%xmm6 + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,0(%rsp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm3,%xmm0,%xmm7 + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm1,%xmm1 + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + vpshufd $250,%xmm0,%xmm7 + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm1,%xmm1 + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %ecx,%r10d + addl 
%r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpaddd %xmm6,%xmm1,%xmm1 + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + vpshufd $80,%xmm1,%xmm7 + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + vpxor %xmm7,%xmm6,%xmm6 + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + vpaddd %xmm6,%xmm1,%xmm1 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpaddd 32(%rbp),%xmm1,%xmm6 + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,16(%rsp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm0,%xmm1,%xmm7 + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm2,%xmm2 + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + vpshufd $250,%xmm1,%xmm7 + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm2,%xmm2 + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpaddd %xmm6,%xmm2,%xmm2 + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + vpshufd $80,%xmm2,%xmm7 + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + 
vpsrlq $2,%xmm7,%xmm7 + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + vpaddd %xmm6,%xmm2,%xmm2 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpaddd 64(%rbp),%xmm2,%xmm6 + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,32(%rsp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm1,%xmm2,%xmm7 + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm3,%xmm3 + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + vpshufd $250,%xmm2,%xmm7 + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm3,%xmm3 + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpaddd %xmm6,%xmm3,%xmm3 + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + vpshufd $80,%xmm3,%xmm7 + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + vpxor %xmm7,%xmm6,%xmm6 + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + vpaddd %xmm6,%xmm3,%xmm3 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpaddd 96(%rbp),%xmm3,%xmm6 + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,48(%rsp) + cmpb 
$0,131(%rbp) + jne L$avx_00_47 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl 
%ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl 
$14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%rdi + movl %r14d,%eax + + addl 0(%rdi),%eax + leaq 64(%rsi),%rsi + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb L$loop_avx + + movq 64+24(%rsp),%rsi + vzeroupper + movq (%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp +L$epilogue_avx: + .byte 0xf3,0xc3 + #endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/sha/sha512-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/sha/sha512-x86_64.S index 2f5d912425..aeabd3f43a 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/sha/sha512-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/sha/sha512-x86_64.S @@ -7,6 +7,17 @@ .p2align 4 _sha512_block_data_order: + leaq _OPENSSL_ia32cap_P(%rip),%r11 + movl 0(%r11),%r9d + movl 4(%r11),%r10d + movl 8(%r11),%r11d + testl $2048,%r10d + jnz L$xop_shortcut + andl $1073741824,%r9d + andl $268435968,%r10d + orl %r9d,%r10d + cmpl $1342177792,%r10d + je L$avx_shortcut pushq %rbx pushq %rbp pushq %r12 @@ -1783,4 +1794,2234 @@ K512: .quad 0x0001020304050607,0x08090a0b0c0d0e0f .quad 0x0001020304050607,0x08090a0b0c0d0e0f .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 + +.p2align 6 +sha512_block_data_order_xop: +L$xop_shortcut: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rsp,%r11 + shlq $4,%rdx + subq $160,%rsp + leaq (%rsi,%rdx,8),%rdx + andq $-64,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %r11,128+24(%rsp) +L$prologue_xop: + + vzeroupper + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp L$loop_xop +.p2align 4 +L$loop_xop: + vmovdqa K512+1280(%rip),%xmm11 + vmovdqu 0(%rsi),%xmm0 + leaq K512+128(%rip),%rbp + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vpshufb %xmm11,%xmm0,%xmm0 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm11,%xmm1,%xmm1 + vmovdqu 64(%rsi),%xmm4 + vpshufb %xmm11,%xmm2,%xmm2 + vmovdqu 
80(%rsi),%xmm5 + vpshufb %xmm11,%xmm3,%xmm3 + vmovdqu 96(%rsi),%xmm6 + vpshufb %xmm11,%xmm4,%xmm4 + vmovdqu 112(%rsi),%xmm7 + vpshufb %xmm11,%xmm5,%xmm5 + vpaddq -128(%rbp),%xmm0,%xmm8 + vpshufb %xmm11,%xmm6,%xmm6 + vpaddq -96(%rbp),%xmm1,%xmm9 + vpshufb %xmm11,%xmm7,%xmm7 + vpaddq -64(%rbp),%xmm2,%xmm10 + vpaddq -32(%rbp),%xmm3,%xmm11 + vmovdqa %xmm8,0(%rsp) + vpaddq 0(%rbp),%xmm4,%xmm8 + vmovdqa %xmm9,16(%rsp) + vpaddq 32(%rbp),%xmm5,%xmm9 + vmovdqa %xmm10,32(%rsp) + vpaddq 64(%rbp),%xmm6,%xmm10 + vmovdqa %xmm11,48(%rsp) + vpaddq 96(%rbp),%xmm7,%xmm11 + vmovdqa %xmm8,64(%rsp) + movq %rax,%r14 + vmovdqa %xmm9,80(%rsp) + movq %rbx,%rdi + vmovdqa %xmm10,96(%rsp) + xorq %rcx,%rdi + vmovdqa %xmm11,112(%rsp) + movq %r8,%r13 + jmp L$xop_00_47 + +.p2align 4 +L$xop_00_47: + addq $256,%rbp + vpalignr $8,%xmm0,%xmm1,%xmm8 + rorq $23,%r13 + movq %r14,%rax + vpalignr $8,%xmm4,%xmm5,%xmm11 + movq %r9,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r8,%r13 + xorq %r10,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rax,%r14 + vpaddq %xmm11,%xmm0,%xmm0 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 +.byte 143,72,120,195,209,7 + xorq %r10,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,223,3 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm7,%xmm10 + addq %r11,%rdx + addq %rdi,%r11 + vpaddq %xmm8,%xmm0,%xmm0 + movq %rdx,%r13 + addq %r11,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r11 + vpxor %xmm10,%xmm11,%xmm11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + vpaddq %xmm11,%xmm0,%xmm0 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + vpaddq -128(%rbp),%xmm0,%xmm10 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,0(%rsp) + vpalignr $8,%xmm1,%xmm2,%xmm8 + rorq $23,%r13 + movq %r14,%r10 + vpalignr $8,%xmm5,%xmm6,%xmm11 + movq %rdx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rcx,%r13 + xorq %r8,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r10,%r14 + vpaddq %xmm11,%xmm1,%xmm1 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + movq %r10,%r15 +.byte 143,72,120,195,209,7 + xorq %r8,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,216,3 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm0,%xmm10 + addq %r9,%rbx + addq %rdi,%r9 + vpaddq %xmm8,%xmm1,%xmm1 + movq %rbx,%r13 + addq %r9,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r9 + vpxor %xmm10,%xmm11,%xmm11 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + vpaddq %xmm11,%xmm1,%xmm1 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + vpaddq -96(%rbp),%xmm1,%xmm10 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,16(%rsp) + vpalignr $8,%xmm2,%xmm3,%xmm8 + rorq $23,%r13 + movq %r14,%r8 + vpalignr 
$8,%xmm6,%xmm7,%xmm11 + movq %rbx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rax,%r13 + xorq %rcx,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r8,%r14 + vpaddq %xmm11,%xmm2,%xmm2 + andq %rax,%r12 + xorq %rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 +.byte 143,72,120,195,209,7 + xorq %rcx,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,217,3 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm1,%xmm10 + addq %rdx,%r11 + addq %rdi,%rdx + vpaddq %xmm8,%xmm2,%xmm2 + movq %r11,%r13 + addq %rdx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rdx + vpxor %xmm10,%xmm11,%xmm11 + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + vpaddq %xmm11,%xmm2,%xmm2 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + vpaddq -64(%rbp),%xmm2,%xmm10 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,32(%rsp) + vpalignr $8,%xmm3,%xmm4,%xmm8 + rorq $23,%r13 + movq %r14,%rcx + vpalignr $8,%xmm7,%xmm0,%xmm11 + movq %r11,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r10,%r13 + xorq %rax,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rcx,%r14 + vpaddq %xmm11,%xmm3,%xmm3 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 +.byte 143,72,120,195,209,7 + xorq %rax,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rdx,%r15 + addq %r12,%rbx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,218,3 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm2,%xmm10 + addq %rbx,%r9 + addq %rdi,%rbx + vpaddq %xmm8,%xmm3,%xmm3 + movq %r9,%r13 + addq %rbx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rbx + vpxor %xmm10,%xmm11,%xmm11 + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + vpaddq %xmm11,%xmm3,%xmm3 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + vpaddq -32(%rbp),%xmm3,%xmm10 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,48(%rsp) + vpalignr $8,%xmm4,%xmm5,%xmm8 + rorq $23,%r13 + movq %r14,%rax + vpalignr $8,%xmm0,%xmm1,%xmm11 + movq %r9,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r8,%r13 + xorq %r10,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rax,%r14 + vpaddq %xmm11,%xmm4,%xmm4 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 +.byte 143,72,120,195,209,7 + xorq %r10,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,219,3 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm3,%xmm10 + addq %r11,%rdx + addq %rdi,%r11 + vpaddq %xmm8,%xmm4,%xmm4 + movq %rdx,%r13 + addq %r11,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r11 + vpxor %xmm10,%xmm11,%xmm11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq 
$4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + vpaddq %xmm11,%xmm4,%xmm4 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + vpaddq 0(%rbp),%xmm4,%xmm10 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,64(%rsp) + vpalignr $8,%xmm5,%xmm6,%xmm8 + rorq $23,%r13 + movq %r14,%r10 + vpalignr $8,%xmm1,%xmm2,%xmm11 + movq %rdx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rcx,%r13 + xorq %r8,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r10,%r14 + vpaddq %xmm11,%xmm5,%xmm5 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 +.byte 143,72,120,195,209,7 + xorq %r8,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,220,3 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm4,%xmm10 + addq %r9,%rbx + addq %rdi,%r9 + vpaddq %xmm8,%xmm5,%xmm5 + movq %rbx,%r13 + addq %r9,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r9 + vpxor %xmm10,%xmm11,%xmm11 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + vpaddq %xmm11,%xmm5,%xmm5 + addq 88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + vpaddq 32(%rbp),%xmm5,%xmm10 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,80(%rsp) + vpalignr $8,%xmm6,%xmm7,%xmm8 + rorq $23,%r13 + movq %r14,%r8 + vpalignr $8,%xmm2,%xmm3,%xmm11 + movq %rbx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rax,%r13 + xorq %rcx,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r8,%r14 + vpaddq %xmm11,%xmm6,%xmm6 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq %r8,%r15 +.byte 143,72,120,195,209,7 + xorq %rcx,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,221,3 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm5,%xmm10 + addq %rdx,%r11 + addq %rdi,%rdx + vpaddq %xmm8,%xmm6,%xmm6 + movq %r11,%r13 + addq %rdx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rdx + vpxor %xmm10,%xmm11,%xmm11 + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + vpaddq %xmm11,%xmm6,%xmm6 + addq 104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + vpaddq 64(%rbp),%xmm6,%xmm10 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,96(%rsp) + vpalignr $8,%xmm7,%xmm0,%xmm8 + rorq $23,%r13 + movq %r14,%rcx + vpalignr $8,%xmm3,%xmm4,%xmm11 + movq %r11,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r10,%r13 + xorq %rax,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rcx,%r14 + vpaddq %xmm11,%xmm7,%xmm7 + andq %r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 +.byte 143,72,120,195,209,7 + xorq %rax,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rdx,%r15 + addq 
%r12,%rbx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,222,3 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm6,%xmm10 + addq %rbx,%r9 + addq %rdi,%rbx + vpaddq %xmm8,%xmm7,%xmm7 + movq %r9,%r13 + addq %rbx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rbx + vpxor %xmm10,%xmm11,%xmm11 + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + vpaddq %xmm11,%xmm7,%xmm7 + addq 120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + vpaddq 96(%rbp),%xmm7,%xmm10 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,112(%rsp) + cmpb $0,135(%rbp) + jne L$xop_00_47 + rorq $23,%r13 + movq %r14,%rax + movq %r9,%r12 + rorq $5,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + rorq $4,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + rorq $6,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + rorq $28,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + rorq $23,%r13 + movq %r14,%r11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + rorq $23,%r13 + movq %r14,%r10 + movq %rdx,%r12 + rorq $5,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + rorq $4,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + rorq $6,%r14 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + rorq $28,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + rorq $23,%r13 + movq %r14,%r9 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + rorq $23,%r13 + movq %r14,%r8 + movq %rbx,%r12 + rorq $5,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + rorq $4,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + rorq $6,%r14 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + rorq $28,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + rorq $23,%r13 + movq %r14,%rdx + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + rorq $23,%r13 + movq %r14,%rcx + movq 
%r11,%r12 + rorq $5,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + rorq $4,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + rorq $6,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + rorq $14,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + rorq $28,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + rorq $23,%r13 + movq %r14,%rbx + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + rorq $23,%r13 + movq %r14,%rax + movq %r9,%r12 + rorq $5,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + rorq $4,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + rorq $6,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + rorq $28,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + rorq $23,%r13 + movq %r14,%r11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + rorq $23,%r13 + movq %r14,%r10 + movq %rdx,%r12 + rorq $5,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + rorq $4,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + rorq $6,%r14 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + rorq $28,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + rorq $23,%r13 + movq %r14,%r9 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + rorq $23,%r13 + movq %r14,%r8 + movq %rbx,%r12 + rorq $5,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + rorq $4,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + rorq $6,%r14 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + rorq $28,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + rorq $23,%r13 + movq %r14,%rdx + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + rorq $23,%r13 + movq %r14,%rcx + movq %r11,%r12 + rorq $5,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + rorq $4,%r13 + xorq %rcx,%r14 + andq 
%r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + rorq $6,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + rorq $14,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + rorq $28,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + rorq $23,%r13 + movq %r14,%rbx + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + movq 128+0(%rsp),%rdi + movq %r14,%rax + + addq 0(%rdi),%rax + leaq 128(%rsi),%rsi + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + jb L$loop_xop + + movq 128+24(%rsp),%rsi + vzeroupper + movq (%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp +L$epilogue_xop: + .byte 0xf3,0xc3 + + +.p2align 6 +sha512_block_data_order_avx: +L$avx_shortcut: + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + movq %rsp,%r11 + shlq $4,%rdx + subq $160,%rsp + leaq (%rsi,%rdx,8),%rdx + andq $-64,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %r11,128+24(%rsp) +L$prologue_avx: + + vzeroupper + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp L$loop_avx +.p2align 4 +L$loop_avx: + vmovdqa K512+1280(%rip),%xmm11 + vmovdqu 0(%rsi),%xmm0 + leaq K512+128(%rip),%rbp + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vpshufb %xmm11,%xmm0,%xmm0 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm11,%xmm1,%xmm1 + vmovdqu 64(%rsi),%xmm4 + vpshufb %xmm11,%xmm2,%xmm2 + vmovdqu 80(%rsi),%xmm5 + vpshufb %xmm11,%xmm3,%xmm3 + vmovdqu 96(%rsi),%xmm6 + vpshufb %xmm11,%xmm4,%xmm4 + vmovdqu 112(%rsi),%xmm7 + vpshufb %xmm11,%xmm5,%xmm5 + vpaddq -128(%rbp),%xmm0,%xmm8 + vpshufb %xmm11,%xmm6,%xmm6 + vpaddq -96(%rbp),%xmm1,%xmm9 + vpshufb %xmm11,%xmm7,%xmm7 + vpaddq -64(%rbp),%xmm2,%xmm10 + vpaddq -32(%rbp),%xmm3,%xmm11 + vmovdqa %xmm8,0(%rsp) + vpaddq 0(%rbp),%xmm4,%xmm8 + vmovdqa %xmm9,16(%rsp) + vpaddq 32(%rbp),%xmm5,%xmm9 + vmovdqa %xmm10,32(%rsp) + vpaddq 64(%rbp),%xmm6,%xmm10 + vmovdqa %xmm11,48(%rsp) + vpaddq 96(%rbp),%xmm7,%xmm11 + vmovdqa %xmm8,64(%rsp) + movq %rax,%r14 + vmovdqa %xmm9,80(%rsp) + movq %rbx,%rdi + vmovdqa %xmm10,96(%rsp) + xorq %rcx,%rdi + vmovdqa %xmm11,112(%rsp) + movq %r8,%r13 + jmp L$avx_00_47 + +.p2align 4 +L$avx_00_47: + addq $256,%rbp + vpalignr $8,%xmm0,%xmm1,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rax + vpalignr $8,%xmm4,%xmm5,%xmm11 + movq %r9,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r8,%r13 + xorq %r10,%r12 + vpaddq %xmm11,%xmm0,%xmm0 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r8,%r12 + xorq %r8,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 0(%rsp),%r11 + movq %rax,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rbx,%r15 + addq %r12,%r11 + vpxor 
%xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm7,%xmm11 + addq %r11,%rdx + addq %rdi,%r11 + vpxor %xmm9,%xmm8,%xmm8 + movq %rdx,%r13 + addq %r11,%r14 + vpsllq $3,%xmm7,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r11 + vpaddq %xmm8,%xmm0,%xmm0 + movq %r8,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm7,%xmm9 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rdx,%r12 + xorq %rdx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 8(%rsp),%r10 + movq %r11,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r9,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rax,%rdi + addq %r12,%r10 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm0,%xmm0 + xorq %r11,%r14 + addq %r13,%r10 + vpaddq -128(%rbp),%xmm0,%xmm10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,0(%rsp) + vpalignr $8,%xmm1,%xmm2,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r10 + vpalignr $8,%xmm5,%xmm6,%xmm11 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rcx,%r13 + xorq %r8,%r12 + vpaddq %xmm11,%xmm1,%xmm1 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rcx,%r12 + xorq %rcx,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 16(%rsp),%r9 + movq %r10,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r11,%r15 + addq %r12,%r9 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm0,%xmm11 + addq %r9,%rbx + addq %rdi,%r9 + vpxor %xmm9,%xmm8,%xmm8 + movq %rbx,%r13 + addq %r9,%r14 + vpsllq $3,%xmm0,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r9 + vpaddq %xmm8,%xmm1,%xmm1 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm0,%xmm9 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rbx,%r12 + xorq %rbx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 24(%rsp),%r8 + movq %r9,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r10,%rdi + addq %r12,%r8 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm1,%xmm1 + xorq %r9,%r14 + addq %r13,%r8 + vpaddq -96(%rbp),%xmm1,%xmm10 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,16(%rsp) + vpalignr $8,%xmm2,%xmm3,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r8 + vpalignr $8,%xmm6,%xmm7,%xmm11 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rax,%r13 + xorq %rcx,%r12 + vpaddq %xmm11,%xmm2,%xmm2 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rax,%r12 + xorq %rax,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 32(%rsp),%rdx + movq %r8,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r9,%r15 + addq %r12,%rdx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm1,%xmm11 + addq %rdx,%r11 + addq %rdi,%rdx + vpxor %xmm9,%xmm8,%xmm8 + movq %r11,%r13 + addq %rdx,%r14 + vpsllq $3,%xmm1,%xmm10 + shrdq 
$23,%r13,%r13 + movq %r14,%rdx + vpaddq %xmm8,%xmm2,%xmm2 + movq %rax,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm1,%xmm9 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r11,%r12 + xorq %r11,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 40(%rsp),%rcx + movq %rdx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r8,%rdi + addq %r12,%rcx + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm2,%xmm2 + xorq %rdx,%r14 + addq %r13,%rcx + vpaddq -64(%rbp),%xmm2,%xmm10 + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,32(%rsp) + vpalignr $8,%xmm3,%xmm4,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rcx + vpalignr $8,%xmm7,%xmm0,%xmm11 + movq %r11,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r10,%r13 + xorq %rax,%r12 + vpaddq %xmm11,%xmm3,%xmm3 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r10,%r12 + xorq %r10,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 48(%rsp),%rbx + movq %rcx,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rdx,%r15 + addq %r12,%rbx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm2,%xmm11 + addq %rbx,%r9 + addq %rdi,%rbx + vpxor %xmm9,%xmm8,%xmm8 + movq %r9,%r13 + addq %rbx,%r14 + vpsllq $3,%xmm2,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rbx + vpaddq %xmm8,%xmm3,%xmm3 + movq %r10,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm2,%xmm9 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r9,%r12 + xorq %r9,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 56(%rsp),%rax + movq %rbx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r11,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rcx,%rdi + addq %r12,%rax + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm3,%xmm3 + xorq %rbx,%r14 + addq %r13,%rax + vpaddq -32(%rbp),%xmm3,%xmm10 + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,48(%rsp) + vpalignr $8,%xmm4,%xmm5,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rax + vpalignr $8,%xmm0,%xmm1,%xmm11 + movq %r9,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r8,%r13 + xorq %r10,%r12 + vpaddq %xmm11,%xmm4,%xmm4 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r8,%r12 + xorq %r8,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 64(%rsp),%r11 + movq %rax,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rbx,%r15 + addq %r12,%r11 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm3,%xmm11 + addq %r11,%rdx + addq %rdi,%r11 + vpxor %xmm9,%xmm8,%xmm8 + movq %rdx,%r13 + addq %r11,%r14 + vpsllq $3,%xmm3,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r11 + vpaddq %xmm8,%xmm4,%xmm4 + movq %r8,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm3,%xmm9 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rdx,%r12 + xorq %rdx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 
72(%rsp),%r10 + movq %r11,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r9,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rax,%rdi + addq %r12,%r10 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm4,%xmm4 + xorq %r11,%r14 + addq %r13,%r10 + vpaddq 0(%rbp),%xmm4,%xmm10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,64(%rsp) + vpalignr $8,%xmm5,%xmm6,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r10 + vpalignr $8,%xmm1,%xmm2,%xmm11 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rcx,%r13 + xorq %r8,%r12 + vpaddq %xmm11,%xmm5,%xmm5 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rcx,%r12 + xorq %rcx,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 80(%rsp),%r9 + movq %r10,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r11,%r15 + addq %r12,%r9 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm4,%xmm11 + addq %r9,%rbx + addq %rdi,%r9 + vpxor %xmm9,%xmm8,%xmm8 + movq %rbx,%r13 + addq %r9,%r14 + vpsllq $3,%xmm4,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r9 + vpaddq %xmm8,%xmm5,%xmm5 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm4,%xmm9 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rbx,%r12 + xorq %rbx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 88(%rsp),%r8 + movq %r9,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r10,%rdi + addq %r12,%r8 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm5,%xmm5 + xorq %r9,%r14 + addq %r13,%r8 + vpaddq 32(%rbp),%xmm5,%xmm10 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,80(%rsp) + vpalignr $8,%xmm6,%xmm7,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r8 + vpalignr $8,%xmm2,%xmm3,%xmm11 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rax,%r13 + xorq %rcx,%r12 + vpaddq %xmm11,%xmm6,%xmm6 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rax,%r12 + xorq %rax,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 96(%rsp),%rdx + movq %r8,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r9,%r15 + addq %r12,%rdx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm5,%xmm11 + addq %rdx,%r11 + addq %rdi,%rdx + vpxor %xmm9,%xmm8,%xmm8 + movq %r11,%r13 + addq %rdx,%r14 + vpsllq $3,%xmm5,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rdx + vpaddq %xmm8,%xmm6,%xmm6 + movq %rax,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm5,%xmm9 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r11,%r12 + xorq %r11,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 104(%rsp),%rcx + movq %rdx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r8,%rdi + addq %r12,%rcx + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm6,%xmm6 + xorq %rdx,%r14 + addq %r13,%rcx + vpaddq 64(%rbp),%xmm6,%xmm10 + xorq %r8,%r15 
+ shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,96(%rsp) + vpalignr $8,%xmm7,%xmm0,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rcx + vpalignr $8,%xmm3,%xmm4,%xmm11 + movq %r11,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r10,%r13 + xorq %rax,%r12 + vpaddq %xmm11,%xmm7,%xmm7 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r10,%r12 + xorq %r10,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 112(%rsp),%rbx + movq %rcx,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rdx,%r15 + addq %r12,%rbx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm6,%xmm11 + addq %rbx,%r9 + addq %rdi,%rbx + vpxor %xmm9,%xmm8,%xmm8 + movq %r9,%r13 + addq %rbx,%r14 + vpsllq $3,%xmm6,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rbx + vpaddq %xmm8,%xmm7,%xmm7 + movq %r10,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm6,%xmm9 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r9,%r12 + xorq %r9,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 120(%rsp),%rax + movq %rbx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r11,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rcx,%rdi + addq %r12,%rax + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm7,%xmm7 + xorq %rbx,%r14 + addq %r13,%rax + vpaddq 96(%rbp),%xmm7,%xmm10 + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,112(%rsp) + cmpb $0,135(%rbp) + jne L$avx_00_47 + shrdq $23,%r13,%r13 + movq %r14,%rax + movq %r9,%r12 + shrdq $5,%r14,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r11 + movq %r8,%r12 + shrdq $5,%r14,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + shrdq $6,%r14,%r14 + xorq %rax,%rdi + addq %r12,%r10 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r10 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + xorq %r11,%r15 + addq %r12,%r9 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r9 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + xorq %r10,%rdi + addq %r12,%r8 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + 
shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r8 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + xorq %r9,%r15 + addq %r12,%rdx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + shrdq $28,%r14,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rdx + movq %rax,%r12 + shrdq $5,%r14,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + xorq %r8,%rdi + addq %r12,%rcx + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rcx + movq %r11,%r12 + shrdq $5,%r14,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rbx + movq %r10,%r12 + shrdq $5,%r14,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + shrdq $6,%r14,%r14 + xorq %rcx,%rdi + addq %r12,%rax + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rax + movq %r9,%r12 + shrdq $5,%r14,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r11 + movq %r8,%r12 + shrdq $5,%r14,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + shrdq $6,%r14,%r14 + xorq %rax,%rdi + addq %r12,%r10 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r10 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + xorq %r11,%r15 + addq %r12,%r9 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r9 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 
88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + xorq %r10,%rdi + addq %r12,%r8 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r8 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + xorq %r9,%r15 + addq %r12,%rdx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + shrdq $28,%r14,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rdx + movq %rax,%r12 + shrdq $5,%r14,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + xorq %r8,%rdi + addq %r12,%rcx + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rcx + movq %r11,%r12 + shrdq $5,%r14,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rbx + movq %r10,%r12 + shrdq $5,%r14,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + shrdq $6,%r14,%r14 + xorq %rcx,%rdi + addq %r12,%rax + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + movq 128+0(%rsp),%rdi + movq %r14,%rax + + addq 0(%rdi),%rax + leaq 128(%rsi),%rsi + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + jb L$loop_avx + + movq 128+24(%rsp),%rsi + vzeroupper + movq (%rsi),%r15 + movq 8(%rsi),%r14 + movq 16(%rsi),%r13 + movq 24(%rsi),%r12 + movq 32(%rsi),%rbp + movq 40(%rsi),%rbx + leaq 48(%rsi),%rsp +L$epilogue_avx: + .byte 0xf3,0xc3 + #endif diff --git a/packager/third_party/boringssl/roll_boringssl.py b/packager/third_party/boringssl/roll_boringssl.py new file mode 100755 index 0000000000..13fe24c775 --- /dev/null +++ b/packager/third_party/boringssl/roll_boringssl.py @@ -0,0 +1,126 @@ +#!/usr/bin/env python +# Copyright 2015 The Chromium Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. 
+ +"""Rolls third_party/boringssl/src in DEPS and updates generated build files.""" + +import os +import os.path +import shutil +import subprocess +import sys + + +SCRIPT_PATH = os.path.abspath(__file__) +SRC_PATH = os.path.dirname(os.path.dirname(os.path.dirname(SCRIPT_PATH))) +DEPS_PATH = os.path.join(SRC_PATH, 'DEPS') +BORINGSSL_PATH = os.path.join(SRC_PATH, 'third_party', 'boringssl') +BORINGSSL_SRC_PATH = os.path.join(BORINGSSL_PATH, 'src') + +if not os.path.isfile(DEPS_PATH) or not os.path.isdir(BORINGSSL_SRC_PATH): + raise Exception('Could not find Chromium checkout') + +# Pull OS_ARCH_COMBOS out of the BoringSSL script. +sys.path.append(os.path.join(BORINGSSL_SRC_PATH, 'util')) +import generate_build_files + +GENERATED_FILES = [ + 'BUILD.generated.gni', + 'BUILD.generated_tests.gni', + 'boringssl.gypi', + 'boringssl_tests.gypi', + 'err_data.c', +] + + +def IsPristine(repo): + """Returns True if a git checkout is pristine.""" + cmd = ['git', 'diff', '--ignore-submodules'] + return not (subprocess.check_output(cmd, cwd=repo).strip() or + subprocess.check_output(cmd + ['--cached'], cwd=repo).strip()) + + +def RevParse(repo, rev): + """Resolves a string to a git commit.""" + return subprocess.check_output(['git', 'rev-parse', rev], cwd=repo).strip() + + +def UpdateDEPS(deps, from_hash, to_hash): + """Updates all references of |from_hash| to |to_hash| in |deps|.""" + with open(deps, 'rb') as f: + contents = f.read() + if from_hash not in contents: + raise Exception('%s not in DEPS' % from_hash) + contents = contents.replace(from_hash, to_hash) + with open(deps, 'wb') as f: + f.write(contents) + + +def main(): + if len(sys.argv) > 2: + sys.stderr.write('Usage: %s [COMMIT]' % sys.argv[0]) + return 1 + + if not IsPristine(SRC_PATH): + print >>sys.stderr, 'Chromium checkout not pristine.' + return 0 + if not IsPristine(BORINGSSL_SRC_PATH): + print >>sys.stderr, 'BoringSSL checkout not pristine.' + return 0 + + if len(sys.argv) > 1: + commit = RevParse(BORINGSSL_SRC_PATH, sys.argv[1]) + else: + subprocess.check_call(['git', 'fetch', 'origin'], cwd=BORINGSSL_SRC_PATH) + commit = RevParse(BORINGSSL_SRC_PATH, 'origin/master') + + head = RevParse(BORINGSSL_SRC_PATH, 'HEAD') + if head == commit: + print 'BoringSSL already up to date.' + return 0 + + print 'Rolling BoringSSL from %s to %s...' % (head, commit) + + UpdateDEPS(DEPS_PATH, head, commit) + + # Checkout third_party/boringssl/src to generate new files. + subprocess.check_call(['git', 'checkout', commit], cwd=BORINGSSL_SRC_PATH) + + # Clear the old generated files. + for (osname, arch, _, _, _) in generate_build_files.OS_ARCH_COMBOS: + path = os.path.join(BORINGSSL_PATH, osname + '-' + arch) + shutil.rmtree(path) + for file in GENERATED_FILES: + path = os.path.join(BORINGSSL_PATH, file) + os.unlink(path) + + # Generate new ones. + subprocess.check_call(['python', + os.path.join(BORINGSSL_SRC_PATH, 'util', + 'generate_build_files.py'), + 'gn', 'gyp'], + cwd=BORINGSSL_PATH) + + # Commit everything. 
+ subprocess.check_call(['git', 'add', DEPS_PATH], cwd=SRC_PATH) + for (osname, arch, _, _, _) in generate_build_files.OS_ARCH_COMBOS: + path = os.path.join(BORINGSSL_PATH, osname + '-' + arch) + subprocess.check_call(['git', 'add', path], cwd=SRC_PATH) + for file in GENERATED_FILES: + path = os.path.join(BORINGSSL_PATH, file) + subprocess.check_call(['git', 'add', path], cwd=SRC_PATH) + + message = """Roll src/third_party/boringssl/src %s..%s + +https://boringssl.googlesource.com/boringssl/+log/%s..%s + +BUG=none +""" % (head[:9], commit[:9], head, commit) + subprocess.check_call(['git', 'commit', '-m', message], cwd=SRC_PATH) + + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/packager/third_party/boringssl/win-x86/crypto/chacha/chacha-x86.asm b/packager/third_party/boringssl/win-x86/crypto/chacha/chacha-x86.asm new file mode 100644 index 0000000000..3ba31a2b35 --- /dev/null +++ b/packager/third_party/boringssl/win-x86/crypto/chacha/chacha-x86.asm @@ -0,0 +1,977 @@ +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +%ifdef __YASM_VERSION_ID__ +%if __YASM_VERSION_ID__ < 01010000h +%error yasm version 1.1.0 or later needed. +%endif +; Yasm automatically includes .00 and complains about redefining it. +; https://www.tortall.net/projects/yasm/manual/html/objfmt-win32-safeseh.html +%else +$@feat.00 equ 1 +%endif +section .text code align=64 +%else +section .text code +%endif +global _ChaCha20_ctr32 +align 16 +_ChaCha20_ctr32: +L$_ChaCha20_ctr32_begin: + push ebp + push ebx + push esi + push edi + xor eax,eax + cmp eax,DWORD [28+esp] + je NEAR L$000no_data + call L$pic_point +L$pic_point: + pop eax + lea ebp,[_OPENSSL_ia32cap_P] + test DWORD [ebp],16777216 + jz NEAR L$001x86 + test DWORD [4+ebp],512 + jz NEAR L$001x86 + jmp NEAR L$ssse3_shortcut +L$001x86: + mov esi,DWORD [32+esp] + mov edi,DWORD [36+esp] + sub esp,132 + mov eax,DWORD [esi] + mov ebx,DWORD [4+esi] + mov ecx,DWORD [8+esi] + mov edx,DWORD [12+esi] + mov DWORD [80+esp],eax + mov DWORD [84+esp],ebx + mov DWORD [88+esp],ecx + mov DWORD [92+esp],edx + mov eax,DWORD [16+esi] + mov ebx,DWORD [20+esi] + mov ecx,DWORD [24+esi] + mov edx,DWORD [28+esi] + mov DWORD [96+esp],eax + mov DWORD [100+esp],ebx + mov DWORD [104+esp],ecx + mov DWORD [108+esp],edx + mov eax,DWORD [edi] + mov ebx,DWORD [4+edi] + mov ecx,DWORD [8+edi] + mov edx,DWORD [12+edi] + sub eax,1 + mov DWORD [112+esp],eax + mov DWORD [116+esp],ebx + mov DWORD [120+esp],ecx + mov DWORD [124+esp],edx + jmp NEAR L$002entry +align 16 +L$003outer_loop: + mov DWORD [156+esp],ebx + mov DWORD [152+esp],eax + mov DWORD [160+esp],ecx +L$002entry: + mov eax,1634760805 + mov DWORD [4+esp],857760878 + mov DWORD [8+esp],2036477234 + mov DWORD [12+esp],1797285236 + mov ebx,DWORD [84+esp] + mov ebp,DWORD [88+esp] + mov ecx,DWORD [104+esp] + mov esi,DWORD [108+esp] + mov edx,DWORD [116+esp] + mov edi,DWORD [120+esp] + mov DWORD [20+esp],ebx + mov DWORD [24+esp],ebp + mov DWORD [40+esp],ecx + mov DWORD [44+esp],esi + mov DWORD [52+esp],edx + mov DWORD [56+esp],edi + mov ebx,DWORD [92+esp] + mov edi,DWORD [124+esp] + mov edx,DWORD [112+esp] + mov ebp,DWORD [80+esp] + mov ecx,DWORD [96+esp] + mov esi,DWORD [100+esp] + add edx,1 + mov DWORD [28+esp],ebx + mov DWORD [60+esp],edi + mov DWORD [112+esp],edx + mov ebx,10 + jmp NEAR L$004loop +align 16 +L$004loop: + add eax,ebp + mov DWORD [128+esp],ebx + mov ebx,ebp + xor edx,eax + rol edx,16 + add ecx,edx + xor ebx,ecx + mov edi,DWORD [52+esp] + rol ebx,12 + mov ebp,DWORD 
[20+esp] + add eax,ebx + xor edx,eax + mov DWORD [esp],eax + rol edx,8 + mov eax,DWORD [4+esp] + add ecx,edx + mov DWORD [48+esp],edx + xor ebx,ecx + add eax,ebp + rol ebx,7 + xor edi,eax + mov DWORD [32+esp],ecx + rol edi,16 + mov DWORD [16+esp],ebx + add esi,edi + mov ecx,DWORD [40+esp] + xor ebp,esi + mov edx,DWORD [56+esp] + rol ebp,12 + mov ebx,DWORD [24+esp] + add eax,ebp + xor edi,eax + mov DWORD [4+esp],eax + rol edi,8 + mov eax,DWORD [8+esp] + add esi,edi + mov DWORD [52+esp],edi + xor ebp,esi + add eax,ebx + rol ebp,7 + xor edx,eax + mov DWORD [36+esp],esi + rol edx,16 + mov DWORD [20+esp],ebp + add ecx,edx + mov esi,DWORD [44+esp] + xor ebx,ecx + mov edi,DWORD [60+esp] + rol ebx,12 + mov ebp,DWORD [28+esp] + add eax,ebx + xor edx,eax + mov DWORD [8+esp],eax + rol edx,8 + mov eax,DWORD [12+esp] + add ecx,edx + mov DWORD [56+esp],edx + xor ebx,ecx + add eax,ebp + rol ebx,7 + xor edi,eax + rol edi,16 + mov DWORD [24+esp],ebx + add esi,edi + xor ebp,esi + rol ebp,12 + mov ebx,DWORD [20+esp] + add eax,ebp + xor edi,eax + mov DWORD [12+esp],eax + rol edi,8 + mov eax,DWORD [esp] + add esi,edi + mov edx,edi + xor ebp,esi + add eax,ebx + rol ebp,7 + xor edx,eax + rol edx,16 + mov DWORD [28+esp],ebp + add ecx,edx + xor ebx,ecx + mov edi,DWORD [48+esp] + rol ebx,12 + mov ebp,DWORD [24+esp] + add eax,ebx + xor edx,eax + mov DWORD [esp],eax + rol edx,8 + mov eax,DWORD [4+esp] + add ecx,edx + mov DWORD [60+esp],edx + xor ebx,ecx + add eax,ebp + rol ebx,7 + xor edi,eax + mov DWORD [40+esp],ecx + rol edi,16 + mov DWORD [20+esp],ebx + add esi,edi + mov ecx,DWORD [32+esp] + xor ebp,esi + mov edx,DWORD [52+esp] + rol ebp,12 + mov ebx,DWORD [28+esp] + add eax,ebp + xor edi,eax + mov DWORD [4+esp],eax + rol edi,8 + mov eax,DWORD [8+esp] + add esi,edi + mov DWORD [48+esp],edi + xor ebp,esi + add eax,ebx + rol ebp,7 + xor edx,eax + mov DWORD [44+esp],esi + rol edx,16 + mov DWORD [24+esp],ebp + add ecx,edx + mov esi,DWORD [36+esp] + xor ebx,ecx + mov edi,DWORD [56+esp] + rol ebx,12 + mov ebp,DWORD [16+esp] + add eax,ebx + xor edx,eax + mov DWORD [8+esp],eax + rol edx,8 + mov eax,DWORD [12+esp] + add ecx,edx + mov DWORD [52+esp],edx + xor ebx,ecx + add eax,ebp + rol ebx,7 + xor edi,eax + rol edi,16 + mov DWORD [28+esp],ebx + add esi,edi + xor ebp,esi + mov edx,DWORD [48+esp] + rol ebp,12 + mov ebx,DWORD [128+esp] + add eax,ebp + xor edi,eax + mov DWORD [12+esp],eax + rol edi,8 + mov eax,DWORD [esp] + add esi,edi + mov DWORD [56+esp],edi + xor ebp,esi + rol ebp,7 + dec ebx + jnz NEAR L$004loop + mov ebx,DWORD [160+esp] + add eax,1634760805 + add ebp,DWORD [80+esp] + add ecx,DWORD [96+esp] + add esi,DWORD [100+esp] + cmp ebx,64 + jb NEAR L$005tail + mov ebx,DWORD [156+esp] + add edx,DWORD [112+esp] + add edi,DWORD [120+esp] + xor eax,DWORD [ebx] + xor ebp,DWORD [16+ebx] + mov DWORD [esp],eax + mov eax,DWORD [152+esp] + xor ecx,DWORD [32+ebx] + xor esi,DWORD [36+ebx] + xor edx,DWORD [48+ebx] + xor edi,DWORD [56+ebx] + mov DWORD [16+eax],ebp + mov DWORD [32+eax],ecx + mov DWORD [36+eax],esi + mov DWORD [48+eax],edx + mov DWORD [56+eax],edi + mov ebp,DWORD [4+esp] + mov ecx,DWORD [8+esp] + mov esi,DWORD [12+esp] + mov edx,DWORD [20+esp] + mov edi,DWORD [24+esp] + add ebp,857760878 + add ecx,2036477234 + add esi,1797285236 + add edx,DWORD [84+esp] + add edi,DWORD [88+esp] + xor ebp,DWORD [4+ebx] + xor ecx,DWORD [8+ebx] + xor esi,DWORD [12+ebx] + xor edx,DWORD [20+ebx] + xor edi,DWORD [24+ebx] + mov DWORD [4+eax],ebp + mov DWORD [8+eax],ecx + mov DWORD [12+eax],esi + mov DWORD [20+eax],edx + mov DWORD 
[24+eax],edi + mov ebp,DWORD [28+esp] + mov ecx,DWORD [40+esp] + mov esi,DWORD [44+esp] + mov edx,DWORD [52+esp] + mov edi,DWORD [60+esp] + add ebp,DWORD [92+esp] + add ecx,DWORD [104+esp] + add esi,DWORD [108+esp] + add edx,DWORD [116+esp] + add edi,DWORD [124+esp] + xor ebp,DWORD [28+ebx] + xor ecx,DWORD [40+ebx] + xor esi,DWORD [44+ebx] + xor edx,DWORD [52+ebx] + xor edi,DWORD [60+ebx] + lea ebx,[64+ebx] + mov DWORD [28+eax],ebp + mov ebp,DWORD [esp] + mov DWORD [40+eax],ecx + mov ecx,DWORD [160+esp] + mov DWORD [44+eax],esi + mov DWORD [52+eax],edx + mov DWORD [60+eax],edi + mov DWORD [eax],ebp + lea eax,[64+eax] + sub ecx,64 + jnz NEAR L$003outer_loop + jmp NEAR L$006done +L$005tail: + add edx,DWORD [112+esp] + add edi,DWORD [120+esp] + mov DWORD [esp],eax + mov DWORD [16+esp],ebp + mov DWORD [32+esp],ecx + mov DWORD [36+esp],esi + mov DWORD [48+esp],edx + mov DWORD [56+esp],edi + mov ebp,DWORD [4+esp] + mov ecx,DWORD [8+esp] + mov esi,DWORD [12+esp] + mov edx,DWORD [20+esp] + mov edi,DWORD [24+esp] + add ebp,857760878 + add ecx,2036477234 + add esi,1797285236 + add edx,DWORD [84+esp] + add edi,DWORD [88+esp] + mov DWORD [4+esp],ebp + mov DWORD [8+esp],ecx + mov DWORD [12+esp],esi + mov DWORD [20+esp],edx + mov DWORD [24+esp],edi + mov ebp,DWORD [28+esp] + mov ecx,DWORD [40+esp] + mov esi,DWORD [44+esp] + mov edx,DWORD [52+esp] + mov edi,DWORD [60+esp] + add ebp,DWORD [92+esp] + add ecx,DWORD [104+esp] + add esi,DWORD [108+esp] + add edx,DWORD [116+esp] + add edi,DWORD [124+esp] + mov DWORD [28+esp],ebp + mov ebp,DWORD [156+esp] + mov DWORD [40+esp],ecx + mov ecx,DWORD [152+esp] + mov DWORD [44+esp],esi + xor esi,esi + mov DWORD [52+esp],edx + mov DWORD [60+esp],edi + xor eax,eax + xor edx,edx +L$007tail_loop: + mov al,BYTE [ebp*1+esi] + mov dl,BYTE [esi*1+esp] + lea esi,[1+esi] + xor al,dl + mov BYTE [esi*1+ecx-1],al + dec ebx + jnz NEAR L$007tail_loop +L$006done: + add esp,132 +L$000no_data: + pop edi + pop esi + pop ebx + pop ebp + ret +global _ChaCha20_ssse3 +align 16 +_ChaCha20_ssse3: +L$_ChaCha20_ssse3_begin: + push ebp + push ebx + push esi + push edi +L$ssse3_shortcut: + mov edi,DWORD [20+esp] + mov esi,DWORD [24+esp] + mov ecx,DWORD [28+esp] + mov edx,DWORD [32+esp] + mov ebx,DWORD [36+esp] + mov ebp,esp + sub esp,524 + and esp,-64 + mov DWORD [512+esp],ebp + lea eax,[(L$ssse3_data-L$pic_point)+eax] + movdqu xmm3,[ebx] + cmp ecx,256 + jb NEAR L$0081x + mov DWORD [516+esp],edx + mov DWORD [520+esp],ebx + sub ecx,256 + lea ebp,[384+esp] + movdqu xmm7,[edx] + pshufd xmm0,xmm3,0 + pshufd xmm1,xmm3,85 + pshufd xmm2,xmm3,170 + pshufd xmm3,xmm3,255 + paddd xmm0,[48+eax] + pshufd xmm4,xmm7,0 + pshufd xmm5,xmm7,85 + psubd xmm0,[64+eax] + pshufd xmm6,xmm7,170 + pshufd xmm7,xmm7,255 + movdqa [64+ebp],xmm0 + movdqa [80+ebp],xmm1 + movdqa [96+ebp],xmm2 + movdqa [112+ebp],xmm3 + movdqu xmm3,[16+edx] + movdqa [ebp-64],xmm4 + movdqa [ebp-48],xmm5 + movdqa [ebp-32],xmm6 + movdqa [ebp-16],xmm7 + movdqa xmm7,[32+eax] + lea ebx,[128+esp] + pshufd xmm0,xmm3,0 + pshufd xmm1,xmm3,85 + pshufd xmm2,xmm3,170 + pshufd xmm3,xmm3,255 + pshufd xmm4,xmm7,0 + pshufd xmm5,xmm7,85 + pshufd xmm6,xmm7,170 + pshufd xmm7,xmm7,255 + movdqa [ebp],xmm0 + movdqa [16+ebp],xmm1 + movdqa [32+ebp],xmm2 + movdqa [48+ebp],xmm3 + movdqa [ebp-128],xmm4 + movdqa [ebp-112],xmm5 + movdqa [ebp-96],xmm6 + movdqa [ebp-80],xmm7 + lea esi,[128+esi] + lea edi,[128+edi] + jmp NEAR L$009outer_loop +align 16 +L$009outer_loop: + movdqa xmm1,[ebp-112] + movdqa xmm2,[ebp-96] + movdqa xmm3,[ebp-80] + movdqa xmm5,[ebp-48] + movdqa 
xmm6,[ebp-32] + movdqa xmm7,[ebp-16] + movdqa [ebx-112],xmm1 + movdqa [ebx-96],xmm2 + movdqa [ebx-80],xmm3 + movdqa [ebx-48],xmm5 + movdqa [ebx-32],xmm6 + movdqa [ebx-16],xmm7 + movdqa xmm2,[32+ebp] + movdqa xmm3,[48+ebp] + movdqa xmm4,[64+ebp] + movdqa xmm5,[80+ebp] + movdqa xmm6,[96+ebp] + movdqa xmm7,[112+ebp] + paddd xmm4,[64+eax] + movdqa [32+ebx],xmm2 + movdqa [48+ebx],xmm3 + movdqa [64+ebx],xmm4 + movdqa [80+ebx],xmm5 + movdqa [96+ebx],xmm6 + movdqa [112+ebx],xmm7 + movdqa [64+ebp],xmm4 + movdqa xmm0,[ebp-128] + movdqa xmm6,xmm4 + movdqa xmm3,[ebp-64] + movdqa xmm4,[ebp] + movdqa xmm5,[16+ebp] + mov edx,10 + nop +align 16 +L$010loop: + paddd xmm0,xmm3 + movdqa xmm2,xmm3 + pxor xmm6,xmm0 + pshufb xmm6,[eax] + paddd xmm4,xmm6 + pxor xmm2,xmm4 + movdqa xmm3,[ebx-48] + movdqa xmm1,xmm2 + pslld xmm2,12 + psrld xmm1,20 + por xmm2,xmm1 + movdqa xmm1,[ebx-112] + paddd xmm0,xmm2 + movdqa xmm7,[80+ebx] + pxor xmm6,xmm0 + movdqa [ebx-128],xmm0 + pshufb xmm6,[16+eax] + paddd xmm4,xmm6 + movdqa [64+ebx],xmm6 + pxor xmm2,xmm4 + paddd xmm1,xmm3 + movdqa xmm0,xmm2 + pslld xmm2,7 + psrld xmm0,25 + pxor xmm7,xmm1 + por xmm2,xmm0 + movdqa [ebx],xmm4 + pshufb xmm7,[eax] + movdqa [ebx-64],xmm2 + paddd xmm5,xmm7 + movdqa xmm4,[32+ebx] + pxor xmm3,xmm5 + movdqa xmm2,[ebx-32] + movdqa xmm0,xmm3 + pslld xmm3,12 + psrld xmm0,20 + por xmm3,xmm0 + movdqa xmm0,[ebx-96] + paddd xmm1,xmm3 + movdqa xmm6,[96+ebx] + pxor xmm7,xmm1 + movdqa [ebx-112],xmm1 + pshufb xmm7,[16+eax] + paddd xmm5,xmm7 + movdqa [80+ebx],xmm7 + pxor xmm3,xmm5 + paddd xmm0,xmm2 + movdqa xmm1,xmm3 + pslld xmm3,7 + psrld xmm1,25 + pxor xmm6,xmm0 + por xmm3,xmm1 + movdqa [16+ebx],xmm5 + pshufb xmm6,[eax] + movdqa [ebx-48],xmm3 + paddd xmm4,xmm6 + movdqa xmm5,[48+ebx] + pxor xmm2,xmm4 + movdqa xmm3,[ebx-16] + movdqa xmm1,xmm2 + pslld xmm2,12 + psrld xmm1,20 + por xmm2,xmm1 + movdqa xmm1,[ebx-80] + paddd xmm0,xmm2 + movdqa xmm7,[112+ebx] + pxor xmm6,xmm0 + movdqa [ebx-96],xmm0 + pshufb xmm6,[16+eax] + paddd xmm4,xmm6 + movdqa [96+ebx],xmm6 + pxor xmm2,xmm4 + paddd xmm1,xmm3 + movdqa xmm0,xmm2 + pslld xmm2,7 + psrld xmm0,25 + pxor xmm7,xmm1 + por xmm2,xmm0 + pshufb xmm7,[eax] + movdqa [ebx-32],xmm2 + paddd xmm5,xmm7 + pxor xmm3,xmm5 + movdqa xmm2,[ebx-48] + movdqa xmm0,xmm3 + pslld xmm3,12 + psrld xmm0,20 + por xmm3,xmm0 + movdqa xmm0,[ebx-128] + paddd xmm1,xmm3 + pxor xmm7,xmm1 + movdqa [ebx-80],xmm1 + pshufb xmm7,[16+eax] + paddd xmm5,xmm7 + movdqa xmm6,xmm7 + pxor xmm3,xmm5 + paddd xmm0,xmm2 + movdqa xmm1,xmm3 + pslld xmm3,7 + psrld xmm1,25 + pxor xmm6,xmm0 + por xmm3,xmm1 + pshufb xmm6,[eax] + movdqa [ebx-16],xmm3 + paddd xmm4,xmm6 + pxor xmm2,xmm4 + movdqa xmm3,[ebx-32] + movdqa xmm1,xmm2 + pslld xmm2,12 + psrld xmm1,20 + por xmm2,xmm1 + movdqa xmm1,[ebx-112] + paddd xmm0,xmm2 + movdqa xmm7,[64+ebx] + pxor xmm6,xmm0 + movdqa [ebx-128],xmm0 + pshufb xmm6,[16+eax] + paddd xmm4,xmm6 + movdqa [112+ebx],xmm6 + pxor xmm2,xmm4 + paddd xmm1,xmm3 + movdqa xmm0,xmm2 + pslld xmm2,7 + psrld xmm0,25 + pxor xmm7,xmm1 + por xmm2,xmm0 + movdqa [32+ebx],xmm4 + pshufb xmm7,[eax] + movdqa [ebx-48],xmm2 + paddd xmm5,xmm7 + movdqa xmm4,[ebx] + pxor xmm3,xmm5 + movdqa xmm2,[ebx-16] + movdqa xmm0,xmm3 + pslld xmm3,12 + psrld xmm0,20 + por xmm3,xmm0 + movdqa xmm0,[ebx-96] + paddd xmm1,xmm3 + movdqa xmm6,[80+ebx] + pxor xmm7,xmm1 + movdqa [ebx-112],xmm1 + pshufb xmm7,[16+eax] + paddd xmm5,xmm7 + movdqa [64+ebx],xmm7 + pxor xmm3,xmm5 + paddd xmm0,xmm2 + movdqa xmm1,xmm3 + pslld xmm3,7 + psrld xmm1,25 + pxor xmm6,xmm0 + por xmm3,xmm1 + movdqa [48+ebx],xmm5 + pshufb 
xmm6,[eax] + movdqa [ebx-32],xmm3 + paddd xmm4,xmm6 + movdqa xmm5,[16+ebx] + pxor xmm2,xmm4 + movdqa xmm3,[ebx-64] + movdqa xmm1,xmm2 + pslld xmm2,12 + psrld xmm1,20 + por xmm2,xmm1 + movdqa xmm1,[ebx-80] + paddd xmm0,xmm2 + movdqa xmm7,[96+ebx] + pxor xmm6,xmm0 + movdqa [ebx-96],xmm0 + pshufb xmm6,[16+eax] + paddd xmm4,xmm6 + movdqa [80+ebx],xmm6 + pxor xmm2,xmm4 + paddd xmm1,xmm3 + movdqa xmm0,xmm2 + pslld xmm2,7 + psrld xmm0,25 + pxor xmm7,xmm1 + por xmm2,xmm0 + pshufb xmm7,[eax] + movdqa [ebx-16],xmm2 + paddd xmm5,xmm7 + pxor xmm3,xmm5 + movdqa xmm0,xmm3 + pslld xmm3,12 + psrld xmm0,20 + por xmm3,xmm0 + movdqa xmm0,[ebx-128] + paddd xmm1,xmm3 + movdqa xmm6,[64+ebx] + pxor xmm7,xmm1 + movdqa [ebx-80],xmm1 + pshufb xmm7,[16+eax] + paddd xmm5,xmm7 + movdqa [96+ebx],xmm7 + pxor xmm3,xmm5 + movdqa xmm1,xmm3 + pslld xmm3,7 + psrld xmm1,25 + por xmm3,xmm1 + dec edx + jnz NEAR L$010loop + movdqa [ebx-64],xmm3 + movdqa [ebx],xmm4 + movdqa [16+ebx],xmm5 + movdqa [64+ebx],xmm6 + movdqa [96+ebx],xmm7 + movdqa xmm1,[ebx-112] + movdqa xmm2,[ebx-96] + movdqa xmm3,[ebx-80] + paddd xmm0,[ebp-128] + paddd xmm1,[ebp-112] + paddd xmm2,[ebp-96] + paddd xmm3,[ebp-80] + movdqa xmm6,xmm0 + punpckldq xmm0,xmm1 + movdqa xmm7,xmm2 + punpckldq xmm2,xmm3 + punpckhdq xmm6,xmm1 + punpckhdq xmm7,xmm3 + movdqa xmm1,xmm0 + punpcklqdq xmm0,xmm2 + movdqa xmm3,xmm6 + punpcklqdq xmm6,xmm7 + punpckhqdq xmm1,xmm2 + punpckhqdq xmm3,xmm7 + movdqu xmm4,[esi-128] + movdqu xmm5,[esi-64] + movdqu xmm2,[esi] + movdqu xmm7,[64+esi] + lea esi,[16+esi] + pxor xmm4,xmm0 + movdqa xmm0,[ebx-64] + pxor xmm5,xmm1 + movdqa xmm1,[ebx-48] + pxor xmm6,xmm2 + movdqa xmm2,[ebx-32] + pxor xmm7,xmm3 + movdqa xmm3,[ebx-16] + movdqu [edi-128],xmm4 + movdqu [edi-64],xmm5 + movdqu [edi],xmm6 + movdqu [64+edi],xmm7 + lea edi,[16+edi] + paddd xmm0,[ebp-64] + paddd xmm1,[ebp-48] + paddd xmm2,[ebp-32] + paddd xmm3,[ebp-16] + movdqa xmm6,xmm0 + punpckldq xmm0,xmm1 + movdqa xmm7,xmm2 + punpckldq xmm2,xmm3 + punpckhdq xmm6,xmm1 + punpckhdq xmm7,xmm3 + movdqa xmm1,xmm0 + punpcklqdq xmm0,xmm2 + movdqa xmm3,xmm6 + punpcklqdq xmm6,xmm7 + punpckhqdq xmm1,xmm2 + punpckhqdq xmm3,xmm7 + movdqu xmm4,[esi-128] + movdqu xmm5,[esi-64] + movdqu xmm2,[esi] + movdqu xmm7,[64+esi] + lea esi,[16+esi] + pxor xmm4,xmm0 + movdqa xmm0,[ebx] + pxor xmm5,xmm1 + movdqa xmm1,[16+ebx] + pxor xmm6,xmm2 + movdqa xmm2,[32+ebx] + pxor xmm7,xmm3 + movdqa xmm3,[48+ebx] + movdqu [edi-128],xmm4 + movdqu [edi-64],xmm5 + movdqu [edi],xmm6 + movdqu [64+edi],xmm7 + lea edi,[16+edi] + paddd xmm0,[ebp] + paddd xmm1,[16+ebp] + paddd xmm2,[32+ebp] + paddd xmm3,[48+ebp] + movdqa xmm6,xmm0 + punpckldq xmm0,xmm1 + movdqa xmm7,xmm2 + punpckldq xmm2,xmm3 + punpckhdq xmm6,xmm1 + punpckhdq xmm7,xmm3 + movdqa xmm1,xmm0 + punpcklqdq xmm0,xmm2 + movdqa xmm3,xmm6 + punpcklqdq xmm6,xmm7 + punpckhqdq xmm1,xmm2 + punpckhqdq xmm3,xmm7 + movdqu xmm4,[esi-128] + movdqu xmm5,[esi-64] + movdqu xmm2,[esi] + movdqu xmm7,[64+esi] + lea esi,[16+esi] + pxor xmm4,xmm0 + movdqa xmm0,[64+ebx] + pxor xmm5,xmm1 + movdqa xmm1,[80+ebx] + pxor xmm6,xmm2 + movdqa xmm2,[96+ebx] + pxor xmm7,xmm3 + movdqa xmm3,[112+ebx] + movdqu [edi-128],xmm4 + movdqu [edi-64],xmm5 + movdqu [edi],xmm6 + movdqu [64+edi],xmm7 + lea edi,[16+edi] + paddd xmm0,[64+ebp] + paddd xmm1,[80+ebp] + paddd xmm2,[96+ebp] + paddd xmm3,[112+ebp] + movdqa xmm6,xmm0 + punpckldq xmm0,xmm1 + movdqa xmm7,xmm2 + punpckldq xmm2,xmm3 + punpckhdq xmm6,xmm1 + punpckhdq xmm7,xmm3 + movdqa xmm1,xmm0 + punpcklqdq xmm0,xmm2 + movdqa xmm3,xmm6 + punpcklqdq xmm6,xmm7 + punpckhqdq 
xmm1,xmm2 + punpckhqdq xmm3,xmm7 + movdqu xmm4,[esi-128] + movdqu xmm5,[esi-64] + movdqu xmm2,[esi] + movdqu xmm7,[64+esi] + lea esi,[208+esi] + pxor xmm4,xmm0 + pxor xmm5,xmm1 + pxor xmm6,xmm2 + pxor xmm7,xmm3 + movdqu [edi-128],xmm4 + movdqu [edi-64],xmm5 + movdqu [edi],xmm6 + movdqu [64+edi],xmm7 + lea edi,[208+edi] + sub ecx,256 + jnc NEAR L$009outer_loop + add ecx,256 + jz NEAR L$011done + mov ebx,DWORD [520+esp] + lea esi,[esi-128] + mov edx,DWORD [516+esp] + lea edi,[edi-128] + movd xmm2,DWORD [64+ebp] + movdqu xmm3,[ebx] + paddd xmm2,[96+eax] + pand xmm3,[112+eax] + por xmm3,xmm2 +L$0081x: + movdqa xmm0,[32+eax] + movdqu xmm1,[edx] + movdqu xmm2,[16+edx] + movdqa xmm6,[eax] + movdqa xmm7,[16+eax] + mov DWORD [48+esp],ebp + movdqa [esp],xmm0 + movdqa [16+esp],xmm1 + movdqa [32+esp],xmm2 + movdqa [48+esp],xmm3 + mov edx,10 + jmp NEAR L$012loop1x +align 16 +L$013outer1x: + movdqa xmm3,[80+eax] + movdqa xmm0,[esp] + movdqa xmm1,[16+esp] + movdqa xmm2,[32+esp] + paddd xmm3,[48+esp] + mov edx,10 + movdqa [48+esp],xmm3 + jmp NEAR L$012loop1x +align 16 +L$012loop1x: + paddd xmm0,xmm1 + pxor xmm3,xmm0 +db 102,15,56,0,222 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,20 + pslld xmm4,12 + por xmm1,xmm4 + paddd xmm0,xmm1 + pxor xmm3,xmm0 +db 102,15,56,0,223 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,25 + pslld xmm4,7 + por xmm1,xmm4 + pshufd xmm2,xmm2,78 + pshufd xmm1,xmm1,57 + pshufd xmm3,xmm3,147 + nop + paddd xmm0,xmm1 + pxor xmm3,xmm0 +db 102,15,56,0,222 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,20 + pslld xmm4,12 + por xmm1,xmm4 + paddd xmm0,xmm1 + pxor xmm3,xmm0 +db 102,15,56,0,223 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,25 + pslld xmm4,7 + por xmm1,xmm4 + pshufd xmm2,xmm2,78 + pshufd xmm1,xmm1,147 + pshufd xmm3,xmm3,57 + dec edx + jnz NEAR L$012loop1x + paddd xmm0,[esp] + paddd xmm1,[16+esp] + paddd xmm2,[32+esp] + paddd xmm3,[48+esp] + cmp ecx,64 + jb NEAR L$014tail + movdqu xmm4,[esi] + movdqu xmm5,[16+esi] + pxor xmm0,xmm4 + movdqu xmm4,[32+esi] + pxor xmm1,xmm5 + movdqu xmm5,[48+esi] + pxor xmm2,xmm4 + pxor xmm3,xmm5 + lea esi,[64+esi] + movdqu [edi],xmm0 + movdqu [16+edi],xmm1 + movdqu [32+edi],xmm2 + movdqu [48+edi],xmm3 + lea edi,[64+edi] + sub ecx,64 + jnz NEAR L$013outer1x + jmp NEAR L$011done +L$014tail: + movdqa [esp],xmm0 + movdqa [16+esp],xmm1 + movdqa [32+esp],xmm2 + movdqa [48+esp],xmm3 + xor eax,eax + xor edx,edx + xor ebp,ebp +L$015tail_loop: + mov al,BYTE [ebp*1+esp] + mov dl,BYTE [ebp*1+esi] + lea ebp,[1+ebp] + xor al,dl + mov BYTE [ebp*1+edi-1],al + dec ecx + jnz NEAR L$015tail_loop +L$011done: + mov esp,DWORD [512+esp] + pop edi + pop esi + pop ebx + pop ebp + ret +align 64 +L$ssse3_data: +db 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 +db 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 +dd 1634760805,857760878,2036477234,1797285236 +dd 0,1,2,3 +dd 4,4,4,4 +dd 1,0,0,0 +dd 4,0,0,0 +dd 0,-1,-1,-1 +align 64 +db 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54 +db 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 +db 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 +db 114,103,62,0 +segment .bss +common _OPENSSL_ia32cap_P 16 diff --git a/packager/third_party/boringssl/win-x86/crypto/rc4/rc4-586.asm b/packager/third_party/boringssl/win-x86/crypto/rc4/rc4-586.asm index 08cd9f6d70..0bab2bec85 100644 --- a/packager/third_party/boringssl/win-x86/crypto/rc4/rc4-586.asm +++ b/packager/third_party/boringssl/win-x86/crypto/rc4/rc4-586.asm @@ -349,34 +349,5 @@ L$013exit: pop ebx pop 
ebp ret -global _RC4_options -align 16 -_RC4_options: -L$_RC4_options_begin: - call L$016pic_point -L$016pic_point: - pop eax - lea eax,[(L$017opts-L$016pic_point)+eax] - lea edx,[_OPENSSL_ia32cap_P] - mov edx,DWORD [edx] - bt edx,20 - jc NEAR L$0181xchar - bt edx,26 - jnc NEAR L$019ret - add eax,25 - ret -L$0181xchar: - add eax,12 -L$019ret: - ret -align 64 -L$017opts: -db 114,99,52,40,52,120,44,105,110,116,41,0 -db 114,99,52,40,49,120,44,99,104,97,114,41,0 -db 114,99,52,40,56,120,44,109,109,120,41,0 -db 82,67,52,32,102,111,114,32,120,56,54,44,32,67,82,89 -db 80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114 -db 111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -align 64 segment .bss common _OPENSSL_ia32cap_P 16 diff --git a/packager/third_party/boringssl/win-x86/crypto/sha/sha1-586.asm b/packager/third_party/boringssl/win-x86/crypto/sha/sha1-586.asm index e24449d200..cee8c62626 100644 --- a/packager/third_party/boringssl/win-x86/crypto/sha/sha1-586.asm +++ b/packager/third_party/boringssl/win-x86/crypto/sha/sha1-586.asm @@ -35,8 +35,11 @@ L$000pic_point: mov ecx,DWORD [8+esi] test eax,16777216 jz NEAR L$001x86 - test ecx,536870912 - jnz NEAR L$shaext_shortcut + and edx,268435456 + and eax,1073741824 + or eax,edx + cmp eax,1342177280 + je NEAR L$avx_shortcut jmp NEAR L$ssse3_shortcut align 16 L$001x86: @@ -1405,7 +1408,7 @@ L$002loop: pop ebp ret align 16 -__sha1_block_data_order_shaext: +__sha1_block_data_order_ssse3: push ebp push ebx push esi @@ -1414,174 +1417,6 @@ __sha1_block_data_order_shaext: L$003pic_point: pop ebp lea ebp,[(L$K_XX_XX-L$003pic_point)+ebp] -L$shaext_shortcut: - mov edi,DWORD [20+esp] - mov ebx,esp - mov esi,DWORD [24+esp] - mov ecx,DWORD [28+esp] - sub esp,32 - movdqu xmm0,[edi] - movd xmm1,DWORD [16+edi] - and esp,-32 - movdqa xmm3,[80+ebp] - movdqu xmm4,[esi] - pshufd xmm0,xmm0,27 - movdqu xmm5,[16+esi] - pshufd xmm1,xmm1,27 - movdqu xmm6,[32+esi] -db 102,15,56,0,227 - movdqu xmm7,[48+esi] -db 102,15,56,0,235 -db 102,15,56,0,243 -db 102,15,56,0,251 - jmp NEAR L$004loop_shaext -align 16 -L$004loop_shaext: - dec ecx - lea eax,[64+esi] - movdqa [esp],xmm1 - paddd xmm1,xmm4 - cmovne esi,eax - movdqa [16+esp],xmm0 -db 15,56,201,229 - movdqa xmm2,xmm0 -db 15,58,204,193,0 -db 15,56,200,213 - pxor xmm4,xmm6 -db 15,56,201,238 -db 15,56,202,231 - movdqa xmm1,xmm0 -db 15,58,204,194,0 -db 15,56,200,206 - pxor xmm5,xmm7 -db 15,56,202,236 -db 15,56,201,247 - movdqa xmm2,xmm0 -db 15,58,204,193,0 -db 15,56,200,215 - pxor xmm6,xmm4 -db 15,56,201,252 -db 15,56,202,245 - movdqa xmm1,xmm0 -db 15,58,204,194,0 -db 15,56,200,204 - pxor xmm7,xmm5 -db 15,56,202,254 -db 15,56,201,229 - movdqa xmm2,xmm0 -db 15,58,204,193,0 -db 15,56,200,213 - pxor xmm4,xmm6 -db 15,56,201,238 -db 15,56,202,231 - movdqa xmm1,xmm0 -db 15,58,204,194,1 -db 15,56,200,206 - pxor xmm5,xmm7 -db 15,56,202,236 -db 15,56,201,247 - movdqa xmm2,xmm0 -db 15,58,204,193,1 -db 15,56,200,215 - pxor xmm6,xmm4 -db 15,56,201,252 -db 15,56,202,245 - movdqa xmm1,xmm0 -db 15,58,204,194,1 -db 15,56,200,204 - pxor xmm7,xmm5 -db 15,56,202,254 -db 15,56,201,229 - movdqa xmm2,xmm0 -db 15,58,204,193,1 -db 15,56,200,213 - pxor xmm4,xmm6 -db 15,56,201,238 -db 15,56,202,231 - movdqa xmm1,xmm0 -db 15,58,204,194,1 -db 15,56,200,206 - pxor xmm5,xmm7 -db 15,56,202,236 -db 15,56,201,247 - movdqa xmm2,xmm0 -db 15,58,204,193,2 -db 15,56,200,215 - pxor xmm6,xmm4 -db 15,56,201,252 -db 15,56,202,245 - movdqa xmm1,xmm0 -db 15,58,204,194,2 -db 15,56,200,204 - pxor xmm7,xmm5 -db 15,56,202,254 -db 15,56,201,229 - movdqa xmm2,xmm0 -db 
15,58,204,193,2 -db 15,56,200,213 - pxor xmm4,xmm6 -db 15,56,201,238 -db 15,56,202,231 - movdqa xmm1,xmm0 -db 15,58,204,194,2 -db 15,56,200,206 - pxor xmm5,xmm7 -db 15,56,202,236 -db 15,56,201,247 - movdqa xmm2,xmm0 -db 15,58,204,193,2 -db 15,56,200,215 - pxor xmm6,xmm4 -db 15,56,201,252 -db 15,56,202,245 - movdqa xmm1,xmm0 -db 15,58,204,194,3 -db 15,56,200,204 - pxor xmm7,xmm5 -db 15,56,202,254 - movdqu xmm4,[esi] - movdqa xmm2,xmm0 -db 15,58,204,193,3 -db 15,56,200,213 - movdqu xmm5,[16+esi] -db 102,15,56,0,227 - movdqa xmm1,xmm0 -db 15,58,204,194,3 -db 15,56,200,206 - movdqu xmm6,[32+esi] -db 102,15,56,0,235 - movdqa xmm2,xmm0 -db 15,58,204,193,3 -db 15,56,200,215 - movdqu xmm7,[48+esi] -db 102,15,56,0,243 - movdqa xmm1,xmm0 -db 15,58,204,194,3 - movdqa xmm2,[esp] -db 102,15,56,0,251 -db 15,56,200,202 - paddd xmm0,[16+esp] - jnz NEAR L$004loop_shaext - pshufd xmm0,xmm0,27 - pshufd xmm1,xmm1,27 - movdqu [edi],xmm0 - movd DWORD [16+edi],xmm1 - mov esp,ebx - pop edi - pop esi - pop ebx - pop ebp - ret -align 16 -__sha1_block_data_order_ssse3: - push ebp - push ebx - push esi - push edi - call L$005pic_point -L$005pic_point: - pop ebp - lea ebp,[(L$K_XX_XX-L$005pic_point)+ebp] L$ssse3_shortcut: movdqa xmm7,[ebp] movdqa xmm0,[16+ebp] @@ -1634,9 +1469,9 @@ db 102,15,56,0,222 xor ebp,edx pshufd xmm4,xmm0,238 and esi,ebp - jmp NEAR L$006loop + jmp NEAR L$004loop align 16 -L$006loop: +L$004loop: ror ebx,2 xor esi,edx mov ebp,eax @@ -2539,7 +2374,7 @@ L$006loop: add ecx,edx mov ebp,DWORD [196+esp] cmp ebp,DWORD [200+esp] - je NEAR L$007done + je NEAR L$005done movdqa xmm7,[160+esp] movdqa xmm6,[176+esp] movdqu xmm0,[ebp] @@ -2674,9 +2509,9 @@ db 102,15,56,0,222 pshufd xmm4,xmm0,238 and esi,ebx mov ebx,ebp - jmp NEAR L$006loop + jmp NEAR L$004loop align 16 -L$007done: +L$005done: add ebx,DWORD [16+esp] xor esi,edi mov ebp,ecx @@ -2789,6 +2624,1174 @@ L$007done: pop ebx pop ebp ret +align 16 +__sha1_block_data_order_avx: + push ebp + push ebx + push esi + push edi + call L$006pic_point +L$006pic_point: + pop ebp + lea ebp,[(L$K_XX_XX-L$006pic_point)+ebp] +L$avx_shortcut: + vzeroall + vmovdqa xmm7,[ebp] + vmovdqa xmm0,[16+ebp] + vmovdqa xmm1,[32+ebp] + vmovdqa xmm2,[48+ebp] + vmovdqa xmm6,[64+ebp] + mov edi,DWORD [20+esp] + mov ebp,DWORD [24+esp] + mov edx,DWORD [28+esp] + mov esi,esp + sub esp,208 + and esp,-64 + vmovdqa [112+esp],xmm0 + vmovdqa [128+esp],xmm1 + vmovdqa [144+esp],xmm2 + shl edx,6 + vmovdqa [160+esp],xmm7 + add edx,ebp + vmovdqa [176+esp],xmm6 + add ebp,64 + mov DWORD [192+esp],edi + mov DWORD [196+esp],ebp + mov DWORD [200+esp],edx + mov DWORD [204+esp],esi + mov eax,DWORD [edi] + mov ebx,DWORD [4+edi] + mov ecx,DWORD [8+edi] + mov edx,DWORD [12+edi] + mov edi,DWORD [16+edi] + mov esi,ebx + vmovdqu xmm0,[ebp-64] + vmovdqu xmm1,[ebp-48] + vmovdqu xmm2,[ebp-32] + vmovdqu xmm3,[ebp-16] + vpshufb xmm0,xmm0,xmm6 + vpshufb xmm1,xmm1,xmm6 + vpshufb xmm2,xmm2,xmm6 + vmovdqa [96+esp],xmm7 + vpshufb xmm3,xmm3,xmm6 + vpaddd xmm4,xmm0,xmm7 + vpaddd xmm5,xmm1,xmm7 + vpaddd xmm6,xmm2,xmm7 + vmovdqa [esp],xmm4 + mov ebp,ecx + vmovdqa [16+esp],xmm5 + xor ebp,edx + vmovdqa [32+esp],xmm6 + and esi,ebp + jmp NEAR L$007loop +align 16 +L$007loop: + shrd ebx,ebx,2 + xor esi,edx + vpalignr xmm4,xmm1,xmm0,8 + mov ebp,eax + add edi,DWORD [esp] + vpaddd xmm7,xmm7,xmm3 + vmovdqa [64+esp],xmm0 + xor ebx,ecx + shld eax,eax,5 + vpsrldq xmm6,xmm3,4 + add edi,esi + and ebp,ebx + vpxor xmm4,xmm4,xmm0 + xor ebx,ecx + add edi,eax + vpxor xmm6,xmm6,xmm2 + shrd eax,eax,7 + xor ebp,ecx + vmovdqa [48+esp],xmm7 + mov 
esi,edi + add edx,DWORD [4+esp] + vpxor xmm4,xmm4,xmm6 + xor eax,ebx + shld edi,edi,5 + add edx,ebp + and esi,eax + vpsrld xmm6,xmm4,31 + xor eax,ebx + add edx,edi + shrd edi,edi,7 + xor esi,ebx + vpslldq xmm0,xmm4,12 + vpaddd xmm4,xmm4,xmm4 + mov ebp,edx + add ecx,DWORD [8+esp] + xor edi,eax + shld edx,edx,5 + vpsrld xmm7,xmm0,30 + vpor xmm4,xmm4,xmm6 + add ecx,esi + and ebp,edi + xor edi,eax + add ecx,edx + vpslld xmm0,xmm0,2 + shrd edx,edx,7 + xor ebp,eax + vpxor xmm4,xmm4,xmm7 + mov esi,ecx + add ebx,DWORD [12+esp] + xor edx,edi + shld ecx,ecx,5 + vpxor xmm4,xmm4,xmm0 + add ebx,ebp + and esi,edx + vmovdqa xmm0,[96+esp] + xor edx,edi + add ebx,ecx + shrd ecx,ecx,7 + xor esi,edi + vpalignr xmm5,xmm2,xmm1,8 + mov ebp,ebx + add eax,DWORD [16+esp] + vpaddd xmm0,xmm0,xmm4 + vmovdqa [80+esp],xmm1 + xor ecx,edx + shld ebx,ebx,5 + vpsrldq xmm7,xmm4,4 + add eax,esi + and ebp,ecx + vpxor xmm5,xmm5,xmm1 + xor ecx,edx + add eax,ebx + vpxor xmm7,xmm7,xmm3 + shrd ebx,ebx,7 + xor ebp,edx + vmovdqa [esp],xmm0 + mov esi,eax + add edi,DWORD [20+esp] + vpxor xmm5,xmm5,xmm7 + xor ebx,ecx + shld eax,eax,5 + add edi,ebp + and esi,ebx + vpsrld xmm7,xmm5,31 + xor ebx,ecx + add edi,eax + shrd eax,eax,7 + xor esi,ecx + vpslldq xmm1,xmm5,12 + vpaddd xmm5,xmm5,xmm5 + mov ebp,edi + add edx,DWORD [24+esp] + xor eax,ebx + shld edi,edi,5 + vpsrld xmm0,xmm1,30 + vpor xmm5,xmm5,xmm7 + add edx,esi + and ebp,eax + xor eax,ebx + add edx,edi + vpslld xmm1,xmm1,2 + shrd edi,edi,7 + xor ebp,ebx + vpxor xmm5,xmm5,xmm0 + mov esi,edx + add ecx,DWORD [28+esp] + xor edi,eax + shld edx,edx,5 + vpxor xmm5,xmm5,xmm1 + add ecx,ebp + and esi,edi + vmovdqa xmm1,[112+esp] + xor edi,eax + add ecx,edx + shrd edx,edx,7 + xor esi,eax + vpalignr xmm6,xmm3,xmm2,8 + mov ebp,ecx + add ebx,DWORD [32+esp] + vpaddd xmm1,xmm1,xmm5 + vmovdqa [96+esp],xmm2 + xor edx,edi + shld ecx,ecx,5 + vpsrldq xmm0,xmm5,4 + add ebx,esi + and ebp,edx + vpxor xmm6,xmm6,xmm2 + xor edx,edi + add ebx,ecx + vpxor xmm0,xmm0,xmm4 + shrd ecx,ecx,7 + xor ebp,edi + vmovdqa [16+esp],xmm1 + mov esi,ebx + add eax,DWORD [36+esp] + vpxor xmm6,xmm6,xmm0 + xor ecx,edx + shld ebx,ebx,5 + add eax,ebp + and esi,ecx + vpsrld xmm0,xmm6,31 + xor ecx,edx + add eax,ebx + shrd ebx,ebx,7 + xor esi,edx + vpslldq xmm2,xmm6,12 + vpaddd xmm6,xmm6,xmm6 + mov ebp,eax + add edi,DWORD [40+esp] + xor ebx,ecx + shld eax,eax,5 + vpsrld xmm1,xmm2,30 + vpor xmm6,xmm6,xmm0 + add edi,esi + and ebp,ebx + xor ebx,ecx + add edi,eax + vpslld xmm2,xmm2,2 + vmovdqa xmm0,[64+esp] + shrd eax,eax,7 + xor ebp,ecx + vpxor xmm6,xmm6,xmm1 + mov esi,edi + add edx,DWORD [44+esp] + xor eax,ebx + shld edi,edi,5 + vpxor xmm6,xmm6,xmm2 + add edx,ebp + and esi,eax + vmovdqa xmm2,[112+esp] + xor eax,ebx + add edx,edi + shrd edi,edi,7 + xor esi,ebx + vpalignr xmm7,xmm4,xmm3,8 + mov ebp,edx + add ecx,DWORD [48+esp] + vpaddd xmm2,xmm2,xmm6 + vmovdqa [64+esp],xmm3 + xor edi,eax + shld edx,edx,5 + vpsrldq xmm1,xmm6,4 + add ecx,esi + and ebp,edi + vpxor xmm7,xmm7,xmm3 + xor edi,eax + add ecx,edx + vpxor xmm1,xmm1,xmm5 + shrd edx,edx,7 + xor ebp,eax + vmovdqa [32+esp],xmm2 + mov esi,ecx + add ebx,DWORD [52+esp] + vpxor xmm7,xmm7,xmm1 + xor edx,edi + shld ecx,ecx,5 + add ebx,ebp + and esi,edx + vpsrld xmm1,xmm7,31 + xor edx,edi + add ebx,ecx + shrd ecx,ecx,7 + xor esi,edi + vpslldq xmm3,xmm7,12 + vpaddd xmm7,xmm7,xmm7 + mov ebp,ebx + add eax,DWORD [56+esp] + xor ecx,edx + shld ebx,ebx,5 + vpsrld xmm2,xmm3,30 + vpor xmm7,xmm7,xmm1 + add eax,esi + and ebp,ecx + xor ecx,edx + add eax,ebx + vpslld xmm3,xmm3,2 + vmovdqa xmm1,[80+esp] + shrd 
ebx,ebx,7 + xor ebp,edx + vpxor xmm7,xmm7,xmm2 + mov esi,eax + add edi,DWORD [60+esp] + xor ebx,ecx + shld eax,eax,5 + vpxor xmm7,xmm7,xmm3 + add edi,ebp + and esi,ebx + vmovdqa xmm3,[112+esp] + xor ebx,ecx + add edi,eax + vpalignr xmm2,xmm7,xmm6,8 + vpxor xmm0,xmm0,xmm4 + shrd eax,eax,7 + xor esi,ecx + mov ebp,edi + add edx,DWORD [esp] + vpxor xmm0,xmm0,xmm1 + vmovdqa [80+esp],xmm4 + xor eax,ebx + shld edi,edi,5 + vmovdqa xmm4,xmm3 + vpaddd xmm3,xmm3,xmm7 + add edx,esi + and ebp,eax + vpxor xmm0,xmm0,xmm2 + xor eax,ebx + add edx,edi + shrd edi,edi,7 + xor ebp,ebx + vpsrld xmm2,xmm0,30 + vmovdqa [48+esp],xmm3 + mov esi,edx + add ecx,DWORD [4+esp] + xor edi,eax + shld edx,edx,5 + vpslld xmm0,xmm0,2 + add ecx,ebp + and esi,edi + xor edi,eax + add ecx,edx + shrd edx,edx,7 + xor esi,eax + mov ebp,ecx + add ebx,DWORD [8+esp] + vpor xmm0,xmm0,xmm2 + xor edx,edi + shld ecx,ecx,5 + vmovdqa xmm2,[96+esp] + add ebx,esi + and ebp,edx + xor edx,edi + add ebx,ecx + add eax,DWORD [12+esp] + xor ebp,edi + mov esi,ebx + shld ebx,ebx,5 + add eax,ebp + xor esi,edx + shrd ecx,ecx,7 + add eax,ebx + vpalignr xmm3,xmm0,xmm7,8 + vpxor xmm1,xmm1,xmm5 + add edi,DWORD [16+esp] + xor esi,ecx + mov ebp,eax + shld eax,eax,5 + vpxor xmm1,xmm1,xmm2 + vmovdqa [96+esp],xmm5 + add edi,esi + xor ebp,ecx + vmovdqa xmm5,xmm4 + vpaddd xmm4,xmm4,xmm0 + shrd ebx,ebx,7 + add edi,eax + vpxor xmm1,xmm1,xmm3 + add edx,DWORD [20+esp] + xor ebp,ebx + mov esi,edi + shld edi,edi,5 + vpsrld xmm3,xmm1,30 + vmovdqa [esp],xmm4 + add edx,ebp + xor esi,ebx + shrd eax,eax,7 + add edx,edi + vpslld xmm1,xmm1,2 + add ecx,DWORD [24+esp] + xor esi,eax + mov ebp,edx + shld edx,edx,5 + add ecx,esi + xor ebp,eax + shrd edi,edi,7 + add ecx,edx + vpor xmm1,xmm1,xmm3 + add ebx,DWORD [28+esp] + xor ebp,edi + vmovdqa xmm3,[64+esp] + mov esi,ecx + shld ecx,ecx,5 + add ebx,ebp + xor esi,edi + shrd edx,edx,7 + add ebx,ecx + vpalignr xmm4,xmm1,xmm0,8 + vpxor xmm2,xmm2,xmm6 + add eax,DWORD [32+esp] + xor esi,edx + mov ebp,ebx + shld ebx,ebx,5 + vpxor xmm2,xmm2,xmm3 + vmovdqa [64+esp],xmm6 + add eax,esi + xor ebp,edx + vmovdqa xmm6,[128+esp] + vpaddd xmm5,xmm5,xmm1 + shrd ecx,ecx,7 + add eax,ebx + vpxor xmm2,xmm2,xmm4 + add edi,DWORD [36+esp] + xor ebp,ecx + mov esi,eax + shld eax,eax,5 + vpsrld xmm4,xmm2,30 + vmovdqa [16+esp],xmm5 + add edi,ebp + xor esi,ecx + shrd ebx,ebx,7 + add edi,eax + vpslld xmm2,xmm2,2 + add edx,DWORD [40+esp] + xor esi,ebx + mov ebp,edi + shld edi,edi,5 + add edx,esi + xor ebp,ebx + shrd eax,eax,7 + add edx,edi + vpor xmm2,xmm2,xmm4 + add ecx,DWORD [44+esp] + xor ebp,eax + vmovdqa xmm4,[80+esp] + mov esi,edx + shld edx,edx,5 + add ecx,ebp + xor esi,eax + shrd edi,edi,7 + add ecx,edx + vpalignr xmm5,xmm2,xmm1,8 + vpxor xmm3,xmm3,xmm7 + add ebx,DWORD [48+esp] + xor esi,edi + mov ebp,ecx + shld ecx,ecx,5 + vpxor xmm3,xmm3,xmm4 + vmovdqa [80+esp],xmm7 + add ebx,esi + xor ebp,edi + vmovdqa xmm7,xmm6 + vpaddd xmm6,xmm6,xmm2 + shrd edx,edx,7 + add ebx,ecx + vpxor xmm3,xmm3,xmm5 + add eax,DWORD [52+esp] + xor ebp,edx + mov esi,ebx + shld ebx,ebx,5 + vpsrld xmm5,xmm3,30 + vmovdqa [32+esp],xmm6 + add eax,ebp + xor esi,edx + shrd ecx,ecx,7 + add eax,ebx + vpslld xmm3,xmm3,2 + add edi,DWORD [56+esp] + xor esi,ecx + mov ebp,eax + shld eax,eax,5 + add edi,esi + xor ebp,ecx + shrd ebx,ebx,7 + add edi,eax + vpor xmm3,xmm3,xmm5 + add edx,DWORD [60+esp] + xor ebp,ebx + vmovdqa xmm5,[96+esp] + mov esi,edi + shld edi,edi,5 + add edx,ebp + xor esi,ebx + shrd eax,eax,7 + add edx,edi + vpalignr xmm6,xmm3,xmm2,8 + vpxor xmm4,xmm4,xmm0 + add ecx,DWORD [esp] + 
xor esi,eax + mov ebp,edx + shld edx,edx,5 + vpxor xmm4,xmm4,xmm5 + vmovdqa [96+esp],xmm0 + add ecx,esi + xor ebp,eax + vmovdqa xmm0,xmm7 + vpaddd xmm7,xmm7,xmm3 + shrd edi,edi,7 + add ecx,edx + vpxor xmm4,xmm4,xmm6 + add ebx,DWORD [4+esp] + xor ebp,edi + mov esi,ecx + shld ecx,ecx,5 + vpsrld xmm6,xmm4,30 + vmovdqa [48+esp],xmm7 + add ebx,ebp + xor esi,edi + shrd edx,edx,7 + add ebx,ecx + vpslld xmm4,xmm4,2 + add eax,DWORD [8+esp] + xor esi,edx + mov ebp,ebx + shld ebx,ebx,5 + add eax,esi + xor ebp,edx + shrd ecx,ecx,7 + add eax,ebx + vpor xmm4,xmm4,xmm6 + add edi,DWORD [12+esp] + xor ebp,ecx + vmovdqa xmm6,[64+esp] + mov esi,eax + shld eax,eax,5 + add edi,ebp + xor esi,ecx + shrd ebx,ebx,7 + add edi,eax + vpalignr xmm7,xmm4,xmm3,8 + vpxor xmm5,xmm5,xmm1 + add edx,DWORD [16+esp] + xor esi,ebx + mov ebp,edi + shld edi,edi,5 + vpxor xmm5,xmm5,xmm6 + vmovdqa [64+esp],xmm1 + add edx,esi + xor ebp,ebx + vmovdqa xmm1,xmm0 + vpaddd xmm0,xmm0,xmm4 + shrd eax,eax,7 + add edx,edi + vpxor xmm5,xmm5,xmm7 + add ecx,DWORD [20+esp] + xor ebp,eax + mov esi,edx + shld edx,edx,5 + vpsrld xmm7,xmm5,30 + vmovdqa [esp],xmm0 + add ecx,ebp + xor esi,eax + shrd edi,edi,7 + add ecx,edx + vpslld xmm5,xmm5,2 + add ebx,DWORD [24+esp] + xor esi,edi + mov ebp,ecx + shld ecx,ecx,5 + add ebx,esi + xor ebp,edi + shrd edx,edx,7 + add ebx,ecx + vpor xmm5,xmm5,xmm7 + add eax,DWORD [28+esp] + vmovdqa xmm7,[80+esp] + shrd ecx,ecx,7 + mov esi,ebx + xor ebp,edx + shld ebx,ebx,5 + add eax,ebp + xor esi,ecx + xor ecx,edx + add eax,ebx + vpalignr xmm0,xmm5,xmm4,8 + vpxor xmm6,xmm6,xmm2 + add edi,DWORD [32+esp] + and esi,ecx + xor ecx,edx + shrd ebx,ebx,7 + vpxor xmm6,xmm6,xmm7 + vmovdqa [80+esp],xmm2 + mov ebp,eax + xor esi,ecx + vmovdqa xmm2,xmm1 + vpaddd xmm1,xmm1,xmm5 + shld eax,eax,5 + add edi,esi + vpxor xmm6,xmm6,xmm0 + xor ebp,ebx + xor ebx,ecx + add edi,eax + add edx,DWORD [36+esp] + vpsrld xmm0,xmm6,30 + vmovdqa [16+esp],xmm1 + and ebp,ebx + xor ebx,ecx + shrd eax,eax,7 + mov esi,edi + vpslld xmm6,xmm6,2 + xor ebp,ebx + shld edi,edi,5 + add edx,ebp + xor esi,eax + xor eax,ebx + add edx,edi + add ecx,DWORD [40+esp] + and esi,eax + vpor xmm6,xmm6,xmm0 + xor eax,ebx + shrd edi,edi,7 + vmovdqa xmm0,[96+esp] + mov ebp,edx + xor esi,eax + shld edx,edx,5 + add ecx,esi + xor ebp,edi + xor edi,eax + add ecx,edx + add ebx,DWORD [44+esp] + and ebp,edi + xor edi,eax + shrd edx,edx,7 + mov esi,ecx + xor ebp,edi + shld ecx,ecx,5 + add ebx,ebp + xor esi,edx + xor edx,edi + add ebx,ecx + vpalignr xmm1,xmm6,xmm5,8 + vpxor xmm7,xmm7,xmm3 + add eax,DWORD [48+esp] + and esi,edx + xor edx,edi + shrd ecx,ecx,7 + vpxor xmm7,xmm7,xmm0 + vmovdqa [96+esp],xmm3 + mov ebp,ebx + xor esi,edx + vmovdqa xmm3,[144+esp] + vpaddd xmm2,xmm2,xmm6 + shld ebx,ebx,5 + add eax,esi + vpxor xmm7,xmm7,xmm1 + xor ebp,ecx + xor ecx,edx + add eax,ebx + add edi,DWORD [52+esp] + vpsrld xmm1,xmm7,30 + vmovdqa [32+esp],xmm2 + and ebp,ecx + xor ecx,edx + shrd ebx,ebx,7 + mov esi,eax + vpslld xmm7,xmm7,2 + xor ebp,ecx + shld eax,eax,5 + add edi,ebp + xor esi,ebx + xor ebx,ecx + add edi,eax + add edx,DWORD [56+esp] + and esi,ebx + vpor xmm7,xmm7,xmm1 + xor ebx,ecx + shrd eax,eax,7 + vmovdqa xmm1,[64+esp] + mov ebp,edi + xor esi,ebx + shld edi,edi,5 + add edx,esi + xor ebp,eax + xor eax,ebx + add edx,edi + add ecx,DWORD [60+esp] + and ebp,eax + xor eax,ebx + shrd edi,edi,7 + mov esi,edx + xor ebp,eax + shld edx,edx,5 + add ecx,ebp + xor esi,edi + xor edi,eax + add ecx,edx + vpalignr xmm2,xmm7,xmm6,8 + vpxor xmm0,xmm0,xmm4 + add ebx,DWORD [esp] + and esi,edi + xor edi,eax + 
shrd edx,edx,7 + vpxor xmm0,xmm0,xmm1 + vmovdqa [64+esp],xmm4 + mov ebp,ecx + xor esi,edi + vmovdqa xmm4,xmm3 + vpaddd xmm3,xmm3,xmm7 + shld ecx,ecx,5 + add ebx,esi + vpxor xmm0,xmm0,xmm2 + xor ebp,edx + xor edx,edi + add ebx,ecx + add eax,DWORD [4+esp] + vpsrld xmm2,xmm0,30 + vmovdqa [48+esp],xmm3 + and ebp,edx + xor edx,edi + shrd ecx,ecx,7 + mov esi,ebx + vpslld xmm0,xmm0,2 + xor ebp,edx + shld ebx,ebx,5 + add eax,ebp + xor esi,ecx + xor ecx,edx + add eax,ebx + add edi,DWORD [8+esp] + and esi,ecx + vpor xmm0,xmm0,xmm2 + xor ecx,edx + shrd ebx,ebx,7 + vmovdqa xmm2,[80+esp] + mov ebp,eax + xor esi,ecx + shld eax,eax,5 + add edi,esi + xor ebp,ebx + xor ebx,ecx + add edi,eax + add edx,DWORD [12+esp] + and ebp,ebx + xor ebx,ecx + shrd eax,eax,7 + mov esi,edi + xor ebp,ebx + shld edi,edi,5 + add edx,ebp + xor esi,eax + xor eax,ebx + add edx,edi + vpalignr xmm3,xmm0,xmm7,8 + vpxor xmm1,xmm1,xmm5 + add ecx,DWORD [16+esp] + and esi,eax + xor eax,ebx + shrd edi,edi,7 + vpxor xmm1,xmm1,xmm2 + vmovdqa [80+esp],xmm5 + mov ebp,edx + xor esi,eax + vmovdqa xmm5,xmm4 + vpaddd xmm4,xmm4,xmm0 + shld edx,edx,5 + add ecx,esi + vpxor xmm1,xmm1,xmm3 + xor ebp,edi + xor edi,eax + add ecx,edx + add ebx,DWORD [20+esp] + vpsrld xmm3,xmm1,30 + vmovdqa [esp],xmm4 + and ebp,edi + xor edi,eax + shrd edx,edx,7 + mov esi,ecx + vpslld xmm1,xmm1,2 + xor ebp,edi + shld ecx,ecx,5 + add ebx,ebp + xor esi,edx + xor edx,edi + add ebx,ecx + add eax,DWORD [24+esp] + and esi,edx + vpor xmm1,xmm1,xmm3 + xor edx,edi + shrd ecx,ecx,7 + vmovdqa xmm3,[96+esp] + mov ebp,ebx + xor esi,edx + shld ebx,ebx,5 + add eax,esi + xor ebp,ecx + xor ecx,edx + add eax,ebx + add edi,DWORD [28+esp] + and ebp,ecx + xor ecx,edx + shrd ebx,ebx,7 + mov esi,eax + xor ebp,ecx + shld eax,eax,5 + add edi,ebp + xor esi,ebx + xor ebx,ecx + add edi,eax + vpalignr xmm4,xmm1,xmm0,8 + vpxor xmm2,xmm2,xmm6 + add edx,DWORD [32+esp] + and esi,ebx + xor ebx,ecx + shrd eax,eax,7 + vpxor xmm2,xmm2,xmm3 + vmovdqa [96+esp],xmm6 + mov ebp,edi + xor esi,ebx + vmovdqa xmm6,xmm5 + vpaddd xmm5,xmm5,xmm1 + shld edi,edi,5 + add edx,esi + vpxor xmm2,xmm2,xmm4 + xor ebp,eax + xor eax,ebx + add edx,edi + add ecx,DWORD [36+esp] + vpsrld xmm4,xmm2,30 + vmovdqa [16+esp],xmm5 + and ebp,eax + xor eax,ebx + shrd edi,edi,7 + mov esi,edx + vpslld xmm2,xmm2,2 + xor ebp,eax + shld edx,edx,5 + add ecx,ebp + xor esi,edi + xor edi,eax + add ecx,edx + add ebx,DWORD [40+esp] + and esi,edi + vpor xmm2,xmm2,xmm4 + xor edi,eax + shrd edx,edx,7 + vmovdqa xmm4,[64+esp] + mov ebp,ecx + xor esi,edi + shld ecx,ecx,5 + add ebx,esi + xor ebp,edx + xor edx,edi + add ebx,ecx + add eax,DWORD [44+esp] + and ebp,edx + xor edx,edi + shrd ecx,ecx,7 + mov esi,ebx + xor ebp,edx + shld ebx,ebx,5 + add eax,ebp + xor esi,edx + add eax,ebx + vpalignr xmm5,xmm2,xmm1,8 + vpxor xmm3,xmm3,xmm7 + add edi,DWORD [48+esp] + xor esi,ecx + mov ebp,eax + shld eax,eax,5 + vpxor xmm3,xmm3,xmm4 + vmovdqa [64+esp],xmm7 + add edi,esi + xor ebp,ecx + vmovdqa xmm7,xmm6 + vpaddd xmm6,xmm6,xmm2 + shrd ebx,ebx,7 + add edi,eax + vpxor xmm3,xmm3,xmm5 + add edx,DWORD [52+esp] + xor ebp,ebx + mov esi,edi + shld edi,edi,5 + vpsrld xmm5,xmm3,30 + vmovdqa [32+esp],xmm6 + add edx,ebp + xor esi,ebx + shrd eax,eax,7 + add edx,edi + vpslld xmm3,xmm3,2 + add ecx,DWORD [56+esp] + xor esi,eax + mov ebp,edx + shld edx,edx,5 + add ecx,esi + xor ebp,eax + shrd edi,edi,7 + add ecx,edx + vpor xmm3,xmm3,xmm5 + add ebx,DWORD [60+esp] + xor ebp,edi + mov esi,ecx + shld ecx,ecx,5 + add ebx,ebp + xor esi,edi + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD [esp] + 
vpaddd xmm7,xmm7,xmm3 + xor esi,edx + mov ebp,ebx + shld ebx,ebx,5 + add eax,esi + vmovdqa [48+esp],xmm7 + xor ebp,edx + shrd ecx,ecx,7 + add eax,ebx + add edi,DWORD [4+esp] + xor ebp,ecx + mov esi,eax + shld eax,eax,5 + add edi,ebp + xor esi,ecx + shrd ebx,ebx,7 + add edi,eax + add edx,DWORD [8+esp] + xor esi,ebx + mov ebp,edi + shld edi,edi,5 + add edx,esi + xor ebp,ebx + shrd eax,eax,7 + add edx,edi + add ecx,DWORD [12+esp] + xor ebp,eax + mov esi,edx + shld edx,edx,5 + add ecx,ebp + xor esi,eax + shrd edi,edi,7 + add ecx,edx + mov ebp,DWORD [196+esp] + cmp ebp,DWORD [200+esp] + je NEAR L$008done + vmovdqa xmm7,[160+esp] + vmovdqa xmm6,[176+esp] + vmovdqu xmm0,[ebp] + vmovdqu xmm1,[16+ebp] + vmovdqu xmm2,[32+ebp] + vmovdqu xmm3,[48+ebp] + add ebp,64 + vpshufb xmm0,xmm0,xmm6 + mov DWORD [196+esp],ebp + vmovdqa [96+esp],xmm7 + add ebx,DWORD [16+esp] + xor esi,edi + vpshufb xmm1,xmm1,xmm6 + mov ebp,ecx + shld ecx,ecx,5 + vpaddd xmm4,xmm0,xmm7 + add ebx,esi + xor ebp,edi + shrd edx,edx,7 + add ebx,ecx + vmovdqa [esp],xmm4 + add eax,DWORD [20+esp] + xor ebp,edx + mov esi,ebx + shld ebx,ebx,5 + add eax,ebp + xor esi,edx + shrd ecx,ecx,7 + add eax,ebx + add edi,DWORD [24+esp] + xor esi,ecx + mov ebp,eax + shld eax,eax,5 + add edi,esi + xor ebp,ecx + shrd ebx,ebx,7 + add edi,eax + add edx,DWORD [28+esp] + xor ebp,ebx + mov esi,edi + shld edi,edi,5 + add edx,ebp + xor esi,ebx + shrd eax,eax,7 + add edx,edi + add ecx,DWORD [32+esp] + xor esi,eax + vpshufb xmm2,xmm2,xmm6 + mov ebp,edx + shld edx,edx,5 + vpaddd xmm5,xmm1,xmm7 + add ecx,esi + xor ebp,eax + shrd edi,edi,7 + add ecx,edx + vmovdqa [16+esp],xmm5 + add ebx,DWORD [36+esp] + xor ebp,edi + mov esi,ecx + shld ecx,ecx,5 + add ebx,ebp + xor esi,edi + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD [40+esp] + xor esi,edx + mov ebp,ebx + shld ebx,ebx,5 + add eax,esi + xor ebp,edx + shrd ecx,ecx,7 + add eax,ebx + add edi,DWORD [44+esp] + xor ebp,ecx + mov esi,eax + shld eax,eax,5 + add edi,ebp + xor esi,ecx + shrd ebx,ebx,7 + add edi,eax + add edx,DWORD [48+esp] + xor esi,ebx + vpshufb xmm3,xmm3,xmm6 + mov ebp,edi + shld edi,edi,5 + vpaddd xmm6,xmm2,xmm7 + add edx,esi + xor ebp,ebx + shrd eax,eax,7 + add edx,edi + vmovdqa [32+esp],xmm6 + add ecx,DWORD [52+esp] + xor ebp,eax + mov esi,edx + shld edx,edx,5 + add ecx,ebp + xor esi,eax + shrd edi,edi,7 + add ecx,edx + add ebx,DWORD [56+esp] + xor esi,edi + mov ebp,ecx + shld ecx,ecx,5 + add ebx,esi + xor ebp,edi + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD [60+esp] + xor ebp,edx + mov esi,ebx + shld ebx,ebx,5 + add eax,ebp + shrd ecx,ecx,7 + add eax,ebx + mov ebp,DWORD [192+esp] + add eax,DWORD [ebp] + add esi,DWORD [4+ebp] + add ecx,DWORD [8+ebp] + mov DWORD [ebp],eax + add edx,DWORD [12+ebp] + mov DWORD [4+ebp],esi + add edi,DWORD [16+ebp] + mov ebx,ecx + mov DWORD [8+ebp],ecx + xor ebx,edx + mov DWORD [12+ebp],edx + mov DWORD [16+ebp],edi + mov ebp,esi + and esi,ebx + mov ebx,ebp + jmp NEAR L$007loop +align 16 +L$008done: + add ebx,DWORD [16+esp] + xor esi,edi + mov ebp,ecx + shld ecx,ecx,5 + add ebx,esi + xor ebp,edi + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD [20+esp] + xor ebp,edx + mov esi,ebx + shld ebx,ebx,5 + add eax,ebp + xor esi,edx + shrd ecx,ecx,7 + add eax,ebx + add edi,DWORD [24+esp] + xor esi,ecx + mov ebp,eax + shld eax,eax,5 + add edi,esi + xor ebp,ecx + shrd ebx,ebx,7 + add edi,eax + add edx,DWORD [28+esp] + xor ebp,ebx + mov esi,edi + shld edi,edi,5 + add edx,ebp + xor esi,ebx + shrd eax,eax,7 + add edx,edi + add ecx,DWORD [32+esp] + xor esi,eax + mov ebp,edx + shld edx,edx,5 + 
add ecx,esi + xor ebp,eax + shrd edi,edi,7 + add ecx,edx + add ebx,DWORD [36+esp] + xor ebp,edi + mov esi,ecx + shld ecx,ecx,5 + add ebx,ebp + xor esi,edi + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD [40+esp] + xor esi,edx + mov ebp,ebx + shld ebx,ebx,5 + add eax,esi + xor ebp,edx + shrd ecx,ecx,7 + add eax,ebx + add edi,DWORD [44+esp] + xor ebp,ecx + mov esi,eax + shld eax,eax,5 + add edi,ebp + xor esi,ecx + shrd ebx,ebx,7 + add edi,eax + add edx,DWORD [48+esp] + xor esi,ebx + mov ebp,edi + shld edi,edi,5 + add edx,esi + xor ebp,ebx + shrd eax,eax,7 + add edx,edi + add ecx,DWORD [52+esp] + xor ebp,eax + mov esi,edx + shld edx,edx,5 + add ecx,ebp + xor esi,eax + shrd edi,edi,7 + add ecx,edx + add ebx,DWORD [56+esp] + xor esi,edi + mov ebp,ecx + shld ecx,ecx,5 + add ebx,esi + xor ebp,edi + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD [60+esp] + xor ebp,edx + mov esi,ebx + shld ebx,ebx,5 + add eax,ebp + shrd ecx,ecx,7 + add eax,ebx + vzeroall + mov ebp,DWORD [192+esp] + add eax,DWORD [ebp] + mov esp,DWORD [204+esp] + add esi,DWORD [4+ebp] + add ecx,DWORD [8+ebp] + mov DWORD [ebp],eax + add edx,DWORD [12+ebp] + mov DWORD [4+ebp],esi + add edi,DWORD [16+ebp] + mov DWORD [8+ebp],ecx + mov DWORD [12+ebp],edx + mov DWORD [16+ebp],edi + pop edi + pop esi + pop ebx + pop ebp + ret align 64 L$K_XX_XX: dd 1518500249,1518500249,1518500249,1518500249 diff --git a/packager/third_party/boringssl/win-x86/crypto/sha/sha256-586.asm b/packager/third_party/boringssl/win-x86/crypto/sha/sha256-586.asm index fe36bc5c9a..3e7cfcca07 100644 --- a/packager/third_party/boringssl/win-x86/crypto/sha/sha256-586.asm +++ b/packager/third_party/boringssl/win-x86/crypto/sha/sha256-586.asm @@ -49,11 +49,10 @@ L$000pic_point: jz NEAR L$003no_xmm and ecx,1073741824 and ebx,268435968 - test edx,536870912 - jnz NEAR L$004shaext or ecx,ebx and ecx,1342177280 cmp ecx,1342177280 + je NEAR L$004AVX test ebx,512 jnz NEAR L$005SSSE3 L$003no_xmm: @@ -3179,204 +3178,6 @@ L$009grand_loop: pop ebp ret align 32 -L$004shaext: - sub esp,32 - movdqu xmm1,[esi] - lea ebp,[128+ebp] - movdqu xmm2,[16+esi] - movdqa xmm7,[128+ebp] - pshufd xmm0,xmm1,27 - pshufd xmm1,xmm1,177 - pshufd xmm2,xmm2,27 -db 102,15,58,15,202,8 - punpcklqdq xmm2,xmm0 - jmp NEAR L$010loop_shaext -align 16 -L$010loop_shaext: - movdqu xmm3,[edi] - movdqu xmm4,[16+edi] - movdqu xmm5,[32+edi] -db 102,15,56,0,223 - movdqu xmm6,[48+edi] - movdqa [16+esp],xmm2 - movdqa xmm0,[ebp-128] - paddd xmm0,xmm3 -db 102,15,56,0,231 -db 15,56,203,209 - pshufd xmm0,xmm0,14 - nop - movdqa [esp],xmm1 -db 15,56,203,202 - movdqa xmm0,[ebp-112] - paddd xmm0,xmm4 -db 102,15,56,0,239 -db 15,56,203,209 - pshufd xmm0,xmm0,14 - lea edi,[64+edi] -db 15,56,204,220 -db 15,56,203,202 - movdqa xmm0,[ebp-96] - paddd xmm0,xmm5 -db 102,15,56,0,247 -db 15,56,203,209 - pshufd xmm0,xmm0,14 - movdqa xmm7,xmm6 -db 102,15,58,15,253,4 - nop - paddd xmm3,xmm7 -db 15,56,204,229 -db 15,56,203,202 - movdqa xmm0,[ebp-80] - paddd xmm0,xmm6 -db 15,56,205,222 -db 15,56,203,209 - pshufd xmm0,xmm0,14 - movdqa xmm7,xmm3 -db 102,15,58,15,254,4 - nop - paddd xmm4,xmm7 -db 15,56,204,238 -db 15,56,203,202 - movdqa xmm0,[ebp-64] - paddd xmm0,xmm3 -db 15,56,205,227 -db 15,56,203,209 - pshufd xmm0,xmm0,14 - movdqa xmm7,xmm4 -db 102,15,58,15,251,4 - nop - paddd xmm5,xmm7 -db 15,56,204,243 -db 15,56,203,202 - movdqa xmm0,[ebp-48] - paddd xmm0,xmm4 -db 15,56,205,236 -db 15,56,203,209 - pshufd xmm0,xmm0,14 - movdqa xmm7,xmm5 -db 102,15,58,15,252,4 - nop - paddd xmm6,xmm7 -db 15,56,204,220 -db 15,56,203,202 - movdqa xmm0,[ebp-32] - paddd 
xmm0,xmm5 -db 15,56,205,245 -db 15,56,203,209 - pshufd xmm0,xmm0,14 - movdqa xmm7,xmm6 -db 102,15,58,15,253,4 - nop - paddd xmm3,xmm7 -db 15,56,204,229 -db 15,56,203,202 - movdqa xmm0,[ebp-16] - paddd xmm0,xmm6 -db 15,56,205,222 -db 15,56,203,209 - pshufd xmm0,xmm0,14 - movdqa xmm7,xmm3 -db 102,15,58,15,254,4 - nop - paddd xmm4,xmm7 -db 15,56,204,238 -db 15,56,203,202 - movdqa xmm0,[ebp] - paddd xmm0,xmm3 -db 15,56,205,227 -db 15,56,203,209 - pshufd xmm0,xmm0,14 - movdqa xmm7,xmm4 -db 102,15,58,15,251,4 - nop - paddd xmm5,xmm7 -db 15,56,204,243 -db 15,56,203,202 - movdqa xmm0,[16+ebp] - paddd xmm0,xmm4 -db 15,56,205,236 -db 15,56,203,209 - pshufd xmm0,xmm0,14 - movdqa xmm7,xmm5 -db 102,15,58,15,252,4 - nop - paddd xmm6,xmm7 -db 15,56,204,220 -db 15,56,203,202 - movdqa xmm0,[32+ebp] - paddd xmm0,xmm5 -db 15,56,205,245 -db 15,56,203,209 - pshufd xmm0,xmm0,14 - movdqa xmm7,xmm6 -db 102,15,58,15,253,4 - nop - paddd xmm3,xmm7 -db 15,56,204,229 -db 15,56,203,202 - movdqa xmm0,[48+ebp] - paddd xmm0,xmm6 -db 15,56,205,222 -db 15,56,203,209 - pshufd xmm0,xmm0,14 - movdqa xmm7,xmm3 -db 102,15,58,15,254,4 - nop - paddd xmm4,xmm7 -db 15,56,204,238 -db 15,56,203,202 - movdqa xmm0,[64+ebp] - paddd xmm0,xmm3 -db 15,56,205,227 -db 15,56,203,209 - pshufd xmm0,xmm0,14 - movdqa xmm7,xmm4 -db 102,15,58,15,251,4 - nop - paddd xmm5,xmm7 -db 15,56,204,243 -db 15,56,203,202 - movdqa xmm0,[80+ebp] - paddd xmm0,xmm4 -db 15,56,205,236 -db 15,56,203,209 - pshufd xmm0,xmm0,14 - movdqa xmm7,xmm5 -db 102,15,58,15,252,4 -db 15,56,203,202 - paddd xmm6,xmm7 - movdqa xmm0,[96+ebp] - paddd xmm0,xmm5 -db 15,56,203,209 - pshufd xmm0,xmm0,14 -db 15,56,205,245 - movdqa xmm7,[128+ebp] -db 15,56,203,202 - movdqa xmm0,[112+ebp] - paddd xmm0,xmm6 - nop -db 15,56,203,209 - pshufd xmm0,xmm0,14 - cmp eax,edi - nop -db 15,56,203,202 - paddd xmm2,[16+esp] - paddd xmm1,[esp] - jnz NEAR L$010loop_shaext - pshufd xmm2,xmm2,177 - pshufd xmm7,xmm1,27 - pshufd xmm1,xmm1,177 - punpckhqdq xmm1,xmm2 -db 102,15,58,15,215,8 - mov esp,DWORD [44+esp] - movdqu [esi],xmm1 - movdqu [16+esi],xmm2 - pop edi - pop esi - pop ebx - pop ebp - ret -align 32 L$005SSSE3: lea esp,[esp-96] mov eax,DWORD [esi] @@ -3396,9 +3197,9 @@ L$005SSSE3: mov DWORD [24+esp],ecx mov DWORD [28+esp],esi movdqa xmm7,[256+ebp] - jmp NEAR L$011grand_ssse3 + jmp NEAR L$010grand_ssse3 align 16 -L$011grand_ssse3: +L$010grand_ssse3: movdqu xmm0,[edi] movdqu xmm1,[16+edi] movdqu xmm2,[32+edi] @@ -3421,9 +3222,9 @@ db 102,15,56,0,223 paddd xmm7,xmm3 movdqa [64+esp],xmm6 movdqa [80+esp],xmm7 - jmp NEAR L$012ssse3_00_47 + jmp NEAR L$011ssse3_00_47 align 16 -L$012ssse3_00_47: +L$011ssse3_00_47: add ebp,64 mov ecx,edx movdqa xmm4,xmm1 @@ -4066,7 +3867,7 @@ db 102,15,58,15,249,4 add eax,ecx movdqa [80+esp],xmm6 cmp DWORD [64+ebp],66051 - jne NEAR L$012ssse3_00_47 + jne NEAR L$011ssse3_00_47 mov ecx,edx ror edx,14 mov esi,DWORD [20+esp] @@ -4580,12 +4381,1193 @@ db 102,15,58,15,249,4 movdqa xmm7,[64+ebp] sub ebp,192 cmp edi,DWORD [104+esp] - jb NEAR L$011grand_ssse3 + jb NEAR L$010grand_ssse3 mov esp,DWORD [108+esp] pop edi pop esi pop ebx pop ebp ret +align 32 +L$004AVX: + lea esp,[esp-96] + vzeroall + mov eax,DWORD [esi] + mov ebx,DWORD [4+esi] + mov ecx,DWORD [8+esi] + mov edi,DWORD [12+esi] + mov DWORD [4+esp],ebx + xor ebx,ecx + mov DWORD [8+esp],ecx + mov DWORD [12+esp],edi + mov edx,DWORD [16+esi] + mov edi,DWORD [20+esi] + mov ecx,DWORD [24+esi] + mov esi,DWORD [28+esi] + mov DWORD [20+esp],edi + mov edi,DWORD [100+esp] + mov DWORD [24+esp],ecx + mov DWORD [28+esp],esi + vmovdqa 
xmm7,[256+ebp] + jmp NEAR L$012grand_avx +align 32 +L$012grand_avx: + vmovdqu xmm0,[edi] + vmovdqu xmm1,[16+edi] + vmovdqu xmm2,[32+edi] + vmovdqu xmm3,[48+edi] + add edi,64 + vpshufb xmm0,xmm0,xmm7 + mov DWORD [100+esp],edi + vpshufb xmm1,xmm1,xmm7 + vpshufb xmm2,xmm2,xmm7 + vpaddd xmm4,xmm0,[ebp] + vpshufb xmm3,xmm3,xmm7 + vpaddd xmm5,xmm1,[16+ebp] + vpaddd xmm6,xmm2,[32+ebp] + vpaddd xmm7,xmm3,[48+ebp] + vmovdqa [32+esp],xmm4 + vmovdqa [48+esp],xmm5 + vmovdqa [64+esp],xmm6 + vmovdqa [80+esp],xmm7 + jmp NEAR L$013avx_00_47 +align 16 +L$013avx_00_47: + add ebp,64 + vpalignr xmm4,xmm1,xmm0,4 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [20+esp] + vpalignr xmm7,xmm3,xmm2,4 + xor edx,ecx + mov edi,DWORD [24+esp] + xor esi,edi + vpsrld xmm6,xmm4,7 + shrd edx,edx,5 + and esi,ecx + mov DWORD [16+esp],ecx + vpaddd xmm0,xmm0,xmm7 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrld xmm7,xmm4,3 + mov ecx,eax + add edx,edi + mov edi,DWORD [4+esp] + vpslld xmm5,xmm4,14 + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [esp],eax + vpxor xmm4,xmm7,xmm6 + xor ecx,eax + xor eax,edi + add edx,DWORD [28+esp] + vpshufd xmm7,xmm3,250 + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + vpsrld xmm6,xmm6,11 + add edx,DWORD [32+esp] + xor ebx,edi + shrd ecx,ecx,2 + vpxor xmm4,xmm4,xmm5 + add ebx,edx + add edx,DWORD [12+esp] + add ebx,ecx + vpslld xmm5,xmm5,11 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [16+esp] + vpxor xmm4,xmm4,xmm6 + xor edx,ecx + mov edi,DWORD [20+esp] + xor esi,edi + vpsrld xmm6,xmm7,10 + shrd edx,edx,5 + and esi,ecx + mov DWORD [12+esp],ecx + vpxor xmm4,xmm4,xmm5 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrlq xmm5,xmm7,17 + mov ecx,ebx + add edx,edi + mov edi,DWORD [esp] + vpaddd xmm0,xmm0,xmm4 + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [28+esp],ebx + vpxor xmm6,xmm6,xmm5 + xor ecx,ebx + xor ebx,edi + add edx,DWORD [24+esp] + vpsrlq xmm7,xmm7,19 + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + vpxor xmm6,xmm6,xmm7 + add edx,DWORD [36+esp] + xor eax,edi + shrd ecx,ecx,2 + vpshufd xmm7,xmm6,132 + add eax,edx + add edx,DWORD [8+esp] + add eax,ecx + vpsrldq xmm7,xmm7,8 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [12+esp] + vpaddd xmm0,xmm0,xmm7 + xor edx,ecx + mov edi,DWORD [16+esp] + xor esi,edi + vpshufd xmm7,xmm0,80 + shrd edx,edx,5 + and esi,ecx + mov DWORD [8+esp],ecx + vpsrld xmm6,xmm7,10 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrlq xmm5,xmm7,17 + mov ecx,eax + add edx,edi + mov edi,DWORD [28+esp] + vpxor xmm6,xmm6,xmm5 + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [24+esp],eax + vpsrlq xmm7,xmm7,19 + xor ecx,eax + xor eax,edi + add edx,DWORD [20+esp] + vpxor xmm6,xmm6,xmm7 + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + vpshufd xmm7,xmm6,232 + add edx,DWORD [40+esp] + xor ebx,edi + shrd ecx,ecx,2 + vpslldq xmm7,xmm7,8 + add ebx,edx + add edx,DWORD [4+esp] + add ebx,ecx + vpaddd xmm0,xmm0,xmm7 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [8+esp] + vpaddd xmm6,xmm0,[ebp] + xor edx,ecx + mov edi,DWORD [12+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [4+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [24+esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [20+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [16+esp] + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [44+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [esp] + add eax,ecx + vmovdqa [32+esp],xmm6 + vpalignr xmm4,xmm2,xmm1,4 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [4+esp] + vpalignr xmm7,xmm0,xmm3,4 + xor 
edx,ecx + mov edi,DWORD [8+esp] + xor esi,edi + vpsrld xmm6,xmm4,7 + shrd edx,edx,5 + and esi,ecx + mov DWORD [esp],ecx + vpaddd xmm1,xmm1,xmm7 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrld xmm7,xmm4,3 + mov ecx,eax + add edx,edi + mov edi,DWORD [20+esp] + vpslld xmm5,xmm4,14 + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [16+esp],eax + vpxor xmm4,xmm7,xmm6 + xor ecx,eax + xor eax,edi + add edx,DWORD [12+esp] + vpshufd xmm7,xmm0,250 + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + vpsrld xmm6,xmm6,11 + add edx,DWORD [48+esp] + xor ebx,edi + shrd ecx,ecx,2 + vpxor xmm4,xmm4,xmm5 + add ebx,edx + add edx,DWORD [28+esp] + add ebx,ecx + vpslld xmm5,xmm5,11 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [esp] + vpxor xmm4,xmm4,xmm6 + xor edx,ecx + mov edi,DWORD [4+esp] + xor esi,edi + vpsrld xmm6,xmm7,10 + shrd edx,edx,5 + and esi,ecx + mov DWORD [28+esp],ecx + vpxor xmm4,xmm4,xmm5 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrlq xmm5,xmm7,17 + mov ecx,ebx + add edx,edi + mov edi,DWORD [16+esp] + vpaddd xmm1,xmm1,xmm4 + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [12+esp],ebx + vpxor xmm6,xmm6,xmm5 + xor ecx,ebx + xor ebx,edi + add edx,DWORD [8+esp] + vpsrlq xmm7,xmm7,19 + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + vpxor xmm6,xmm6,xmm7 + add edx,DWORD [52+esp] + xor eax,edi + shrd ecx,ecx,2 + vpshufd xmm7,xmm6,132 + add eax,edx + add edx,DWORD [24+esp] + add eax,ecx + vpsrldq xmm7,xmm7,8 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [28+esp] + vpaddd xmm1,xmm1,xmm7 + xor edx,ecx + mov edi,DWORD [esp] + xor esi,edi + vpshufd xmm7,xmm1,80 + shrd edx,edx,5 + and esi,ecx + mov DWORD [24+esp],ecx + vpsrld xmm6,xmm7,10 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrlq xmm5,xmm7,17 + mov ecx,eax + add edx,edi + mov edi,DWORD [12+esp] + vpxor xmm6,xmm6,xmm5 + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [8+esp],eax + vpsrlq xmm7,xmm7,19 + xor ecx,eax + xor eax,edi + add edx,DWORD [4+esp] + vpxor xmm6,xmm6,xmm7 + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + vpshufd xmm7,xmm6,232 + add edx,DWORD [56+esp] + xor ebx,edi + shrd ecx,ecx,2 + vpslldq xmm7,xmm7,8 + add ebx,edx + add edx,DWORD [20+esp] + add ebx,ecx + vpaddd xmm1,xmm1,xmm7 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [24+esp] + vpaddd xmm6,xmm1,[16+ebp] + xor edx,ecx + mov edi,DWORD [28+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [20+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [8+esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [4+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [esp] + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [60+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [16+esp] + add eax,ecx + vmovdqa [48+esp],xmm6 + vpalignr xmm4,xmm3,xmm2,4 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [20+esp] + vpalignr xmm7,xmm1,xmm0,4 + xor edx,ecx + mov edi,DWORD [24+esp] + xor esi,edi + vpsrld xmm6,xmm4,7 + shrd edx,edx,5 + and esi,ecx + mov DWORD [16+esp],ecx + vpaddd xmm2,xmm2,xmm7 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrld xmm7,xmm4,3 + mov ecx,eax + add edx,edi + mov edi,DWORD [4+esp] + vpslld xmm5,xmm4,14 + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [esp],eax + vpxor xmm4,xmm7,xmm6 + xor ecx,eax + xor eax,edi + add edx,DWORD [28+esp] + vpshufd xmm7,xmm1,250 + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + vpsrld xmm6,xmm6,11 + add edx,DWORD [64+esp] + xor ebx,edi + shrd ecx,ecx,2 + vpxor xmm4,xmm4,xmm5 + add ebx,edx + add edx,DWORD [12+esp] + add ebx,ecx + vpslld xmm5,xmm5,11 + mov ecx,edx + shrd edx,edx,14 
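+ ; The AVX rounds here follow the same schedule/compression interleaving as
+ ; the SSSE3 path above: vpalignr/vpsrld/vpslld/vpxor compute the sigma0 and
+ ; sigma1 message-schedule terms of W[t] in xmm registers, while the scalar
+ ; code's successive shrd rotations (14, 5, 6) realize Sigma1 and (9, 11, 2)
+ ; realize Sigma0 on the 32-bit working variables.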
+ mov esi,DWORD [16+esp] + vpxor xmm4,xmm4,xmm6 + xor edx,ecx + mov edi,DWORD [20+esp] + xor esi,edi + vpsrld xmm6,xmm7,10 + shrd edx,edx,5 + and esi,ecx + mov DWORD [12+esp],ecx + vpxor xmm4,xmm4,xmm5 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrlq xmm5,xmm7,17 + mov ecx,ebx + add edx,edi + mov edi,DWORD [esp] + vpaddd xmm2,xmm2,xmm4 + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [28+esp],ebx + vpxor xmm6,xmm6,xmm5 + xor ecx,ebx + xor ebx,edi + add edx,DWORD [24+esp] + vpsrlq xmm7,xmm7,19 + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + vpxor xmm6,xmm6,xmm7 + add edx,DWORD [68+esp] + xor eax,edi + shrd ecx,ecx,2 + vpshufd xmm7,xmm6,132 + add eax,edx + add edx,DWORD [8+esp] + add eax,ecx + vpsrldq xmm7,xmm7,8 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [12+esp] + vpaddd xmm2,xmm2,xmm7 + xor edx,ecx + mov edi,DWORD [16+esp] + xor esi,edi + vpshufd xmm7,xmm2,80 + shrd edx,edx,5 + and esi,ecx + mov DWORD [8+esp],ecx + vpsrld xmm6,xmm7,10 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrlq xmm5,xmm7,17 + mov ecx,eax + add edx,edi + mov edi,DWORD [28+esp] + vpxor xmm6,xmm6,xmm5 + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [24+esp],eax + vpsrlq xmm7,xmm7,19 + xor ecx,eax + xor eax,edi + add edx,DWORD [20+esp] + vpxor xmm6,xmm6,xmm7 + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + vpshufd xmm7,xmm6,232 + add edx,DWORD [72+esp] + xor ebx,edi + shrd ecx,ecx,2 + vpslldq xmm7,xmm7,8 + add ebx,edx + add edx,DWORD [4+esp] + add ebx,ecx + vpaddd xmm2,xmm2,xmm7 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [8+esp] + vpaddd xmm6,xmm2,[32+ebp] + xor edx,ecx + mov edi,DWORD [12+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [4+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [24+esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [20+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [16+esp] + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [76+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [esp] + add eax,ecx + vmovdqa [64+esp],xmm6 + vpalignr xmm4,xmm0,xmm3,4 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [4+esp] + vpalignr xmm7,xmm2,xmm1,4 + xor edx,ecx + mov edi,DWORD [8+esp] + xor esi,edi + vpsrld xmm6,xmm4,7 + shrd edx,edx,5 + and esi,ecx + mov DWORD [esp],ecx + vpaddd xmm3,xmm3,xmm7 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrld xmm7,xmm4,3 + mov ecx,eax + add edx,edi + mov edi,DWORD [20+esp] + vpslld xmm5,xmm4,14 + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [16+esp],eax + vpxor xmm4,xmm7,xmm6 + xor ecx,eax + xor eax,edi + add edx,DWORD [12+esp] + vpshufd xmm7,xmm2,250 + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + vpsrld xmm6,xmm6,11 + add edx,DWORD [80+esp] + xor ebx,edi + shrd ecx,ecx,2 + vpxor xmm4,xmm4,xmm5 + add ebx,edx + add edx,DWORD [28+esp] + add ebx,ecx + vpslld xmm5,xmm5,11 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [esp] + vpxor xmm4,xmm4,xmm6 + xor edx,ecx + mov edi,DWORD [4+esp] + xor esi,edi + vpsrld xmm6,xmm7,10 + shrd edx,edx,5 + and esi,ecx + mov DWORD [28+esp],ecx + vpxor xmm4,xmm4,xmm5 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrlq xmm5,xmm7,17 + mov ecx,ebx + add edx,edi + mov edi,DWORD [16+esp] + vpaddd xmm3,xmm3,xmm4 + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [12+esp],ebx + vpxor xmm6,xmm6,xmm5 + xor ecx,ebx + xor ebx,edi + add edx,DWORD [8+esp] + vpsrlq xmm7,xmm7,19 + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + vpxor xmm6,xmm6,xmm7 + add edx,DWORD [84+esp] + xor eax,edi + shrd ecx,ecx,2 + vpshufd xmm7,xmm6,132 + add eax,edx + add edx,DWORD [24+esp] + add 
eax,ecx + vpsrldq xmm7,xmm7,8 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [28+esp] + vpaddd xmm3,xmm3,xmm7 + xor edx,ecx + mov edi,DWORD [esp] + xor esi,edi + vpshufd xmm7,xmm3,80 + shrd edx,edx,5 + and esi,ecx + mov DWORD [24+esp],ecx + vpsrld xmm6,xmm7,10 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrlq xmm5,xmm7,17 + mov ecx,eax + add edx,edi + mov edi,DWORD [12+esp] + vpxor xmm6,xmm6,xmm5 + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [8+esp],eax + vpsrlq xmm7,xmm7,19 + xor ecx,eax + xor eax,edi + add edx,DWORD [4+esp] + vpxor xmm6,xmm6,xmm7 + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + vpshufd xmm7,xmm6,232 + add edx,DWORD [88+esp] + xor ebx,edi + shrd ecx,ecx,2 + vpslldq xmm7,xmm7,8 + add ebx,edx + add edx,DWORD [20+esp] + add ebx,ecx + vpaddd xmm3,xmm3,xmm7 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [24+esp] + vpaddd xmm6,xmm3,[48+ebp] + xor edx,ecx + mov edi,DWORD [28+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [20+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [8+esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [4+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [esp] + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [92+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [16+esp] + add eax,ecx + vmovdqa [80+esp],xmm6 + cmp DWORD [64+ebp],66051 + jne NEAR L$013avx_00_47 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [20+esp] + xor edx,ecx + mov edi,DWORD [24+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [16+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [4+esp] + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [esp],eax + xor ecx,eax + xor eax,edi + add edx,DWORD [28+esp] + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + add edx,DWORD [32+esp] + xor ebx,edi + shrd ecx,ecx,2 + add ebx,edx + add edx,DWORD [12+esp] + add ebx,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [16+esp] + xor edx,ecx + mov edi,DWORD [20+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [12+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [28+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [24+esp] + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [36+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [8+esp] + add eax,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [12+esp] + xor edx,ecx + mov edi,DWORD [16+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [8+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [28+esp] + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [24+esp],eax + xor ecx,eax + xor eax,edi + add edx,DWORD [20+esp] + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + add edx,DWORD [40+esp] + xor ebx,edi + shrd ecx,ecx,2 + add ebx,edx + add edx,DWORD [4+esp] + add ebx,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [8+esp] + xor edx,ecx + mov edi,DWORD [12+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [4+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [24+esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [20+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [16+esp] + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [44+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [esp] + add eax,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD 
[4+esp] + xor edx,ecx + mov edi,DWORD [8+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [20+esp] + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [16+esp],eax + xor ecx,eax + xor eax,edi + add edx,DWORD [12+esp] + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + add edx,DWORD [48+esp] + xor ebx,edi + shrd ecx,ecx,2 + add ebx,edx + add edx,DWORD [28+esp] + add ebx,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [esp] + xor edx,ecx + mov edi,DWORD [4+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [28+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [16+esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [12+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [8+esp] + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [52+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [24+esp] + add eax,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [28+esp] + xor edx,ecx + mov edi,DWORD [esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [24+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [12+esp] + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [8+esp],eax + xor ecx,eax + xor eax,edi + add edx,DWORD [4+esp] + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + add edx,DWORD [56+esp] + xor ebx,edi + shrd ecx,ecx,2 + add ebx,edx + add edx,DWORD [20+esp] + add ebx,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [24+esp] + xor edx,ecx + mov edi,DWORD [28+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [20+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [8+esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [4+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [esp] + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [60+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [16+esp] + add eax,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [20+esp] + xor edx,ecx + mov edi,DWORD [24+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [16+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [4+esp] + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [esp],eax + xor ecx,eax + xor eax,edi + add edx,DWORD [28+esp] + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + add edx,DWORD [64+esp] + xor ebx,edi + shrd ecx,ecx,2 + add ebx,edx + add edx,DWORD [12+esp] + add ebx,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [16+esp] + xor edx,ecx + mov edi,DWORD [20+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [12+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [28+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [24+esp] + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [68+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [8+esp] + add eax,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [12+esp] + xor edx,ecx + mov edi,DWORD [16+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [8+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [28+esp] + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [24+esp],eax + xor ecx,eax + xor eax,edi + add edx,DWORD [20+esp] + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + add edx,DWORD [72+esp] + xor ebx,edi + 
shrd ecx,ecx,2 + add ebx,edx + add edx,DWORD [4+esp] + add ebx,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [8+esp] + xor edx,ecx + mov edi,DWORD [12+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [4+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [24+esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [20+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [16+esp] + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [76+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [esp] + add eax,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [4+esp] + xor edx,ecx + mov edi,DWORD [8+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [20+esp] + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [16+esp],eax + xor ecx,eax + xor eax,edi + add edx,DWORD [12+esp] + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + add edx,DWORD [80+esp] + xor ebx,edi + shrd ecx,ecx,2 + add ebx,edx + add edx,DWORD [28+esp] + add ebx,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [esp] + xor edx,ecx + mov edi,DWORD [4+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [28+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [16+esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [12+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [8+esp] + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [84+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [24+esp] + add eax,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [28+esp] + xor edx,ecx + mov edi,DWORD [esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [24+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [12+esp] + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [8+esp],eax + xor ecx,eax + xor eax,edi + add edx,DWORD [4+esp] + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + add edx,DWORD [88+esp] + xor ebx,edi + shrd ecx,ecx,2 + add ebx,edx + add edx,DWORD [20+esp] + add ebx,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [24+esp] + xor edx,ecx + mov edi,DWORD [28+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [20+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [8+esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [4+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [esp] + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [92+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [16+esp] + add eax,ecx + mov esi,DWORD [96+esp] + xor ebx,edi + mov ecx,DWORD [12+esp] + add eax,DWORD [esi] + add ebx,DWORD [4+esi] + add edi,DWORD [8+esi] + add ecx,DWORD [12+esi] + mov DWORD [esi],eax + mov DWORD [4+esi],ebx + mov DWORD [8+esi],edi + mov DWORD [12+esi],ecx + mov DWORD [4+esp],ebx + xor ebx,edi + mov DWORD [8+esp],edi + mov DWORD [12+esp],ecx + mov edi,DWORD [20+esp] + mov ecx,DWORD [24+esp] + add edx,DWORD [16+esi] + add edi,DWORD [20+esi] + add ecx,DWORD [24+esi] + mov DWORD [16+esi],edx + mov DWORD [20+esi],edi + mov DWORD [20+esp],edi + mov edi,DWORD [28+esp] + mov DWORD [24+esi],ecx + add edi,DWORD [28+esi] + mov DWORD [24+esp],ecx + mov DWORD [28+esi],edi + mov DWORD [28+esp],edi + mov edi,DWORD [100+esp] + vmovdqa xmm7,[64+ebp] + sub ebp,192 + cmp edi,DWORD [104+esp] + jb NEAR L$012grand_avx + mov esp,DWORD [108+esp] + vzeroall + pop edi + pop esi + pop ebx 
+ pop ebp + ret segment .bss common _OPENSSL_ia32cap_P 16 diff --git a/packager/third_party/boringssl/win-x86_64/crypto/bn/rsaz-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/bn/rsaz-x86_64.asm index 04d5e3915a..72ec505289 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/bn/rsaz-x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/bn/rsaz-x86_64.asm @@ -504,48 +504,104 @@ $L$SEH_begin_rsaz_512_mul_gather4: push r14 push r15 - mov r9d,r9d - sub rsp,128+24 + sub rsp,328 + movaps XMMWORD[160+rsp],xmm6 + movaps XMMWORD[176+rsp],xmm7 + movaps XMMWORD[192+rsp],xmm8 + movaps XMMWORD[208+rsp],xmm9 + movaps XMMWORD[224+rsp],xmm10 + movaps XMMWORD[240+rsp],xmm11 + movaps XMMWORD[256+rsp],xmm12 + movaps XMMWORD[272+rsp],xmm13 + movaps XMMWORD[288+rsp],xmm14 + movaps XMMWORD[304+rsp],xmm15 $L$mul_gather4_body: - mov eax,DWORD[64+r9*4+rdx] -DB 102,72,15,110,199 - mov ebx,DWORD[r9*4+rdx] -DB 102,72,15,110,201 - mov QWORD[128+rsp],r8 + movd xmm8,r9d + movdqa xmm1,XMMWORD[(($L$inc+16))] + movdqa xmm0,XMMWORD[$L$inc] + + pshufd xmm8,xmm8,0 + movdqa xmm7,xmm1 + movdqa xmm2,xmm1 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm8 + movdqa xmm3,xmm7 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm8 + movdqa xmm4,xmm7 + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm8 + movdqa xmm5,xmm7 + paddd xmm4,xmm3 + pcmpeqd xmm3,xmm8 + movdqa xmm6,xmm7 + paddd xmm5,xmm4 + pcmpeqd xmm4,xmm8 + paddd xmm6,xmm5 + pcmpeqd xmm5,xmm8 + paddd xmm7,xmm6 + pcmpeqd xmm6,xmm8 + pcmpeqd xmm7,xmm8 + + movdqa xmm8,XMMWORD[rdx] + movdqa xmm9,XMMWORD[16+rdx] + movdqa xmm10,XMMWORD[32+rdx] + movdqa xmm11,XMMWORD[48+rdx] + pand xmm8,xmm0 + movdqa xmm12,XMMWORD[64+rdx] + pand xmm9,xmm1 + movdqa xmm13,XMMWORD[80+rdx] + pand xmm10,xmm2 + movdqa xmm14,XMMWORD[96+rdx] + pand xmm11,xmm3 + movdqa xmm15,XMMWORD[112+rdx] + lea rbp,[128+rdx] + pand xmm12,xmm4 + pand xmm13,xmm5 + pand xmm14,xmm6 + pand xmm15,xmm7 + por xmm8,xmm10 + por xmm9,xmm11 + por xmm8,xmm12 + por xmm9,xmm13 + por xmm8,xmm14 + por xmm9,xmm15 + + por xmm8,xmm9 + pshufd xmm9,xmm8,0x4e + por xmm8,xmm9 +DB 102,76,15,126,195 + + mov QWORD[128+rsp],r8 + mov QWORD[((128+8))+rsp],rdi + mov QWORD[((128+16))+rsp],rcx - shl rax,32 - or rbx,rax mov rax,QWORD[rsi] mov rcx,QWORD[8+rsi] - lea rbp,[128+r9*4+rdx] mul rbx mov QWORD[rsp],rax mov rax,rcx mov r8,rdx mul rbx - movd xmm4,DWORD[rbp] add r8,rax mov rax,QWORD[16+rsi] mov r9,rdx adc r9,0 mul rbx - movd xmm5,DWORD[64+rbp] add r9,rax mov rax,QWORD[24+rsi] mov r10,rdx adc r10,0 mul rbx - pslldq xmm5,4 add r10,rax mov rax,QWORD[32+rsi] mov r11,rdx adc r11,0 mul rbx - por xmm4,xmm5 add r11,rax mov rax,QWORD[40+rsi] mov r12,rdx @@ -558,14 +614,12 @@ DB 102,72,15,110,201 adc r13,0 mul rbx - lea rbp,[128+rbp] add r13,rax mov rax,QWORD[56+rsi] mov r14,rdx adc r14,0 mul rbx -DB 102,72,15,126,227 add r14,rax mov rax,QWORD[rsi] mov r15,rdx @@ -577,6 +631,35 @@ DB 102,72,15,126,227 ALIGN 32 $L$oop_mul_gather: + movdqa xmm8,XMMWORD[rbp] + movdqa xmm9,XMMWORD[16+rbp] + movdqa xmm10,XMMWORD[32+rbp] + movdqa xmm11,XMMWORD[48+rbp] + pand xmm8,xmm0 + movdqa xmm12,XMMWORD[64+rbp] + pand xmm9,xmm1 + movdqa xmm13,XMMWORD[80+rbp] + pand xmm10,xmm2 + movdqa xmm14,XMMWORD[96+rbp] + pand xmm11,xmm3 + movdqa xmm15,XMMWORD[112+rbp] + lea rbp,[128+rbp] + pand xmm12,xmm4 + pand xmm13,xmm5 + pand xmm14,xmm6 + pand xmm15,xmm7 + por xmm8,xmm10 + por xmm9,xmm11 + por xmm8,xmm12 + por xmm9,xmm13 + por xmm8,xmm14 + por xmm9,xmm15 + + por xmm8,xmm9 + pshufd xmm9,xmm8,0x4e + por xmm8,xmm9 +DB 102,76,15,126,195 + mul rbx add r8,rax mov rax,QWORD[8+rsi] @@ -585,7 +668,6 @@ 
$L$oop_mul_gather: adc r8,0 mul rbx - movd xmm4,DWORD[rbp] add r9,rax mov rax,QWORD[16+rsi] adc rdx,0 @@ -594,7 +676,6 @@ $L$oop_mul_gather: adc r9,0 mul rbx - movd xmm5,DWORD[64+rbp] add r10,rax mov rax,QWORD[24+rsi] adc rdx,0 @@ -603,7 +684,6 @@ $L$oop_mul_gather: adc r10,0 mul rbx - pslldq xmm5,4 add r11,rax mov rax,QWORD[32+rsi] adc rdx,0 @@ -612,7 +692,6 @@ $L$oop_mul_gather: adc r11,0 mul rbx - por xmm4,xmm5 add r12,rax mov rax,QWORD[40+rsi] adc rdx,0 @@ -637,7 +716,6 @@ $L$oop_mul_gather: adc r14,0 mul rbx -DB 102,72,15,126,227 add r15,rax mov rax,QWORD[rsi] adc rdx,0 @@ -645,7 +723,6 @@ DB 102,72,15,126,227 mov r15,rdx adc r15,0 - lea rbp,[128+rbp] lea rdi,[8+rdi] dec ecx @@ -660,8 +737,8 @@ DB 102,72,15,126,227 mov QWORD[48+rdi],r14 mov QWORD[56+rdi],r15 -DB 102,72,15,126,199 -DB 102,72,15,126,205 + mov rdi,QWORD[((128+8))+rsp] + mov rbp,QWORD[((128+16))+rsp] mov r8,QWORD[rsp] mov r9,QWORD[8+rsp] @@ -686,6 +763,17 @@ DB 102,72,15,126,205 call __rsaz_512_subtract lea rax,[((128+24+48))+rsp] + movaps xmm6,XMMWORD[((160-200))+rax] + movaps xmm7,XMMWORD[((176-200))+rax] + movaps xmm8,XMMWORD[((192-200))+rax] + movaps xmm9,XMMWORD[((208-200))+rax] + movaps xmm10,XMMWORD[((224-200))+rax] + movaps xmm11,XMMWORD[((240-200))+rax] + movaps xmm12,XMMWORD[((256-200))+rax] + movaps xmm13,XMMWORD[((272-200))+rax] + movaps xmm14,XMMWORD[((288-200))+rax] + movaps xmm15,XMMWORD[((304-200))+rax] + lea rax,[176+rax] mov r15,QWORD[((-48))+rax] mov r14,QWORD[((-40))+rax] mov r13,QWORD[((-32))+rax] @@ -724,7 +812,7 @@ $L$SEH_begin_rsaz_512_mul_scatter4: mov r9d,r9d sub rsp,128+24 $L$mul_scatter4_body: - lea r8,[r9*4+r8] + lea r8,[r9*8+r8] DB 102,72,15,110,199 DB 102,72,15,110,202 DB 102,73,15,110,208 @@ -760,30 +848,14 @@ DB 102,72,15,126,214 call __rsaz_512_subtract - mov DWORD[rsi],r8d - shr r8,32 - mov DWORD[128+rsi],r9d - shr r9,32 - mov DWORD[256+rsi],r10d - shr r10,32 - mov DWORD[384+rsi],r11d - shr r11,32 - mov DWORD[512+rsi],r12d - shr r12,32 - mov DWORD[640+rsi],r13d - shr r13,32 - mov DWORD[768+rsi],r14d - shr r14,32 - mov DWORD[896+rsi],r15d - shr r15,32 - mov DWORD[64+rsi],r8d - mov DWORD[192+rsi],r9d - mov DWORD[320+rsi],r10d - mov DWORD[448+rsi],r11d - mov DWORD[576+rsi],r12d - mov DWORD[704+rsi],r13d - mov DWORD[832+rsi],r14d - mov DWORD[960+rsi],r15d + mov QWORD[rsi],r8 + mov QWORD[128+rsi],r9 + mov QWORD[256+rsi],r10 + mov QWORD[384+rsi],r11 + mov QWORD[512+rsi],r12 + mov QWORD[640+rsi],r13 + mov QWORD[768+rsi],r14 + mov QWORD[896+rsi],r15 lea rax,[((128+24+48))+rsp] mov r15,QWORD[((-48))+rax] @@ -1150,16 +1222,14 @@ global rsaz_512_scatter4 ALIGN 16 rsaz_512_scatter4: - lea rcx,[r8*4+rcx] + lea rcx,[r8*8+rcx] mov r9d,8 jmp NEAR $L$oop_scatter ALIGN 16 $L$oop_scatter: mov rax,QWORD[rdx] lea rdx,[8+rdx] - mov DWORD[rcx],eax - shr rax,32 - mov DWORD[64+rcx],eax + mov QWORD[rcx],rax lea rcx,[128+rcx] dec r9d jnz NEAR $L$oop_scatter @@ -1170,22 +1240,98 @@ global rsaz_512_gather4 ALIGN 16 rsaz_512_gather4: - lea rdx,[r8*4+rdx] +$L$SEH_begin_rsaz_512_gather4: +DB 0x48,0x81,0xec,0xa8,0x00,0x00,0x00 +DB 0x0f,0x29,0x34,0x24 +DB 0x0f,0x29,0x7c,0x24,0x10 +DB 0x44,0x0f,0x29,0x44,0x24,0x20 +DB 0x44,0x0f,0x29,0x4c,0x24,0x30 +DB 0x44,0x0f,0x29,0x54,0x24,0x40 +DB 0x44,0x0f,0x29,0x5c,0x24,0x50 +DB 0x44,0x0f,0x29,0x64,0x24,0x60 +DB 0x44,0x0f,0x29,0x6c,0x24,0x70 +DB 0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0 +DB 0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0 + movd xmm8,r8d + movdqa xmm1,XMMWORD[(($L$inc+16))] + movdqa xmm0,XMMWORD[$L$inc] + + pshufd xmm8,xmm8,0 + movdqa xmm7,xmm1 + movdqa xmm2,xmm1 + paddd xmm1,xmm0 + 
pcmpeqd xmm0,xmm8 + movdqa xmm3,xmm7 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm8 + movdqa xmm4,xmm7 + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm8 + movdqa xmm5,xmm7 + paddd xmm4,xmm3 + pcmpeqd xmm3,xmm8 + movdqa xmm6,xmm7 + paddd xmm5,xmm4 + pcmpeqd xmm4,xmm8 + paddd xmm6,xmm5 + pcmpeqd xmm5,xmm8 + paddd xmm7,xmm6 + pcmpeqd xmm6,xmm8 + pcmpeqd xmm7,xmm8 mov r9d,8 jmp NEAR $L$oop_gather ALIGN 16 $L$oop_gather: - mov eax,DWORD[rdx] - mov r8d,DWORD[64+rdx] + movdqa xmm8,XMMWORD[rdx] + movdqa xmm9,XMMWORD[16+rdx] + movdqa xmm10,XMMWORD[32+rdx] + movdqa xmm11,XMMWORD[48+rdx] + pand xmm8,xmm0 + movdqa xmm12,XMMWORD[64+rdx] + pand xmm9,xmm1 + movdqa xmm13,XMMWORD[80+rdx] + pand xmm10,xmm2 + movdqa xmm14,XMMWORD[96+rdx] + pand xmm11,xmm3 + movdqa xmm15,XMMWORD[112+rdx] lea rdx,[128+rdx] - shl r8,32 - or rax,r8 - mov QWORD[rcx],rax + pand xmm12,xmm4 + pand xmm13,xmm5 + pand xmm14,xmm6 + pand xmm15,xmm7 + por xmm8,xmm10 + por xmm9,xmm11 + por xmm8,xmm12 + por xmm9,xmm13 + por xmm8,xmm14 + por xmm9,xmm15 + + por xmm8,xmm9 + pshufd xmm9,xmm8,0x4e + por xmm8,xmm9 + movq QWORD[rcx],xmm8 lea rcx,[8+rcx] dec r9d jnz NEAR $L$oop_gather + movaps xmm6,XMMWORD[rsp] + movaps xmm7,XMMWORD[16+rsp] + movaps xmm8,XMMWORD[32+rsp] + movaps xmm9,XMMWORD[48+rsp] + movaps xmm10,XMMWORD[64+rsp] + movaps xmm11,XMMWORD[80+rsp] + movaps xmm12,XMMWORD[96+rsp] + movaps xmm13,XMMWORD[112+rsp] + movaps xmm14,XMMWORD[128+rsp] + movaps xmm15,XMMWORD[144+rsp] + add rsp,0xa8 DB 0F3h,0C3h ;repret +$L$SEH_end_rsaz_512_gather4: + +ALIGN 64 +$L$inc: + DD 0,0,1,1 + DD 2,2,2,2 EXTERN __imp_RtlVirtualUnwind ALIGN 16 @@ -1221,6 +1367,18 @@ se_handler: lea rax,[((128+24+48))+rax] + lea rbx,[$L$mul_gather4_epilogue] + cmp rbx,r10 + jne NEAR $L$se_not_in_mul_gather4 + + lea rax,[176+rax] + + lea rsi,[((-48-168))+rax] + lea rdi,[512+r8] + mov ecx,20 + DD 0xa548f3fc + +$L$se_not_in_mul_gather4: mov rbx,QWORD[((-8))+rax] mov rbp,QWORD[((-16))+rax] mov r12,QWORD[((-24))+rax] @@ -1296,6 +1454,10 @@ ALIGN 4 DD $L$SEH_end_rsaz_512_mul_by_one wrt ..imagebase DD $L$SEH_info_rsaz_512_mul_by_one wrt ..imagebase + DD $L$SEH_begin_rsaz_512_gather4 wrt ..imagebase + DD $L$SEH_end_rsaz_512_gather4 wrt ..imagebase + DD $L$SEH_info_rsaz_512_gather4 wrt ..imagebase + section .xdata rdata align=8 ALIGN 8 $L$SEH_info_rsaz_512_sqr: @@ -1318,3 +1480,16 @@ $L$SEH_info_rsaz_512_mul_by_one: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$mul_by_one_body wrt ..imagebase,$L$mul_by_one_epilogue wrt ..imagebase +$L$SEH_info_rsaz_512_gather4: +DB 0x01,0x46,0x16,0x00 +DB 0x46,0xf8,0x09,0x00 +DB 0x3d,0xe8,0x08,0x00 +DB 0x34,0xd8,0x07,0x00 +DB 0x2e,0xc8,0x06,0x00 +DB 0x28,0xb8,0x05,0x00 +DB 0x22,0xa8,0x04,0x00 +DB 0x1c,0x98,0x03,0x00 +DB 0x16,0x88,0x02,0x00 +DB 0x10,0x78,0x01,0x00 +DB 0x0b,0x68,0x00,0x00 +DB 0x07,0x01,0x15,0x00 diff --git a/packager/third_party/boringssl/win-x86_64/crypto/bn/x86_64-mont.asm b/packager/third_party/boringssl/win-x86_64/crypto/bn/x86_64-mont.asm index db0d1b976f..4d8e1cb72a 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/bn/x86_64-mont.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/bn/x86_64-mont.asm @@ -677,20 +677,20 @@ $L$sqr8x_enter: - lea r11,[((-64))+r9*4+rsp] + lea r11,[((-64))+r9*2+rsp] mov r8,QWORD[r8] sub r11,rsi and r11,4095 cmp r10,r11 jb NEAR $L$sqr8x_sp_alt sub rsp,r11 - lea rsp,[((-64))+r9*4+rsp] + lea rsp,[((-64))+r9*2+rsp] jmp NEAR $L$sqr8x_sp_done ALIGN 32 $L$sqr8x_sp_alt: - lea r10,[((4096-64))+r9*4] - lea rsp,[((-64))+r9*4+rsp] + lea r10,[((4096-64))+r9*2] + lea rsp,[((-64))+r9*2+rsp] sub r11,r10 mov r10,0 cmovc 
r11,r10 @@ -700,58 +700,80 @@ $L$sqr8x_sp_done: mov r10,r9 neg r9 - lea r11,[64+r9*2+rsp] mov QWORD[32+rsp],r8 mov QWORD[40+rsp],rax $L$sqr8x_body: - mov rbp,r9 -DB 102,73,15,110,211 - shr rbp,3+2 - mov eax,DWORD[((OPENSSL_ia32cap_P+8))] - jmp NEAR $L$sqr8x_copy_n - -ALIGN 32 -$L$sqr8x_copy_n: - movq xmm0,QWORD[rcx] - movq xmm1,QWORD[8+rcx] - movq xmm3,QWORD[16+rcx] - movq xmm4,QWORD[24+rcx] - lea rcx,[32+rcx] - movdqa XMMWORD[r11],xmm0 - movdqa XMMWORD[16+r11],xmm1 - movdqa XMMWORD[32+r11],xmm3 - movdqa XMMWORD[48+r11],xmm4 - lea r11,[64+r11] - dec rbp - jnz NEAR $L$sqr8x_copy_n - +DB 102,72,15,110,209 pxor xmm0,xmm0 DB 102,72,15,110,207 DB 102,73,15,110,218 call bn_sqr8x_internal - pxor xmm0,xmm0 - lea rax,[48+rsp] - lea rdx,[64+r9*2+rsp] - shr r9,3+2 - mov rsi,QWORD[40+rsp] - jmp NEAR $L$sqr8x_zero + + + + lea rbx,[r9*1+rdi] + mov rcx,r9 + mov rdx,r9 +DB 102,72,15,126,207 + sar rcx,3+2 + jmp NEAR $L$sqr8x_sub ALIGN 32 -$L$sqr8x_zero: - movdqa XMMWORD[rax],xmm0 - movdqa XMMWORD[16+rax],xmm0 - movdqa XMMWORD[32+rax],xmm0 - movdqa XMMWORD[48+rax],xmm0 - lea rax,[64+rax] - movdqa XMMWORD[rdx],xmm0 - movdqa XMMWORD[16+rdx],xmm0 - movdqa XMMWORD[32+rdx],xmm0 - movdqa XMMWORD[48+rdx],xmm0 - lea rdx,[64+rdx] - dec r9 - jnz NEAR $L$sqr8x_zero +$L$sqr8x_sub: + mov r12,QWORD[rbx] + mov r13,QWORD[8+rbx] + mov r14,QWORD[16+rbx] + mov r15,QWORD[24+rbx] + lea rbx,[32+rbx] + sbb r12,QWORD[rbp] + sbb r13,QWORD[8+rbp] + sbb r14,QWORD[16+rbp] + sbb r15,QWORD[24+rbp] + lea rbp,[32+rbp] + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r14 + mov QWORD[24+rdi],r15 + lea rdi,[32+rdi] + inc rcx + jnz NEAR $L$sqr8x_sub + + sbb rax,0 + lea rbx,[r9*1+rbx] + lea rdi,[r9*1+rdi] + +DB 102,72,15,110,200 + pxor xmm0,xmm0 + pshufd xmm1,xmm1,0 + mov rsi,QWORD[40+rsp] + jmp NEAR $L$sqr8x_cond_copy + +ALIGN 32 +$L$sqr8x_cond_copy: + movdqa xmm2,XMMWORD[rbx] + movdqa xmm3,XMMWORD[16+rbx] + lea rbx,[32+rbx] + movdqu xmm4,XMMWORD[rdi] + movdqu xmm5,XMMWORD[16+rdi] + lea rdi,[32+rdi] + movdqa XMMWORD[(-32)+rbx],xmm0 + movdqa XMMWORD[(-16)+rbx],xmm0 + movdqa XMMWORD[(-32)+rdx*1+rbx],xmm0 + movdqa XMMWORD[(-16)+rdx*1+rbx],xmm0 + pcmpeqd xmm0,xmm1 + pand xmm2,xmm1 + pand xmm3,xmm1 + pand xmm4,xmm0 + pand xmm5,xmm0 + pxor xmm0,xmm0 + por xmm4,xmm2 + por xmm5,xmm3 + movdqu XMMWORD[(-32)+rdi],xmm4 + movdqu XMMWORD[(-16)+rdi],xmm5 + add r9,32 + jnz NEAR $L$sqr8x_cond_copy mov rax,1 mov r15,QWORD[((-48))+rsi] diff --git a/packager/third_party/boringssl/win-x86_64/crypto/bn/x86_64-mont5.asm b/packager/third_party/boringssl/win-x86_64/crypto/bn/x86_64-mont5.asm index 284318aae3..cd9a6e5d4e 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/bn/x86_64-mont5.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/bn/x86_64-mont5.asm @@ -31,49 +31,151 @@ ALIGN 16 $L$mul_enter: mov r9d,r9d mov rax,rsp - mov r10d,DWORD[56+rsp] + movd xmm5,DWORD[56+rsp] + lea r10,[$L$inc] push rbx push rbp push r12 push r13 push r14 push r15 - lea rsp,[((-40))+rsp] - movaps XMMWORD[rsp],xmm6 - movaps XMMWORD[16+rsp],xmm7 + lea r11,[2+r9] neg r11 - lea rsp,[r11*8+rsp] + lea rsp,[((-264))+r11*8+rsp] and rsp,-1024 mov QWORD[8+r9*8+rsp],rax $L$mul_body: - mov r12,rdx - mov r11,r10 - shr r10,3 - and r11,7 - not r10 - lea rax,[$L$magic_masks] - and r10,3 - lea r12,[96+r11*8+r12] - movq xmm4,QWORD[r10*8+rax] - movq xmm5,QWORD[8+r10*8+rax] - movq xmm6,QWORD[16+r10*8+rax] - movq xmm7,QWORD[24+r10*8+rax] + lea r12,[128+rdx] + movdqa xmm0,XMMWORD[r10] + movdqa xmm1,XMMWORD[16+r10] + lea r10,[((24-112))+r9*8+rsp] + and r10,-16 - movq 
xmm0,QWORD[(((-96)))+r12] - movq xmm1,QWORD[((-32))+r12] - pand xmm0,xmm4 - movq xmm2,QWORD[32+r12] - pand xmm1,xmm5 - movq xmm3,QWORD[96+r12] - pand xmm2,xmm6 - por xmm0,xmm1 - pand xmm3,xmm7 + pshufd xmm5,xmm5,0 + movdqa xmm4,xmm1 + movdqa xmm2,xmm1 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 +DB 0x67 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[112+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[128+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[144+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[160+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[176+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[192+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[208+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[224+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[240+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[256+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[272+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[288+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[304+r10],xmm0 + + paddd xmm3,xmm2 +DB 0x67 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[320+r10],xmm1 + + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[336+r10],xmm2 + pand xmm0,XMMWORD[64+r12] + + pand xmm1,XMMWORD[80+r12] + pand xmm2,XMMWORD[96+r12] + movdqa XMMWORD[352+r10],xmm3 + pand xmm3,XMMWORD[112+r12] por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[((-128))+r12] + movdqa xmm5,XMMWORD[((-112))+r12] + movdqa xmm2,XMMWORD[((-96))+r12] + pand xmm4,XMMWORD[112+r10] + movdqa xmm3,XMMWORD[((-80))+r12] + pand xmm5,XMMWORD[128+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[144+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[160+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[((-64))+r12] + movdqa xmm5,XMMWORD[((-48))+r12] + movdqa xmm2,XMMWORD[((-32))+r12] + pand xmm4,XMMWORD[176+r10] + movdqa xmm3,XMMWORD[((-16))+r12] + pand xmm5,XMMWORD[192+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[208+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[224+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[r12] + movdqa xmm5,XMMWORD[16+r12] + movdqa xmm2,XMMWORD[32+r12] + pand xmm4,XMMWORD[240+r10] + movdqa xmm3,XMMWORD[48+r12] + pand xmm5,XMMWORD[256+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[272+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[288+r10] + por xmm0,xmm2 + por xmm1,xmm3 + por xmm0,xmm1 + pshufd xmm1,xmm0,0x4e + por xmm0,xmm1 lea r12,[256+r12] - por xmm0,xmm3 - DB 102,72,15,126,195 mov r8,QWORD[r8] @@ -82,29 +184,14 @@ DB 102,72,15,126,195 xor r14,r14 xor r15,r15 - movq xmm0,QWORD[(((-96)))+r12] - movq xmm1,QWORD[((-32))+r12] - pand xmm0,xmm4 - movq xmm2,QWORD[32+r12] - pand xmm1,xmm5 - mov rbp,r8 mul rbx mov r10,rax mov rax,QWORD[rcx] - movq xmm3,QWORD[96+r12] - pand xmm2,xmm6 - por xmm0,xmm1 - pand xmm3,xmm7 - imul rbp,r10 mov r11,rdx - por xmm0,xmm2 - lea r12,[256+r12] - por xmm0,xmm3 - mul rbp add r10,rax mov rax,QWORD[8+rsi] @@ -137,14 +224,12 @@ $L$1st_enter: cmp r15,r9 jne NEAR $L$1st -DB 102,72,15,126,195 add r13,rax - mov rax,QWORD[rsi] adc rdx,0 add r13,r11 adc rdx,0 - mov QWORD[((-16))+r15*8+rsp],r13 + mov QWORD[((-16))+r9*8+rsp],r13 mov r13,rdx mov r11,r10 @@ -158,33 +243,78 @@ DB 
102,72,15,126,195 jmp NEAR $L$outer ALIGN 16 $L$outer: + lea rdx,[((24+128))+r9*8+rsp] + and rdx,-16 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + movdqa xmm0,XMMWORD[((-128))+r12] + movdqa xmm1,XMMWORD[((-112))+r12] + movdqa xmm2,XMMWORD[((-96))+r12] + movdqa xmm3,XMMWORD[((-80))+r12] + pand xmm0,XMMWORD[((-128))+rdx] + pand xmm1,XMMWORD[((-112))+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD[((-96))+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD[((-80))+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[((-64))+r12] + movdqa xmm1,XMMWORD[((-48))+r12] + movdqa xmm2,XMMWORD[((-32))+r12] + movdqa xmm3,XMMWORD[((-16))+r12] + pand xmm0,XMMWORD[((-64))+rdx] + pand xmm1,XMMWORD[((-48))+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD[((-32))+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD[((-16))+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[r12] + movdqa xmm1,XMMWORD[16+r12] + movdqa xmm2,XMMWORD[32+r12] + movdqa xmm3,XMMWORD[48+r12] + pand xmm0,XMMWORD[rdx] + pand xmm1,XMMWORD[16+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD[32+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD[48+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[64+r12] + movdqa xmm1,XMMWORD[80+r12] + movdqa xmm2,XMMWORD[96+r12] + movdqa xmm3,XMMWORD[112+r12] + pand xmm0,XMMWORD[64+rdx] + pand xmm1,XMMWORD[80+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD[96+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD[112+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + por xmm4,xmm5 + pshufd xmm0,xmm4,0x4e + por xmm0,xmm4 + lea r12,[256+r12] + + mov rax,QWORD[rsi] +DB 102,72,15,126,195 + xor r15,r15 mov rbp,r8 mov r10,QWORD[rsp] - movq xmm0,QWORD[(((-96)))+r12] - movq xmm1,QWORD[((-32))+r12] - pand xmm0,xmm4 - movq xmm2,QWORD[32+r12] - pand xmm1,xmm5 - mul rbx add r10,rax mov rax,QWORD[rcx] adc rdx,0 - movq xmm3,QWORD[96+r12] - pand xmm2,xmm6 - por xmm0,xmm1 - pand xmm3,xmm7 - imul rbp,r10 mov r11,rdx - por xmm0,xmm2 - lea r12,[256+r12] - por xmm0,xmm3 - mul rbp add r10,rax mov rax,QWORD[8+rsi] @@ -220,15 +350,12 @@ $L$inner_enter: cmp r15,r9 jne NEAR $L$inner -DB 102,72,15,126,195 - add r13,rax - mov rax,QWORD[rsi] adc rdx,0 add r13,r10 - mov r10,QWORD[r15*8+rsp] + mov r10,QWORD[r9*8+rsp] adc rdx,0 - mov QWORD[((-16))+r15*8+rsp],r13 + mov QWORD[((-16))+r9*8+rsp],r13 mov r13,rdx xor rdx,rdx @@ -274,8 +401,7 @@ $L$copy: mov rsi,QWORD[8+r9*8+rsp] mov rax,1 - movaps xmm6,XMMWORD[((-88))+rsi] - movaps xmm7,XMMWORD[((-72))+rsi] + mov r15,QWORD[((-48))+rsi] mov r14,QWORD[((-40))+rsi] mov r13,QWORD[((-32))+rsi] @@ -312,13 +438,10 @@ DB 0x67 push r13 push r14 push r15 - lea rsp,[((-40))+rsp] - movaps XMMWORD[rsp],xmm6 - movaps XMMWORD[16+rsp],xmm7 + DB 0x67 - mov r10d,r9d shl r9d,3 - shl r10d,3+2 + lea r10,[r9*2+r9] neg r9 @@ -328,19 +451,21 @@ DB 0x67 - lea r11,[((-64))+r9*2+rsp] - sub r11,rsi + + + lea r11,[((-320))+r9*2+rsp] + sub r11,rdi and r11,4095 cmp r10,r11 jb NEAR $L$mul4xsp_alt sub rsp,r11 - lea rsp,[((-64))+r9*2+rsp] + lea rsp,[((-320))+r9*2+rsp] jmp NEAR $L$mul4xsp_done ALIGN 32 $L$mul4xsp_alt: - lea r10,[((4096-64))+r9*2] - lea rsp,[((-64))+r9*2+rsp] + lea r10,[((4096-320))+r9*2] + lea rsp,[((-320))+r9*2+rsp] sub r11,r10 mov r10,0 cmovc r11,r10 @@ -356,8 +481,7 @@ $L$mul4x_body: mov rsi,QWORD[40+rsp] mov rax,1 - movaps xmm6,XMMWORD[((-88))+rsi] - movaps xmm7,XMMWORD[((-72))+rsi] + mov r15,QWORD[((-48))+rsi] mov r14,QWORD[((-40))+rsi] mov r13,QWORD[((-32))+rsi] @@ -375,47 +499,141 @@ $L$SEH_end_bn_mul4x_mont_gather5: ALIGN 32 mul4x_internal: shl r9,5 - mov r10d,DWORD[56+rax] - lea r13,[256+r9*1+rdx] + movd xmm5,DWORD[56+rax] + lea rax,[$L$inc] + lea r13,[128+r9*1+rdx] shr 
r9,5 - mov r11,r10 - shr r10,3 - and r11,7 - not r10 - lea rax,[$L$magic_masks] - and r10,3 - lea r12,[96+r11*8+rdx] - movq xmm4,QWORD[r10*8+rax] - movq xmm5,QWORD[8+r10*8+rax] - add r11,7 - movq xmm6,QWORD[16+r10*8+rax] - movq xmm7,QWORD[24+r10*8+rax] - and r11,7 + movdqa xmm0,XMMWORD[rax] + movdqa xmm1,XMMWORD[16+rax] + lea r10,[((88-112))+r9*1+rsp] + lea r12,[128+rdx] - movq xmm0,QWORD[(((-96)))+r12] - lea r14,[256+r12] - movq xmm1,QWORD[((-32))+r12] - pand xmm0,xmm4 - movq xmm2,QWORD[32+r12] - pand xmm1,xmm5 - movq xmm3,QWORD[96+r12] - pand xmm2,xmm6 + pshufd xmm5,xmm5,0 + movdqa xmm4,xmm1 +DB 0x67,0x67 + movdqa xmm2,xmm1 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 DB 0x67 - por xmm0,xmm1 - movq xmm1,QWORD[((-96))+r14] -DB 0x67 - pand xmm3,xmm7 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[112+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[128+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[144+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[160+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[176+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[192+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[208+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[224+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[240+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[256+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[272+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[288+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[304+r10],xmm0 + + paddd xmm3,xmm2 DB 0x67 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[320+r10],xmm1 + + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[336+r10],xmm2 + pand xmm0,XMMWORD[64+r12] + + pand xmm1,XMMWORD[80+r12] + pand xmm2,XMMWORD[96+r12] + movdqa XMMWORD[352+r10],xmm3 + pand xmm3,XMMWORD[112+r12] por xmm0,xmm2 - movq xmm2,QWORD[((-32))+r14] -DB 0x67 - pand xmm1,xmm4 -DB 0x67 - por xmm0,xmm3 - movq xmm3,QWORD[32+r14] - + por xmm1,xmm3 + movdqa xmm4,XMMWORD[((-128))+r12] + movdqa xmm5,XMMWORD[((-112))+r12] + movdqa xmm2,XMMWORD[((-96))+r12] + pand xmm4,XMMWORD[112+r10] + movdqa xmm3,XMMWORD[((-80))+r12] + pand xmm5,XMMWORD[128+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[144+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[160+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[((-64))+r12] + movdqa xmm5,XMMWORD[((-48))+r12] + movdqa xmm2,XMMWORD[((-32))+r12] + pand xmm4,XMMWORD[176+r10] + movdqa xmm3,XMMWORD[((-16))+r12] + pand xmm5,XMMWORD[192+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[208+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[224+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[r12] + movdqa xmm5,XMMWORD[16+r12] + movdqa xmm2,XMMWORD[32+r12] + pand xmm4,XMMWORD[240+r10] + movdqa xmm3,XMMWORD[48+r12] + pand xmm5,XMMWORD[256+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[272+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[288+r10] + por xmm0,xmm2 + por xmm1,xmm3 + por xmm0,xmm1 + pshufd xmm1,xmm0,0x4e + por xmm0,xmm1 + lea r12,[256+r12] DB 102,72,15,126,195 - movq xmm0,QWORD[96+r14] + mov QWORD[((16+8))+rsp],r13 mov QWORD[((56+8))+rsp],rdi @@ -429,26 +647,10 @@ DB 102,72,15,126,195 mov r10,rax mov rax,QWORD[rcx] - pand xmm2,xmm5 - pand 
xmm3,xmm6 - por xmm1,xmm2 - imul rbp,r10 - - - - - - - - lea r14,[((64+8))+r11*8+rsp] + lea r14,[((64+8))+rsp] mov r11,rdx - pand xmm0,xmm7 - por xmm1,xmm3 - lea r12,[512+r12] - por xmm0,xmm1 - mul rbp add r10,rax mov rax,QWORD[8+r9*1+rsi] @@ -457,7 +659,7 @@ DB 102,72,15,126,195 mul rbx add r11,rax - mov rax,QWORD[16+rcx] + mov rax,QWORD[8+rcx] adc rdx,0 mov r10,rdx @@ -467,7 +669,7 @@ DB 102,72,15,126,195 adc rdx,0 add rdi,r11 lea r15,[32+r9] - lea rcx,[64+rcx] + lea rcx,[32+rcx] adc rdx,0 mov QWORD[r14],rdi mov r13,rdx @@ -477,7 +679,7 @@ ALIGN 32 $L$1st4x: mul rbx add r10,rax - mov rax,QWORD[((-32))+rcx] + mov rax,QWORD[((-16))+rcx] lea r14,[32+r14] adc rdx,0 mov r11,rdx @@ -493,7 +695,7 @@ $L$1st4x: mul rbx add r11,rax - mov rax,QWORD[((-16))+rcx] + mov rax,QWORD[((-8))+rcx] adc rdx,0 mov r10,rdx @@ -523,7 +725,7 @@ $L$1st4x: mul rbx add r11,rax - mov rax,QWORD[16+rcx] + mov rax,QWORD[8+rcx] adc rdx,0 mov r10,rdx @@ -532,7 +734,7 @@ $L$1st4x: mov rax,QWORD[16+r15*1+rsi] adc rdx,0 add rdi,r11 - lea rcx,[64+rcx] + lea rcx,[32+rcx] adc rdx,0 mov QWORD[r14],rdi mov r13,rdx @@ -542,7 +744,7 @@ $L$1st4x: mul rbx add r10,rax - mov rax,QWORD[((-32))+rcx] + mov rax,QWORD[((-16))+rcx] lea r14,[32+r14] adc rdx,0 mov r11,rdx @@ -558,7 +760,7 @@ $L$1st4x: mul rbx add r11,rax - mov rax,QWORD[((-16))+rcx] + mov rax,QWORD[((-8))+rcx] adc rdx,0 mov r10,rdx @@ -571,8 +773,7 @@ $L$1st4x: mov QWORD[((-16))+r14],rdi mov r13,rdx -DB 102,72,15,126,195 - lea rcx,[r9*2+rcx] + lea rcx,[r9*1+rcx] xor rdi,rdi add r13,r10 @@ -583,6 +784,63 @@ DB 102,72,15,126,195 ALIGN 32 $L$outer4x: + lea rdx,[((16+128))+r14] + pxor xmm4,xmm4 + pxor xmm5,xmm5 + movdqa xmm0,XMMWORD[((-128))+r12] + movdqa xmm1,XMMWORD[((-112))+r12] + movdqa xmm2,XMMWORD[((-96))+r12] + movdqa xmm3,XMMWORD[((-80))+r12] + pand xmm0,XMMWORD[((-128))+rdx] + pand xmm1,XMMWORD[((-112))+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD[((-96))+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD[((-80))+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[((-64))+r12] + movdqa xmm1,XMMWORD[((-48))+r12] + movdqa xmm2,XMMWORD[((-32))+r12] + movdqa xmm3,XMMWORD[((-16))+r12] + pand xmm0,XMMWORD[((-64))+rdx] + pand xmm1,XMMWORD[((-48))+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD[((-32))+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD[((-16))+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[r12] + movdqa xmm1,XMMWORD[16+r12] + movdqa xmm2,XMMWORD[32+r12] + movdqa xmm3,XMMWORD[48+r12] + pand xmm0,XMMWORD[rdx] + pand xmm1,XMMWORD[16+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD[32+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD[48+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[64+r12] + movdqa xmm1,XMMWORD[80+r12] + movdqa xmm2,XMMWORD[96+r12] + movdqa xmm3,XMMWORD[112+r12] + pand xmm0,XMMWORD[64+rdx] + pand xmm1,XMMWORD[80+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD[96+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD[112+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + por xmm4,xmm5 + pshufd xmm0,xmm4,0x4e + por xmm0,xmm4 + lea r12,[256+r12] +DB 102,72,15,126,195 + mov r10,QWORD[r9*1+r14] mov rbp,r8 mul rbx @@ -590,25 +848,11 @@ $L$outer4x: mov rax,QWORD[rcx] adc rdx,0 - movq xmm0,QWORD[(((-96)))+r12] - movq xmm1,QWORD[((-32))+r12] - pand xmm0,xmm4 - movq xmm2,QWORD[32+r12] - pand xmm1,xmm5 - movq xmm3,QWORD[96+r12] - imul rbp,r10 -DB 0x67 mov r11,rdx mov QWORD[r14],rdi - pand xmm2,xmm6 - por xmm0,xmm1 - pand xmm3,xmm7 - por xmm0,xmm2 lea r14,[r9*1+r14] - lea r12,[256+r12] - por xmm0,xmm3 mul rbp add r10,rax @@ -618,7 +862,7 @@ DB 0x67 mul rbx add r11,rax - mov rax,QWORD[16+rcx] + mov 
rax,QWORD[8+rcx] adc rdx,0 add r11,QWORD[8+r14] adc rdx,0 @@ -630,7 +874,7 @@ DB 0x67 adc rdx,0 add rdi,r11 lea r15,[32+r9] - lea rcx,[64+rcx] + lea rcx,[32+rcx] adc rdx,0 mov r13,rdx jmp NEAR $L$inner4x @@ -639,7 +883,7 @@ ALIGN 32 $L$inner4x: mul rbx add r10,rax - mov rax,QWORD[((-32))+rcx] + mov rax,QWORD[((-16))+rcx] adc rdx,0 add r10,QWORD[16+r14] lea r14,[32+r14] @@ -657,7 +901,7 @@ $L$inner4x: mul rbx add r11,rax - mov rax,QWORD[((-16))+rcx] + mov rax,QWORD[((-8))+rcx] adc rdx,0 add r11,QWORD[((-8))+r14] adc rdx,0 @@ -691,7 +935,7 @@ $L$inner4x: mul rbx add r11,rax - mov rax,QWORD[16+rcx] + mov rax,QWORD[8+rcx] adc rdx,0 add r11,QWORD[8+r14] adc rdx,0 @@ -702,7 +946,7 @@ $L$inner4x: mov rax,QWORD[16+r15*1+rsi] adc rdx,0 add rdi,r11 - lea rcx,[64+rcx] + lea rcx,[32+rcx] adc rdx,0 mov QWORD[((-8))+r14],r13 mov r13,rdx @@ -712,7 +956,7 @@ $L$inner4x: mul rbx add r10,rax - mov rax,QWORD[((-32))+rcx] + mov rax,QWORD[((-16))+rcx] adc rdx,0 add r10,QWORD[16+r14] lea r14,[32+r14] @@ -731,7 +975,7 @@ $L$inner4x: mul rbx add r11,rax mov rax,rbp - mov rbp,QWORD[((-16))+rcx] + mov rbp,QWORD[((-8))+rcx] adc rdx,0 add r11,QWORD[((-8))+r14] adc rdx,0 @@ -746,9 +990,8 @@ $L$inner4x: mov QWORD[((-24))+r14],r13 mov r13,rdx -DB 102,72,15,126,195 mov QWORD[((-16))+r14],rdi - lea rcx,[r9*2+rcx] + lea rcx,[r9*1+rcx] xor rdi,rdi add r13,r10 @@ -759,16 +1002,23 @@ DB 102,72,15,126,195 cmp r12,QWORD[((16+8))+rsp] jb NEAR $L$outer4x + xor rax,rax sub rbp,r13 adc r15,r15 or rdi,r15 - xor rdi,1 + sub rax,rdi lea rbx,[r9*1+r14] - lea rbp,[rdi*8+rcx] + mov r12,QWORD[rcx] + lea rbp,[rcx] mov rcx,r9 sar rcx,3+2 mov rdi,QWORD[((56+8))+rsp] - jmp NEAR $L$sqr4x_sub + dec r12 + xor r10,r10 + mov r13,QWORD[8+rbp] + mov r14,QWORD[16+rbp] + mov r15,QWORD[24+rbp] + jmp NEAR $L$sqr4x_sub_entry global bn_power5 @@ -793,12 +1043,9 @@ $L$SEH_begin_bn_power5: push r13 push r14 push r15 - lea rsp,[((-40))+rsp] - movaps XMMWORD[rsp],xmm6 - movaps XMMWORD[16+rsp],xmm7 - mov r10d,r9d + shl r9d,3 - shl r10d,3+2 + lea r10d,[r9*2+r9] neg r9 mov r8,QWORD[r8] @@ -808,19 +1055,20 @@ $L$SEH_begin_bn_power5: - lea r11,[((-64))+r9*2+rsp] - sub r11,rsi + + lea r11,[((-320))+r9*2+rsp] + sub r11,rdi and r11,4095 cmp r10,r11 jb NEAR $L$pwr_sp_alt sub rsp,r11 - lea rsp,[((-64))+r9*2+rsp] + lea rsp,[((-320))+r9*2+rsp] jmp NEAR $L$pwr_sp_done ALIGN 32 $L$pwr_sp_alt: - lea r10,[((4096-64))+r9*2] - lea rsp,[((-64))+r9*2+rsp] + lea r10,[((4096-320))+r9*2] + lea rsp,[((-320))+r9*2+rsp] sub r11,r10 mov r10,0 cmovc r11,r10 @@ -848,10 +1096,15 @@ DB 102,73,15,110,218 DB 102,72,15,110,226 call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal call __bn_sqr8x_internal + call __bn_post4x_internal DB 102,72,15,126,209 DB 102,72,15,126,226 @@ -1397,9 +1650,9 @@ DB 0x67 mov QWORD[((-16))+rdi],rbx mov QWORD[((-8))+rdi],r8 DB 102,72,15,126,213 -sqr8x_reduction: +__bn_sqr8x_reduction: xor rax,rax - lea rcx,[r9*2+rbp] + lea rcx,[rbp*1+r9] lea rdx,[((48+8))+r9*2+rsp] mov QWORD[((0+8))+rsp],rcx lea rdi,[((48+8))+r9*1+rsp] @@ -1432,14 +1685,14 @@ DB 0x67 ALIGN 32 $L$8x_reduce: mul rbx - mov rax,QWORD[16+rbp] + mov rax,QWORD[8+rbp] neg r8 mov r8,rdx adc r8,0 mul rbx add r9,rax - mov rax,QWORD[32+rbp] + mov rax,QWORD[16+rbp] adc rdx,0 add r8,r9 mov QWORD[((48-8+8))+rcx*8+rsp],rbx @@ -1448,7 +1701,7 @@ $L$8x_reduce: mul rbx add r10,rax - mov rax,QWORD[48+rbp] + mov rax,QWORD[24+rbp] adc rdx,0 add r9,r10 mov 
rsi,QWORD[((32+8))+rsp] @@ -1457,7 +1710,7 @@ $L$8x_reduce: mul rbx add r11,rax - mov rax,QWORD[64+rbp] + mov rax,QWORD[32+rbp] adc rdx,0 imul rsi,r8 add r10,r11 @@ -1466,7 +1719,7 @@ $L$8x_reduce: mul rbx add r12,rax - mov rax,QWORD[80+rbp] + mov rax,QWORD[40+rbp] adc rdx,0 add r11,r12 mov r12,rdx @@ -1474,7 +1727,7 @@ $L$8x_reduce: mul rbx add r13,rax - mov rax,QWORD[96+rbp] + mov rax,QWORD[48+rbp] adc rdx,0 add r12,r13 mov r13,rdx @@ -1482,7 +1735,7 @@ $L$8x_reduce: mul rbx add r14,rax - mov rax,QWORD[112+rbp] + mov rax,QWORD[56+rbp] adc rdx,0 add r13,r14 mov r14,rdx @@ -1500,7 +1753,7 @@ $L$8x_reduce: dec ecx jnz NEAR $L$8x_reduce - lea rbp,[128+rbp] + lea rbp,[64+rbp] xor rax,rax mov rdx,QWORD[((8+8))+rsp] cmp rbp,QWORD[((0+8))+rsp] @@ -1526,14 +1779,14 @@ ALIGN 32 $L$8x_tail: mul rbx add r8,rax - mov rax,QWORD[16+rbp] + mov rax,QWORD[8+rbp] mov QWORD[rdi],r8 mov r8,rdx adc r8,0 mul rbx add r9,rax - mov rax,QWORD[32+rbp] + mov rax,QWORD[16+rbp] adc rdx,0 add r8,r9 lea rdi,[8+rdi] @@ -1542,7 +1795,7 @@ $L$8x_tail: mul rbx add r10,rax - mov rax,QWORD[48+rbp] + mov rax,QWORD[24+rbp] adc rdx,0 add r9,r10 mov r10,rdx @@ -1550,7 +1803,7 @@ $L$8x_tail: mul rbx add r11,rax - mov rax,QWORD[64+rbp] + mov rax,QWORD[32+rbp] adc rdx,0 add r10,r11 mov r11,rdx @@ -1558,7 +1811,7 @@ $L$8x_tail: mul rbx add r12,rax - mov rax,QWORD[80+rbp] + mov rax,QWORD[40+rbp] adc rdx,0 add r11,r12 mov r12,rdx @@ -1566,7 +1819,7 @@ $L$8x_tail: mul rbx add r13,rax - mov rax,QWORD[96+rbp] + mov rax,QWORD[48+rbp] adc rdx,0 add r12,r13 mov r13,rdx @@ -1574,7 +1827,7 @@ $L$8x_tail: mul rbx add r14,rax - mov rax,QWORD[112+rbp] + mov rax,QWORD[56+rbp] adc rdx,0 add r13,r14 mov r14,rdx @@ -1592,7 +1845,7 @@ $L$8x_tail: dec ecx jnz NEAR $L$8x_tail - lea rbp,[128+rbp] + lea rbp,[64+rbp] mov rdx,QWORD[((8+8))+rsp] cmp rbp,QWORD[((0+8))+rsp] jae NEAR $L$8x_tail_done @@ -1616,6 +1869,15 @@ $L$8x_tail: ALIGN 32 $L$8x_tail_done: add r8,QWORD[rdx] + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + adc r14,0 + adc r15,0 + + xor rax,rax neg rsi @@ -1629,7 +1891,7 @@ $L$8x_no_tail: adc r14,QWORD[48+rdi] adc r15,QWORD[56+rdi] adc rax,0 - mov rcx,QWORD[((-16))+rbp] + mov rcx,QWORD[((-8))+rbp] xor rsi,rsi DB 102,72,15,126,213 @@ -1647,40 +1909,58 @@ DB 102,73,15,126,217 cmp rdi,rdx jb NEAR $L$8x_reduction_loop + DB 0F3h,0C3h ;repret - sub rcx,r15 - lea rbx,[r9*1+rdi] - adc rsi,rsi - mov rcx,r9 - or rax,rsi -DB 102,72,15,126,207 - xor rax,1 -DB 102,72,15,126,206 - lea rbp,[rax*8+rbp] - sar rcx,3+2 - jmp NEAR $L$sqr4x_sub ALIGN 32 +__bn_post4x_internal: + mov r12,QWORD[rbp] + lea rbx,[r9*1+rdi] + mov rcx,r9 +DB 102,72,15,126,207 + neg rax +DB 102,72,15,126,206 + sar rcx,3+2 + dec r12 + xor r10,r10 + mov r13,QWORD[8+rbp] + mov r14,QWORD[16+rbp] + mov r15,QWORD[24+rbp] + jmp NEAR $L$sqr4x_sub_entry + +ALIGN 16 $L$sqr4x_sub: -DB 0x66 - mov r12,QWORD[rbx] - mov r13,QWORD[8+rbx] - sbb r12,QWORD[rbp] - mov r14,QWORD[16+rbx] - sbb r13,QWORD[16+rbp] - mov r15,QWORD[24+rbx] - lea rbx,[32+rbx] - sbb r14,QWORD[32+rbp] + mov r12,QWORD[rbp] + mov r13,QWORD[8+rbp] + mov r14,QWORD[16+rbp] + mov r15,QWORD[24+rbp] +$L$sqr4x_sub_entry: + lea rbp,[32+rbp] + not r12 + not r13 + not r14 + not r15 + and r12,rax + and r13,rax + and r14,rax + and r15,rax + + neg r10 + adc r12,QWORD[rbx] + adc r13,QWORD[8+rbx] + adc r14,QWORD[16+rbx] + adc r15,QWORD[24+rbx] mov QWORD[rdi],r12 - sbb r15,QWORD[48+rbp] - lea rbp,[64+rbp] + lea rbx,[32+rbx] mov QWORD[8+rdi],r13 + sbb r10,r10 mov QWORD[16+rdi],r14 mov QWORD[24+rdi],r15 lea rdi,[32+rdi] inc rcx jnz NEAR 
$L$sqr4x_sub + mov r10,r9 neg r9 DB 0F3h,0C3h ;repret @@ -1718,13 +1998,9 @@ DB 0x67 push r13 push r14 push r15 - lea rsp,[((-40))+rsp] - movaps XMMWORD[rsp],xmm6 - movaps XMMWORD[16+rsp],xmm7 -DB 0x67 - mov r10d,r9d + shl r9d,3 - shl r10d,3+2 + lea r10,[r9*2+r9] neg r9 mov r8,QWORD[r8] @@ -1734,19 +2010,20 @@ DB 0x67 - lea r11,[((-64))+r9*2+rsp] - sub r11,rsi + + lea r11,[((-320))+r9*2+rsp] + sub r11,rdi and r11,4095 cmp r10,r11 jb NEAR $L$from_sp_alt sub rsp,r11 - lea rsp,[((-64))+r9*2+rsp] + lea rsp,[((-320))+r9*2+rsp] jmp NEAR $L$from_sp_done ALIGN 32 $L$from_sp_alt: - lea r10,[((4096-64))+r9*2] - lea rsp,[((-64))+r9*2+rsp] + lea r10,[((4096-320))+r9*2] + lea rsp,[((-320))+r9*2+rsp] sub r11,r10 mov r10,0 cmovc r11,r10 @@ -1797,7 +2074,8 @@ DB 102,72,15,110,209 DB 0x67 mov rbp,rcx DB 102,73,15,110,218 - call sqr8x_reduction + call __bn_sqr8x_reduction + call __bn_post4x_internal pxor xmm0,xmm0 lea rax,[48+rsp] @@ -1847,55 +2125,171 @@ $L$scatter_epilogue: global bn_gather5 -ALIGN 16 +ALIGN 32 bn_gather5: $L$SEH_begin_bn_gather5: -DB 0x48,0x83,0xec,0x28 -DB 0x0f,0x29,0x34,0x24 -DB 0x0f,0x29,0x7c,0x24,0x10 - mov r11d,r9d - shr r9d,3 - and r11,7 - not r9d - lea rax,[$L$magic_masks] - and r9d,3 - lea r8,[128+r11*8+r8] - movq xmm4,QWORD[r9*8+rax] - movq xmm5,QWORD[8+r9*8+rax] - movq xmm6,QWORD[16+r9*8+rax] - movq xmm7,QWORD[24+r9*8+rax] - jmp NEAR $L$gather -ALIGN 16 -$L$gather: - movq xmm0,QWORD[(((-128)))+r8] - movq xmm1,QWORD[((-64))+r8] - pand xmm0,xmm4 - movq xmm2,QWORD[r8] - pand xmm1,xmm5 - movq xmm3,QWORD[64+r8] - pand xmm2,xmm6 - por xmm0,xmm1 - pand xmm3,xmm7 -DB 0x67,0x67 - por xmm0,xmm2 - lea r8,[256+r8] - por xmm0,xmm3 +DB 0x4c,0x8d,0x14,0x24 +DB 0x48,0x81,0xec,0x08,0x01,0x00,0x00 + lea rax,[$L$inc] + and rsp,-16 + movd xmm5,r9d + movdqa xmm0,XMMWORD[rax] + movdqa xmm1,XMMWORD[16+rax] + lea r11,[128+r8] + lea rax,[128+rsp] + + pshufd xmm5,xmm5,0 + movdqa xmm4,xmm1 + movdqa xmm2,xmm1 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa xmm3,xmm4 + + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[(-128)+rax],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[(-112)+rax],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[(-96)+rax],xmm2 + movdqa xmm2,xmm4 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[(-80)+rax],xmm3 + movdqa xmm3,xmm4 + + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[(-64)+rax],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[(-48)+rax],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[(-32)+rax],xmm2 + movdqa xmm2,xmm4 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[(-16)+rax],xmm3 + movdqa xmm3,xmm4 + + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[rax],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[16+rax],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[32+rax],xmm2 + movdqa xmm2,xmm4 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[48+rax],xmm3 + movdqa xmm3,xmm4 + + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[64+rax],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[80+rax],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[96+rax],xmm2 + movdqa xmm2,xmm4 + movdqa XMMWORD[112+rax],xmm3 + jmp NEAR $L$gather + +ALIGN 32 +$L$gather: + pxor xmm4,xmm4 + pxor xmm5,xmm5 + movdqa xmm0,XMMWORD[((-128))+r11] + movdqa xmm1,XMMWORD[((-112))+r11] + movdqa xmm2,XMMWORD[((-96))+r11] + 
pand xmm0,XMMWORD[((-128))+rax] + movdqa xmm3,XMMWORD[((-80))+r11] + pand xmm1,XMMWORD[((-112))+rax] + por xmm4,xmm0 + pand xmm2,XMMWORD[((-96))+rax] + por xmm5,xmm1 + pand xmm3,XMMWORD[((-80))+rax] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[((-64))+r11] + movdqa xmm1,XMMWORD[((-48))+r11] + movdqa xmm2,XMMWORD[((-32))+r11] + pand xmm0,XMMWORD[((-64))+rax] + movdqa xmm3,XMMWORD[((-16))+r11] + pand xmm1,XMMWORD[((-48))+rax] + por xmm4,xmm0 + pand xmm2,XMMWORD[((-32))+rax] + por xmm5,xmm1 + pand xmm3,XMMWORD[((-16))+rax] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[r11] + movdqa xmm1,XMMWORD[16+r11] + movdqa xmm2,XMMWORD[32+r11] + pand xmm0,XMMWORD[rax] + movdqa xmm3,XMMWORD[48+r11] + pand xmm1,XMMWORD[16+rax] + por xmm4,xmm0 + pand xmm2,XMMWORD[32+rax] + por xmm5,xmm1 + pand xmm3,XMMWORD[48+rax] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[64+r11] + movdqa xmm1,XMMWORD[80+r11] + movdqa xmm2,XMMWORD[96+r11] + pand xmm0,XMMWORD[64+rax] + movdqa xmm3,XMMWORD[112+r11] + pand xmm1,XMMWORD[80+rax] + por xmm4,xmm0 + pand xmm2,XMMWORD[96+rax] + por xmm5,xmm1 + pand xmm3,XMMWORD[112+rax] + por xmm4,xmm2 + por xmm5,xmm3 + por xmm4,xmm5 + lea r11,[256+r11] + pshufd xmm0,xmm4,0x4e + por xmm0,xmm4 movq QWORD[rcx],xmm0 lea rcx,[8+rcx] sub edx,1 jnz NEAR $L$gather - movaps xmm6,XMMWORD[rsp] - movaps xmm7,XMMWORD[16+rsp] - lea rsp,[40+rsp] + + lea rsp,[r10] DB 0F3h,0C3h ;repret $L$SEH_end_bn_gather5: ALIGN 64 -$L$magic_masks: - DD 0,0,0,0,0,0,-1,-1 - DD 0,0,0,0,0,0,0,0 +$L$inc: + DD 0,0,1,1 + DD 2,2,2,2 DB 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 DB 112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115 DB 99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111 @@ -1937,19 +2331,16 @@ mul_handler: lea r10,[$L$mul_epilogue] cmp rbx,r10 - jb NEAR $L$body_40 + ja NEAR $L$body_40 mov r10,QWORD[192+r8] mov rax,QWORD[8+r10*8+rax] + jmp NEAR $L$body_proceed $L$body_40: mov rax,QWORD[40+rax] $L$body_proceed: - - movaps xmm0,XMMWORD[((-88))+rax] - movaps xmm1,XMMWORD[((-72))+rax] - mov rbx,QWORD[((-8))+rax] mov rbp,QWORD[((-16))+rax] mov r12,QWORD[((-24))+rax] @@ -1962,8 +2353,6 @@ $L$body_proceed: mov QWORD[224+r8],r13 mov QWORD[232+r8],r14 mov QWORD[240+r8],r15 - movups XMMWORD[512+r8],xmm0 - movups XMMWORD[528+r8],xmm1 $L$common_seh_tail: mov rdi,QWORD[8+rax] @@ -2049,8 +2438,7 @@ DB 9,0,0,0 DD $L$from_body wrt ..imagebase,$L$from_epilogue wrt ..imagebase ALIGN 8 $L$SEH_info_bn_gather5: -DB 0x01,0x0d,0x05,0x00 -DB 0x0d,0x78,0x01,0x00 -DB 0x08,0x68,0x00,0x00 -DB 0x04,0x42,0x00,0x00 +DB 0x01,0x0b,0x03,0x0a +DB 0x0b,0x01,0x21,0x00 +DB 0x04,0xa3,0x00,0x00 ALIGN 8 diff --git a/packager/third_party/boringssl/win-x86_64/crypto/chacha/chacha-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/chacha/chacha-x86_64.asm new file mode 100644 index 0000000000..0ecbe95682 --- /dev/null +++ b/packager/third_party/boringssl/win-x86_64/crypto/chacha/chacha-x86_64.asm @@ -0,0 +1,1689 @@ +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +section .text code align=64 + + +EXTERN OPENSSL_ia32cap_P + +ALIGN 64 +$L$zero: + DD 0,0,0,0 +$L$one: + DD 1,0,0,0 +$L$inc: + DD 0,1,2,3 +$L$four: + DD 4,4,4,4 +$L$incy: + DD 0,2,4,6,1,3,5,7 +$L$eight: + DD 8,8,8,8,8,8,8,8 +$L$rot16: +DB 0x2,0x3,0x0,0x1,0x6,0x7,0x4,0x5,0xa,0xb,0x8,0x9,0xe,0xf,0xc,0xd +$L$rot24: +DB 0x3,0x0,0x1,0x2,0x7,0x4,0x5,0x6,0xb,0x8,0x9,0xa,0xf,0xc,0xd,0xe +$L$sigma: +DB 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107 +DB 0 +DB 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54 +DB 
95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32 +DB 98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115 +DB 108,46,111,114,103,62,0 +global ChaCha20_ctr32 + +ALIGN 64 +ChaCha20_ctr32: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ChaCha20_ctr32: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + cmp rdx,0 + je NEAR $L$no_data + mov r10,QWORD[((OPENSSL_ia32cap_P+4))] + test r10d,512 + jnz NEAR $L$ChaCha20_ssse3 + + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + sub rsp,64+24 + + + movdqu xmm1,XMMWORD[rcx] + movdqu xmm2,XMMWORD[16+rcx] + movdqu xmm3,XMMWORD[r8] + movdqa xmm4,XMMWORD[$L$one] + + + movdqa XMMWORD[16+rsp],xmm1 + movdqa XMMWORD[32+rsp],xmm2 + movdqa XMMWORD[48+rsp],xmm3 + mov rbp,rdx + jmp NEAR $L$oop_outer + +ALIGN 32 +$L$oop_outer: + mov eax,0x61707865 + mov ebx,0x3320646e + mov ecx,0x79622d32 + mov edx,0x6b206574 + mov r8d,DWORD[16+rsp] + mov r9d,DWORD[20+rsp] + mov r10d,DWORD[24+rsp] + mov r11d,DWORD[28+rsp] + movd r12d,xmm3 + mov r13d,DWORD[52+rsp] + mov r14d,DWORD[56+rsp] + mov r15d,DWORD[60+rsp] + + mov QWORD[((64+0))+rsp],rbp + mov ebp,10 + mov QWORD[((64+8))+rsp],rsi +DB 102,72,15,126,214 + mov QWORD[((64+16))+rsp],rdi + mov rdi,rsi + shr rdi,32 + jmp NEAR $L$oop + +ALIGN 32 +$L$oop: + add eax,r8d + xor r12d,eax + rol r12d,16 + add ebx,r9d + xor r13d,ebx + rol r13d,16 + add esi,r12d + xor r8d,esi + rol r8d,12 + add edi,r13d + xor r9d,edi + rol r9d,12 + add eax,r8d + xor r12d,eax + rol r12d,8 + add ebx,r9d + xor r13d,ebx + rol r13d,8 + add esi,r12d + xor r8d,esi + rol r8d,7 + add edi,r13d + xor r9d,edi + rol r9d,7 + mov DWORD[32+rsp],esi + mov DWORD[36+rsp],edi + mov esi,DWORD[40+rsp] + mov edi,DWORD[44+rsp] + add ecx,r10d + xor r14d,ecx + rol r14d,16 + add edx,r11d + xor r15d,edx + rol r15d,16 + add esi,r14d + xor r10d,esi + rol r10d,12 + add edi,r15d + xor r11d,edi + rol r11d,12 + add ecx,r10d + xor r14d,ecx + rol r14d,8 + add edx,r11d + xor r15d,edx + rol r15d,8 + add esi,r14d + xor r10d,esi + rol r10d,7 + add edi,r15d + xor r11d,edi + rol r11d,7 + add eax,r9d + xor r15d,eax + rol r15d,16 + add ebx,r10d + xor r12d,ebx + rol r12d,16 + add esi,r15d + xor r9d,esi + rol r9d,12 + add edi,r12d + xor r10d,edi + rol r10d,12 + add eax,r9d + xor r15d,eax + rol r15d,8 + add ebx,r10d + xor r12d,ebx + rol r12d,8 + add esi,r15d + xor r9d,esi + rol r9d,7 + add edi,r12d + xor r10d,edi + rol r10d,7 + mov DWORD[40+rsp],esi + mov DWORD[44+rsp],edi + mov esi,DWORD[32+rsp] + mov edi,DWORD[36+rsp] + add ecx,r11d + xor r13d,ecx + rol r13d,16 + add edx,r8d + xor r14d,edx + rol r14d,16 + add esi,r13d + xor r11d,esi + rol r11d,12 + add edi,r14d + xor r8d,edi + rol r8d,12 + add ecx,r11d + xor r13d,ecx + rol r13d,8 + add edx,r8d + xor r14d,edx + rol r14d,8 + add esi,r13d + xor r11d,esi + rol r11d,7 + add edi,r14d + xor r8d,edi + rol r8d,7 + dec ebp + jnz NEAR $L$oop + mov DWORD[36+rsp],edi + mov DWORD[32+rsp],esi + mov rbp,QWORD[64+rsp] + movdqa xmm1,xmm2 + mov rsi,QWORD[((64+8))+rsp] + paddd xmm3,xmm4 + mov rdi,QWORD[((64+16))+rsp] + + add eax,0x61707865 + add ebx,0x3320646e + add ecx,0x79622d32 + add edx,0x6b206574 + add r8d,DWORD[16+rsp] + add r9d,DWORD[20+rsp] + add r10d,DWORD[24+rsp] + add r11d,DWORD[28+rsp] + add r12d,DWORD[48+rsp] + add r13d,DWORD[52+rsp] + add r14d,DWORD[56+rsp] + add r15d,DWORD[60+rsp] + paddd xmm1,XMMWORD[32+rsp] + + cmp rbp,64 + jb NEAR $L$tail + + xor eax,DWORD[rsi] + xor ebx,DWORD[4+rsi] + xor ecx,DWORD[8+rsi] + xor edx,DWORD[12+rsi] + xor r8d,DWORD[16+rsi] + xor 
r9d,DWORD[20+rsi] + xor r10d,DWORD[24+rsi] + xor r11d,DWORD[28+rsi] + movdqu xmm0,XMMWORD[32+rsi] + xor r12d,DWORD[48+rsi] + xor r13d,DWORD[52+rsi] + xor r14d,DWORD[56+rsi] + xor r15d,DWORD[60+rsi] + lea rsi,[64+rsi] + pxor xmm0,xmm1 + + movdqa XMMWORD[32+rsp],xmm2 + movd DWORD[48+rsp],xmm3 + + mov DWORD[rdi],eax + mov DWORD[4+rdi],ebx + mov DWORD[8+rdi],ecx + mov DWORD[12+rdi],edx + mov DWORD[16+rdi],r8d + mov DWORD[20+rdi],r9d + mov DWORD[24+rdi],r10d + mov DWORD[28+rdi],r11d + movdqu XMMWORD[32+rdi],xmm0 + mov DWORD[48+rdi],r12d + mov DWORD[52+rdi],r13d + mov DWORD[56+rdi],r14d + mov DWORD[60+rdi],r15d + lea rdi,[64+rdi] + + sub rbp,64 + jnz NEAR $L$oop_outer + + jmp NEAR $L$done + +ALIGN 16 +$L$tail: + mov DWORD[rsp],eax + mov DWORD[4+rsp],ebx + xor rbx,rbx + mov DWORD[8+rsp],ecx + mov DWORD[12+rsp],edx + mov DWORD[16+rsp],r8d + mov DWORD[20+rsp],r9d + mov DWORD[24+rsp],r10d + mov DWORD[28+rsp],r11d + movdqa XMMWORD[32+rsp],xmm1 + mov DWORD[48+rsp],r12d + mov DWORD[52+rsp],r13d + mov DWORD[56+rsp],r14d + mov DWORD[60+rsp],r15d + +$L$oop_tail: + movzx eax,BYTE[rbx*1+rsi] + movzx edx,BYTE[rbx*1+rsp] + lea rbx,[1+rbx] + xor eax,edx + mov BYTE[((-1))+rbx*1+rdi],al + dec rbp + jnz NEAR $L$oop_tail + +$L$done: + add rsp,64+24 + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx +$L$no_data: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_ChaCha20_ctr32: + +ALIGN 32 +ChaCha20_ssse3: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ChaCha20_ssse3: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + +$L$ChaCha20_ssse3: + cmp rdx,128 + ja NEAR $L$ChaCha20_4x + +$L$do_sse3_after_all: + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + + sub rsp,64+72 + movaps XMMWORD[(64+32)+rsp],xmm6 + movaps XMMWORD[(64+48)+rsp],xmm7 + movdqa xmm0,XMMWORD[$L$sigma] + movdqu xmm1,XMMWORD[rcx] + movdqu xmm2,XMMWORD[16+rcx] + movdqu xmm3,XMMWORD[r8] + movdqa xmm6,XMMWORD[$L$rot16] + movdqa xmm7,XMMWORD[$L$rot24] + + movdqa XMMWORD[rsp],xmm0 + movdqa XMMWORD[16+rsp],xmm1 + movdqa XMMWORD[32+rsp],xmm2 + movdqa XMMWORD[48+rsp],xmm3 + mov ebp,10 + jmp NEAR $L$oop_ssse3 + +ALIGN 32 +$L$oop_outer_ssse3: + movdqa xmm3,XMMWORD[$L$one] + movdqa xmm0,XMMWORD[rsp] + movdqa xmm1,XMMWORD[16+rsp] + movdqa xmm2,XMMWORD[32+rsp] + paddd xmm3,XMMWORD[48+rsp] + mov ebp,10 + movdqa XMMWORD[48+rsp],xmm3 + jmp NEAR $L$oop_ssse3 + +ALIGN 32 +$L$oop_ssse3: + paddd xmm0,xmm1 + pxor xmm3,xmm0 +DB 102,15,56,0,222 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,20 + pslld xmm4,12 + por xmm1,xmm4 + paddd xmm0,xmm1 + pxor xmm3,xmm0 +DB 102,15,56,0,223 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,25 + pslld xmm4,7 + por xmm1,xmm4 + pshufd xmm2,xmm2,78 + pshufd xmm1,xmm1,57 + pshufd xmm3,xmm3,147 + nop + paddd xmm0,xmm1 + pxor xmm3,xmm0 +DB 102,15,56,0,222 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,20 + pslld xmm4,12 + por xmm1,xmm4 + paddd xmm0,xmm1 + pxor xmm3,xmm0 +DB 102,15,56,0,223 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,25 + pslld xmm4,7 + por xmm1,xmm4 + pshufd xmm2,xmm2,78 + pshufd xmm1,xmm1,147 + pshufd xmm3,xmm3,57 + dec ebp + jnz NEAR $L$oop_ssse3 + paddd xmm0,XMMWORD[rsp] + paddd xmm1,XMMWORD[16+rsp] + paddd xmm2,XMMWORD[32+rsp] + paddd xmm3,XMMWORD[48+rsp] + + cmp rdx,64 + jb NEAR $L$tail_ssse3 + + movdqu xmm4,XMMWORD[rsi] + movdqu xmm5,XMMWORD[16+rsi] + pxor xmm0,xmm4 + movdqu xmm4,XMMWORD[32+rsi] + 
pxor xmm1,xmm5 + movdqu xmm5,XMMWORD[48+rsi] + lea rsi,[64+rsi] + pxor xmm2,xmm4 + pxor xmm3,xmm5 + + movdqu XMMWORD[rdi],xmm0 + movdqu XMMWORD[16+rdi],xmm1 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm3 + lea rdi,[64+rdi] + + sub rdx,64 + jnz NEAR $L$oop_outer_ssse3 + + jmp NEAR $L$done_ssse3 + +ALIGN 16 +$L$tail_ssse3: + movdqa XMMWORD[rsp],xmm0 + movdqa XMMWORD[16+rsp],xmm1 + movdqa XMMWORD[32+rsp],xmm2 + movdqa XMMWORD[48+rsp],xmm3 + xor rbx,rbx + +$L$oop_tail_ssse3: + movzx eax,BYTE[rbx*1+rsi] + movzx ecx,BYTE[rbx*1+rsp] + lea rbx,[1+rbx] + xor eax,ecx + mov BYTE[((-1))+rbx*1+rdi],al + dec rdx + jnz NEAR $L$oop_tail_ssse3 + +$L$done_ssse3: + movaps xmm6,XMMWORD[((64+32))+rsp] + movaps xmm7,XMMWORD[((64+48))+rsp] + add rsp,64+72 + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_ChaCha20_ssse3: + +ALIGN 32 +ChaCha20_4x: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ChaCha20_4x: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + +$L$ChaCha20_4x: + mov r11,r10 + shr r10,32 + test r10,32 + jnz NEAR $L$ChaCha20_8x + cmp rdx,192 + ja NEAR $L$proceed4x + + and r11,71303168 + cmp r11,4194304 + je NEAR $L$do_sse3_after_all + +$L$proceed4x: + lea r11,[((-120))+rsp] + sub rsp,0x148+160 + movaps XMMWORD[(-48)+r11],xmm6 + movaps XMMWORD[(-32)+r11],xmm7 + movaps XMMWORD[(-16)+r11],xmm8 + movaps XMMWORD[r11],xmm9 + movaps XMMWORD[16+r11],xmm10 + movaps XMMWORD[32+r11],xmm11 + movaps XMMWORD[48+r11],xmm12 + movaps XMMWORD[64+r11],xmm13 + movaps XMMWORD[80+r11],xmm14 + movaps XMMWORD[96+r11],xmm15 + movdqa xmm11,XMMWORD[$L$sigma] + movdqu xmm15,XMMWORD[rcx] + movdqu xmm7,XMMWORD[16+rcx] + movdqu xmm3,XMMWORD[r8] + lea rcx,[256+rsp] + lea r10,[$L$rot16] + lea r11,[$L$rot24] + + pshufd xmm8,xmm11,0x00 + pshufd xmm9,xmm11,0x55 + movdqa XMMWORD[64+rsp],xmm8 + pshufd xmm10,xmm11,0xaa + movdqa XMMWORD[80+rsp],xmm9 + pshufd xmm11,xmm11,0xff + movdqa XMMWORD[96+rsp],xmm10 + movdqa XMMWORD[112+rsp],xmm11 + + pshufd xmm12,xmm15,0x00 + pshufd xmm13,xmm15,0x55 + movdqa XMMWORD[(128-256)+rcx],xmm12 + pshufd xmm14,xmm15,0xaa + movdqa XMMWORD[(144-256)+rcx],xmm13 + pshufd xmm15,xmm15,0xff + movdqa XMMWORD[(160-256)+rcx],xmm14 + movdqa XMMWORD[(176-256)+rcx],xmm15 + + pshufd xmm4,xmm7,0x00 + pshufd xmm5,xmm7,0x55 + movdqa XMMWORD[(192-256)+rcx],xmm4 + pshufd xmm6,xmm7,0xaa + movdqa XMMWORD[(208-256)+rcx],xmm5 + pshufd xmm7,xmm7,0xff + movdqa XMMWORD[(224-256)+rcx],xmm6 + movdqa XMMWORD[(240-256)+rcx],xmm7 + + pshufd xmm0,xmm3,0x00 + pshufd xmm1,xmm3,0x55 + paddd xmm0,XMMWORD[$L$inc] + pshufd xmm2,xmm3,0xaa + movdqa XMMWORD[(272-256)+rcx],xmm1 + pshufd xmm3,xmm3,0xff + movdqa XMMWORD[(288-256)+rcx],xmm2 + movdqa XMMWORD[(304-256)+rcx],xmm3 + + jmp NEAR $L$oop_enter4x + +ALIGN 32 +$L$oop_outer4x: + movdqa xmm8,XMMWORD[64+rsp] + movdqa xmm9,XMMWORD[80+rsp] + movdqa xmm10,XMMWORD[96+rsp] + movdqa xmm11,XMMWORD[112+rsp] + movdqa xmm12,XMMWORD[((128-256))+rcx] + movdqa xmm13,XMMWORD[((144-256))+rcx] + movdqa xmm14,XMMWORD[((160-256))+rcx] + movdqa xmm15,XMMWORD[((176-256))+rcx] + movdqa xmm4,XMMWORD[((192-256))+rcx] + movdqa xmm5,XMMWORD[((208-256))+rcx] + movdqa xmm6,XMMWORD[((224-256))+rcx] + movdqa xmm7,XMMWORD[((240-256))+rcx] + movdqa xmm0,XMMWORD[((256-256))+rcx] + movdqa xmm1,XMMWORD[((272-256))+rcx] + movdqa xmm2,XMMWORD[((288-256))+rcx] + movdqa xmm3,XMMWORD[((304-256))+rcx] + paddd xmm0,XMMWORD[$L$four] + +$L$oop_enter4x: 
+ movdqa XMMWORD[32+rsp],xmm6 + movdqa XMMWORD[48+rsp],xmm7 + movdqa xmm7,XMMWORD[r10] + mov eax,10 + movdqa XMMWORD[(256-256)+rcx],xmm0 + jmp NEAR $L$oop4x + +ALIGN 32 +$L$oop4x: + paddd xmm8,xmm12 + paddd xmm9,xmm13 + pxor xmm0,xmm8 + pxor xmm1,xmm9 +DB 102,15,56,0,199 +DB 102,15,56,0,207 + paddd xmm4,xmm0 + paddd xmm5,xmm1 + pxor xmm12,xmm4 + pxor xmm13,xmm5 + movdqa xmm6,xmm12 + pslld xmm12,12 + psrld xmm6,20 + movdqa xmm7,xmm13 + pslld xmm13,12 + por xmm12,xmm6 + psrld xmm7,20 + movdqa xmm6,XMMWORD[r11] + por xmm13,xmm7 + paddd xmm8,xmm12 + paddd xmm9,xmm13 + pxor xmm0,xmm8 + pxor xmm1,xmm9 +DB 102,15,56,0,198 +DB 102,15,56,0,206 + paddd xmm4,xmm0 + paddd xmm5,xmm1 + pxor xmm12,xmm4 + pxor xmm13,xmm5 + movdqa xmm7,xmm12 + pslld xmm12,7 + psrld xmm7,25 + movdqa xmm6,xmm13 + pslld xmm13,7 + por xmm12,xmm7 + psrld xmm6,25 + movdqa xmm7,XMMWORD[r10] + por xmm13,xmm6 + movdqa XMMWORD[rsp],xmm4 + movdqa XMMWORD[16+rsp],xmm5 + movdqa xmm4,XMMWORD[32+rsp] + movdqa xmm5,XMMWORD[48+rsp] + paddd xmm10,xmm14 + paddd xmm11,xmm15 + pxor xmm2,xmm10 + pxor xmm3,xmm11 +DB 102,15,56,0,215 +DB 102,15,56,0,223 + paddd xmm4,xmm2 + paddd xmm5,xmm3 + pxor xmm14,xmm4 + pxor xmm15,xmm5 + movdqa xmm6,xmm14 + pslld xmm14,12 + psrld xmm6,20 + movdqa xmm7,xmm15 + pslld xmm15,12 + por xmm14,xmm6 + psrld xmm7,20 + movdqa xmm6,XMMWORD[r11] + por xmm15,xmm7 + paddd xmm10,xmm14 + paddd xmm11,xmm15 + pxor xmm2,xmm10 + pxor xmm3,xmm11 +DB 102,15,56,0,214 +DB 102,15,56,0,222 + paddd xmm4,xmm2 + paddd xmm5,xmm3 + pxor xmm14,xmm4 + pxor xmm15,xmm5 + movdqa xmm7,xmm14 + pslld xmm14,7 + psrld xmm7,25 + movdqa xmm6,xmm15 + pslld xmm15,7 + por xmm14,xmm7 + psrld xmm6,25 + movdqa xmm7,XMMWORD[r10] + por xmm15,xmm6 + paddd xmm8,xmm13 + paddd xmm9,xmm14 + pxor xmm3,xmm8 + pxor xmm0,xmm9 +DB 102,15,56,0,223 +DB 102,15,56,0,199 + paddd xmm4,xmm3 + paddd xmm5,xmm0 + pxor xmm13,xmm4 + pxor xmm14,xmm5 + movdqa xmm6,xmm13 + pslld xmm13,12 + psrld xmm6,20 + movdqa xmm7,xmm14 + pslld xmm14,12 + por xmm13,xmm6 + psrld xmm7,20 + movdqa xmm6,XMMWORD[r11] + por xmm14,xmm7 + paddd xmm8,xmm13 + paddd xmm9,xmm14 + pxor xmm3,xmm8 + pxor xmm0,xmm9 +DB 102,15,56,0,222 +DB 102,15,56,0,198 + paddd xmm4,xmm3 + paddd xmm5,xmm0 + pxor xmm13,xmm4 + pxor xmm14,xmm5 + movdqa xmm7,xmm13 + pslld xmm13,7 + psrld xmm7,25 + movdqa xmm6,xmm14 + pslld xmm14,7 + por xmm13,xmm7 + psrld xmm6,25 + movdqa xmm7,XMMWORD[r10] + por xmm14,xmm6 + movdqa XMMWORD[32+rsp],xmm4 + movdqa XMMWORD[48+rsp],xmm5 + movdqa xmm4,XMMWORD[rsp] + movdqa xmm5,XMMWORD[16+rsp] + paddd xmm10,xmm15 + paddd xmm11,xmm12 + pxor xmm1,xmm10 + pxor xmm2,xmm11 +DB 102,15,56,0,207 +DB 102,15,56,0,215 + paddd xmm4,xmm1 + paddd xmm5,xmm2 + pxor xmm15,xmm4 + pxor xmm12,xmm5 + movdqa xmm6,xmm15 + pslld xmm15,12 + psrld xmm6,20 + movdqa xmm7,xmm12 + pslld xmm12,12 + por xmm15,xmm6 + psrld xmm7,20 + movdqa xmm6,XMMWORD[r11] + por xmm12,xmm7 + paddd xmm10,xmm15 + paddd xmm11,xmm12 + pxor xmm1,xmm10 + pxor xmm2,xmm11 +DB 102,15,56,0,206 +DB 102,15,56,0,214 + paddd xmm4,xmm1 + paddd xmm5,xmm2 + pxor xmm15,xmm4 + pxor xmm12,xmm5 + movdqa xmm7,xmm15 + pslld xmm15,7 + psrld xmm7,25 + movdqa xmm6,xmm12 + pslld xmm12,7 + por xmm15,xmm7 + psrld xmm6,25 + movdqa xmm7,XMMWORD[r10] + por xmm12,xmm6 + dec eax + jnz NEAR $L$oop4x + + paddd xmm8,XMMWORD[64+rsp] + paddd xmm9,XMMWORD[80+rsp] + paddd xmm10,XMMWORD[96+rsp] + paddd xmm11,XMMWORD[112+rsp] + + movdqa xmm6,xmm8 + punpckldq xmm8,xmm9 + movdqa xmm7,xmm10 + punpckldq xmm10,xmm11 + punpckhdq xmm6,xmm9 + punpckhdq xmm7,xmm11 + movdqa xmm9,xmm8 + punpcklqdq 
xmm8,xmm10 + movdqa xmm11,xmm6 + punpcklqdq xmm6,xmm7 + punpckhqdq xmm9,xmm10 + punpckhqdq xmm11,xmm7 + paddd xmm12,XMMWORD[((128-256))+rcx] + paddd xmm13,XMMWORD[((144-256))+rcx] + paddd xmm14,XMMWORD[((160-256))+rcx] + paddd xmm15,XMMWORD[((176-256))+rcx] + + movdqa XMMWORD[rsp],xmm8 + movdqa XMMWORD[16+rsp],xmm9 + movdqa xmm8,XMMWORD[32+rsp] + movdqa xmm9,XMMWORD[48+rsp] + + movdqa xmm10,xmm12 + punpckldq xmm12,xmm13 + movdqa xmm7,xmm14 + punpckldq xmm14,xmm15 + punpckhdq xmm10,xmm13 + punpckhdq xmm7,xmm15 + movdqa xmm13,xmm12 + punpcklqdq xmm12,xmm14 + movdqa xmm15,xmm10 + punpcklqdq xmm10,xmm7 + punpckhqdq xmm13,xmm14 + punpckhqdq xmm15,xmm7 + paddd xmm4,XMMWORD[((192-256))+rcx] + paddd xmm5,XMMWORD[((208-256))+rcx] + paddd xmm8,XMMWORD[((224-256))+rcx] + paddd xmm9,XMMWORD[((240-256))+rcx] + + movdqa XMMWORD[32+rsp],xmm6 + movdqa XMMWORD[48+rsp],xmm11 + + movdqa xmm14,xmm4 + punpckldq xmm4,xmm5 + movdqa xmm7,xmm8 + punpckldq xmm8,xmm9 + punpckhdq xmm14,xmm5 + punpckhdq xmm7,xmm9 + movdqa xmm5,xmm4 + punpcklqdq xmm4,xmm8 + movdqa xmm9,xmm14 + punpcklqdq xmm14,xmm7 + punpckhqdq xmm5,xmm8 + punpckhqdq xmm9,xmm7 + paddd xmm0,XMMWORD[((256-256))+rcx] + paddd xmm1,XMMWORD[((272-256))+rcx] + paddd xmm2,XMMWORD[((288-256))+rcx] + paddd xmm3,XMMWORD[((304-256))+rcx] + + movdqa xmm8,xmm0 + punpckldq xmm0,xmm1 + movdqa xmm7,xmm2 + punpckldq xmm2,xmm3 + punpckhdq xmm8,xmm1 + punpckhdq xmm7,xmm3 + movdqa xmm1,xmm0 + punpcklqdq xmm0,xmm2 + movdqa xmm3,xmm8 + punpcklqdq xmm8,xmm7 + punpckhqdq xmm1,xmm2 + punpckhqdq xmm3,xmm7 + cmp rdx,64*4 + jb NEAR $L$tail4x + + movdqu xmm6,XMMWORD[rsi] + movdqu xmm11,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[rsp] + pxor xmm11,xmm12 + pxor xmm2,xmm4 + pxor xmm7,xmm0 + + movdqu XMMWORD[rdi],xmm6 + movdqu xmm6,XMMWORD[64+rsi] + movdqu XMMWORD[16+rdi],xmm11 + movdqu xmm11,XMMWORD[80+rsi] + movdqu XMMWORD[32+rdi],xmm2 + movdqu xmm2,XMMWORD[96+rsi] + movdqu XMMWORD[48+rdi],xmm7 + movdqu xmm7,XMMWORD[112+rsi] + lea rsi,[128+rsi] + pxor xmm6,XMMWORD[16+rsp] + pxor xmm11,xmm13 + pxor xmm2,xmm5 + pxor xmm7,xmm1 + + movdqu XMMWORD[64+rdi],xmm6 + movdqu xmm6,XMMWORD[rsi] + movdqu XMMWORD[80+rdi],xmm11 + movdqu xmm11,XMMWORD[16+rsi] + movdqu XMMWORD[96+rdi],xmm2 + movdqu xmm2,XMMWORD[32+rsi] + movdqu XMMWORD[112+rdi],xmm7 + lea rdi,[128+rdi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[32+rsp] + pxor xmm11,xmm10 + pxor xmm2,xmm14 + pxor xmm7,xmm8 + + movdqu XMMWORD[rdi],xmm6 + movdqu xmm6,XMMWORD[64+rsi] + movdqu XMMWORD[16+rdi],xmm11 + movdqu xmm11,XMMWORD[80+rsi] + movdqu XMMWORD[32+rdi],xmm2 + movdqu xmm2,XMMWORD[96+rsi] + movdqu XMMWORD[48+rdi],xmm7 + movdqu xmm7,XMMWORD[112+rsi] + lea rsi,[128+rsi] + pxor xmm6,XMMWORD[48+rsp] + pxor xmm11,xmm15 + pxor xmm2,xmm9 + pxor xmm7,xmm3 + movdqu XMMWORD[64+rdi],xmm6 + movdqu XMMWORD[80+rdi],xmm11 + movdqu XMMWORD[96+rdi],xmm2 + movdqu XMMWORD[112+rdi],xmm7 + lea rdi,[128+rdi] + + sub rdx,64*4 + jnz NEAR $L$oop_outer4x + + jmp NEAR $L$done4x + +$L$tail4x: + cmp rdx,192 + jae NEAR $L$192_or_more4x + cmp rdx,128 + jae NEAR $L$128_or_more4x + cmp rdx,64 + jae NEAR $L$64_or_more4x + + + xor r10,r10 + + movdqa XMMWORD[16+rsp],xmm12 + movdqa XMMWORD[32+rsp],xmm4 + movdqa XMMWORD[48+rsp],xmm0 + jmp NEAR $L$oop_tail4x + +ALIGN 32 +$L$64_or_more4x: + movdqu xmm6,XMMWORD[rsi] + movdqu xmm11,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[rsp] + pxor xmm11,xmm12 + pxor xmm2,xmm4 + pxor xmm7,xmm0 + movdqu XMMWORD[rdi],xmm6 + movdqu 
XMMWORD[16+rdi],xmm11 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm7 + je NEAR $L$done4x + + movdqa xmm6,XMMWORD[16+rsp] + lea rsi,[64+rsi] + xor r10,r10 + movdqa XMMWORD[rsp],xmm6 + movdqa XMMWORD[16+rsp],xmm13 + lea rdi,[64+rdi] + movdqa XMMWORD[32+rsp],xmm5 + sub rdx,64 + movdqa XMMWORD[48+rsp],xmm1 + jmp NEAR $L$oop_tail4x + +ALIGN 32 +$L$128_or_more4x: + movdqu xmm6,XMMWORD[rsi] + movdqu xmm11,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[rsp] + pxor xmm11,xmm12 + pxor xmm2,xmm4 + pxor xmm7,xmm0 + + movdqu XMMWORD[rdi],xmm6 + movdqu xmm6,XMMWORD[64+rsi] + movdqu XMMWORD[16+rdi],xmm11 + movdqu xmm11,XMMWORD[80+rsi] + movdqu XMMWORD[32+rdi],xmm2 + movdqu xmm2,XMMWORD[96+rsi] + movdqu XMMWORD[48+rdi],xmm7 + movdqu xmm7,XMMWORD[112+rsi] + pxor xmm6,XMMWORD[16+rsp] + pxor xmm11,xmm13 + pxor xmm2,xmm5 + pxor xmm7,xmm1 + movdqu XMMWORD[64+rdi],xmm6 + movdqu XMMWORD[80+rdi],xmm11 + movdqu XMMWORD[96+rdi],xmm2 + movdqu XMMWORD[112+rdi],xmm7 + je NEAR $L$done4x + + movdqa xmm6,XMMWORD[32+rsp] + lea rsi,[128+rsi] + xor r10,r10 + movdqa XMMWORD[rsp],xmm6 + movdqa XMMWORD[16+rsp],xmm10 + lea rdi,[128+rdi] + movdqa XMMWORD[32+rsp],xmm14 + sub rdx,128 + movdqa XMMWORD[48+rsp],xmm8 + jmp NEAR $L$oop_tail4x + +ALIGN 32 +$L$192_or_more4x: + movdqu xmm6,XMMWORD[rsi] + movdqu xmm11,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[rsp] + pxor xmm11,xmm12 + pxor xmm2,xmm4 + pxor xmm7,xmm0 + + movdqu XMMWORD[rdi],xmm6 + movdqu xmm6,XMMWORD[64+rsi] + movdqu XMMWORD[16+rdi],xmm11 + movdqu xmm11,XMMWORD[80+rsi] + movdqu XMMWORD[32+rdi],xmm2 + movdqu xmm2,XMMWORD[96+rsi] + movdqu XMMWORD[48+rdi],xmm7 + movdqu xmm7,XMMWORD[112+rsi] + lea rsi,[128+rsi] + pxor xmm6,XMMWORD[16+rsp] + pxor xmm11,xmm13 + pxor xmm2,xmm5 + pxor xmm7,xmm1 + + movdqu XMMWORD[64+rdi],xmm6 + movdqu xmm6,XMMWORD[rsi] + movdqu XMMWORD[80+rdi],xmm11 + movdqu xmm11,XMMWORD[16+rsi] + movdqu XMMWORD[96+rdi],xmm2 + movdqu xmm2,XMMWORD[32+rsi] + movdqu XMMWORD[112+rdi],xmm7 + lea rdi,[128+rdi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[32+rsp] + pxor xmm11,xmm10 + pxor xmm2,xmm14 + pxor xmm7,xmm8 + movdqu XMMWORD[rdi],xmm6 + movdqu XMMWORD[16+rdi],xmm11 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm7 + je NEAR $L$done4x + + movdqa xmm6,XMMWORD[48+rsp] + lea rsi,[64+rsi] + xor r10,r10 + movdqa XMMWORD[rsp],xmm6 + movdqa XMMWORD[16+rsp],xmm15 + lea rdi,[64+rdi] + movdqa XMMWORD[32+rsp],xmm9 + sub rdx,192 + movdqa XMMWORD[48+rsp],xmm3 + +$L$oop_tail4x: + movzx eax,BYTE[r10*1+rsi] + movzx ecx,BYTE[r10*1+rsp] + lea r10,[1+r10] + xor eax,ecx + mov BYTE[((-1))+r10*1+rdi],al + dec rdx + jnz NEAR $L$oop_tail4x + +$L$done4x: + lea r11,[((320+48))+rsp] + movaps xmm6,XMMWORD[((-48))+r11] + movaps xmm7,XMMWORD[((-32))+r11] + movaps xmm8,XMMWORD[((-16))+r11] + movaps xmm9,XMMWORD[r11] + movaps xmm10,XMMWORD[16+r11] + movaps xmm11,XMMWORD[32+r11] + movaps xmm12,XMMWORD[48+r11] + movaps xmm13,XMMWORD[64+r11] + movaps xmm14,XMMWORD[80+r11] + movaps xmm15,XMMWORD[96+r11] + add rsp,0x148+160 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_ChaCha20_4x: + +ALIGN 32 +ChaCha20_8x: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ChaCha20_8x: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + +$L$ChaCha20_8x: + mov r10,rsp + sub rsp,0x280+176 + and rsp,-32 + lea r11,[((656+48))+rsp] + movaps XMMWORD[(-48)+r11],xmm6 + movaps 
XMMWORD[(-32)+r11],xmm7 + movaps XMMWORD[(-16)+r11],xmm8 + movaps XMMWORD[r11],xmm9 + movaps XMMWORD[16+r11],xmm10 + movaps XMMWORD[32+r11],xmm11 + movaps XMMWORD[48+r11],xmm12 + movaps XMMWORD[64+r11],xmm13 + movaps XMMWORD[80+r11],xmm14 + movaps XMMWORD[96+r11],xmm15 + vzeroupper + mov QWORD[640+rsp],r10 + + + + + + + + + + + vbroadcasti128 ymm11,YMMWORD[$L$sigma] + vbroadcasti128 ymm3,YMMWORD[rcx] + vbroadcasti128 ymm15,YMMWORD[16+rcx] + vbroadcasti128 ymm7,YMMWORD[r8] + lea rcx,[256+rsp] + lea rax,[512+rsp] + lea r10,[$L$rot16] + lea r11,[$L$rot24] + + vpshufd ymm8,ymm11,0x00 + vpshufd ymm9,ymm11,0x55 + vmovdqa YMMWORD[(128-256)+rcx],ymm8 + vpshufd ymm10,ymm11,0xaa + vmovdqa YMMWORD[(160-256)+rcx],ymm9 + vpshufd ymm11,ymm11,0xff + vmovdqa YMMWORD[(192-256)+rcx],ymm10 + vmovdqa YMMWORD[(224-256)+rcx],ymm11 + + vpshufd ymm0,ymm3,0x00 + vpshufd ymm1,ymm3,0x55 + vmovdqa YMMWORD[(256-256)+rcx],ymm0 + vpshufd ymm2,ymm3,0xaa + vmovdqa YMMWORD[(288-256)+rcx],ymm1 + vpshufd ymm3,ymm3,0xff + vmovdqa YMMWORD[(320-256)+rcx],ymm2 + vmovdqa YMMWORD[(352-256)+rcx],ymm3 + + vpshufd ymm12,ymm15,0x00 + vpshufd ymm13,ymm15,0x55 + vmovdqa YMMWORD[(384-512)+rax],ymm12 + vpshufd ymm14,ymm15,0xaa + vmovdqa YMMWORD[(416-512)+rax],ymm13 + vpshufd ymm15,ymm15,0xff + vmovdqa YMMWORD[(448-512)+rax],ymm14 + vmovdqa YMMWORD[(480-512)+rax],ymm15 + + vpshufd ymm4,ymm7,0x00 + vpshufd ymm5,ymm7,0x55 + vpaddd ymm4,ymm4,YMMWORD[$L$incy] + vpshufd ymm6,ymm7,0xaa + vmovdqa YMMWORD[(544-512)+rax],ymm5 + vpshufd ymm7,ymm7,0xff + vmovdqa YMMWORD[(576-512)+rax],ymm6 + vmovdqa YMMWORD[(608-512)+rax],ymm7 + + jmp NEAR $L$oop_enter8x + +ALIGN 32 +$L$oop_outer8x: + vmovdqa ymm8,YMMWORD[((128-256))+rcx] + vmovdqa ymm9,YMMWORD[((160-256))+rcx] + vmovdqa ymm10,YMMWORD[((192-256))+rcx] + vmovdqa ymm11,YMMWORD[((224-256))+rcx] + vmovdqa ymm0,YMMWORD[((256-256))+rcx] + vmovdqa ymm1,YMMWORD[((288-256))+rcx] + vmovdqa ymm2,YMMWORD[((320-256))+rcx] + vmovdqa ymm3,YMMWORD[((352-256))+rcx] + vmovdqa ymm12,YMMWORD[((384-512))+rax] + vmovdqa ymm13,YMMWORD[((416-512))+rax] + vmovdqa ymm14,YMMWORD[((448-512))+rax] + vmovdqa ymm15,YMMWORD[((480-512))+rax] + vmovdqa ymm4,YMMWORD[((512-512))+rax] + vmovdqa ymm5,YMMWORD[((544-512))+rax] + vmovdqa ymm6,YMMWORD[((576-512))+rax] + vmovdqa ymm7,YMMWORD[((608-512))+rax] + vpaddd ymm4,ymm4,YMMWORD[$L$eight] + +$L$oop_enter8x: + vmovdqa YMMWORD[64+rsp],ymm14 + vmovdqa YMMWORD[96+rsp],ymm15 + vbroadcasti128 ymm15,YMMWORD[r10] + vmovdqa YMMWORD[(512-512)+rax],ymm4 + mov eax,10 + jmp NEAR $L$oop8x + +ALIGN 32 +$L$oop8x: + vpaddd ymm8,ymm8,ymm0 + vpxor ymm4,ymm8,ymm4 + vpshufb ymm4,ymm4,ymm15 + vpaddd ymm9,ymm9,ymm1 + vpxor ymm5,ymm9,ymm5 + vpshufb ymm5,ymm5,ymm15 + vpaddd ymm12,ymm12,ymm4 + vpxor ymm0,ymm12,ymm0 + vpslld ymm14,ymm0,12 + vpsrld ymm0,ymm0,20 + vpor ymm0,ymm14,ymm0 + vbroadcasti128 ymm14,YMMWORD[r11] + vpaddd ymm13,ymm13,ymm5 + vpxor ymm1,ymm13,ymm1 + vpslld ymm15,ymm1,12 + vpsrld ymm1,ymm1,20 + vpor ymm1,ymm15,ymm1 + vpaddd ymm8,ymm8,ymm0 + vpxor ymm4,ymm8,ymm4 + vpshufb ymm4,ymm4,ymm14 + vpaddd ymm9,ymm9,ymm1 + vpxor ymm5,ymm9,ymm5 + vpshufb ymm5,ymm5,ymm14 + vpaddd ymm12,ymm12,ymm4 + vpxor ymm0,ymm12,ymm0 + vpslld ymm15,ymm0,7 + vpsrld ymm0,ymm0,25 + vpor ymm0,ymm15,ymm0 + vbroadcasti128 ymm15,YMMWORD[r10] + vpaddd ymm13,ymm13,ymm5 + vpxor ymm1,ymm13,ymm1 + vpslld ymm14,ymm1,7 + vpsrld ymm1,ymm1,25 + vpor ymm1,ymm14,ymm1 + vmovdqa YMMWORD[rsp],ymm12 + vmovdqa YMMWORD[32+rsp],ymm13 + vmovdqa ymm12,YMMWORD[64+rsp] + vmovdqa ymm13,YMMWORD[96+rsp] + vpaddd ymm10,ymm10,ymm2 + vpxor ymm6,ymm10,ymm6 
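+; The paddd/pxor/pshufb and vpaddd/vpxor/vpshufb sequences in these loops
+; implement the ChaCha20 quarter round, one 32-bit lane per block:
+;   a += b; d ^= a; d = rotl32(d,16);
+;   c += d; b ^= c; b = rotl32(b,12);
+;   a += b; d ^= a; d = rotl32(d,8);
+;   c += d; b ^= c; b = rotl32(b,7);
+; The 16- and 8-bit rotations are byte shuffles against the $L$rot16/$L$rot24
+; masks (pointers kept in r10/r11); the 12- and 7-bit rotations fall back to
+; a shift/shift/or pair.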
+ vpshufb ymm6,ymm6,ymm15 + vpaddd ymm11,ymm11,ymm3 + vpxor ymm7,ymm11,ymm7 + vpshufb ymm7,ymm7,ymm15 + vpaddd ymm12,ymm12,ymm6 + vpxor ymm2,ymm12,ymm2 + vpslld ymm14,ymm2,12 + vpsrld ymm2,ymm2,20 + vpor ymm2,ymm14,ymm2 + vbroadcasti128 ymm14,YMMWORD[r11] + vpaddd ymm13,ymm13,ymm7 + vpxor ymm3,ymm13,ymm3 + vpslld ymm15,ymm3,12 + vpsrld ymm3,ymm3,20 + vpor ymm3,ymm15,ymm3 + vpaddd ymm10,ymm10,ymm2 + vpxor ymm6,ymm10,ymm6 + vpshufb ymm6,ymm6,ymm14 + vpaddd ymm11,ymm11,ymm3 + vpxor ymm7,ymm11,ymm7 + vpshufb ymm7,ymm7,ymm14 + vpaddd ymm12,ymm12,ymm6 + vpxor ymm2,ymm12,ymm2 + vpslld ymm15,ymm2,7 + vpsrld ymm2,ymm2,25 + vpor ymm2,ymm15,ymm2 + vbroadcasti128 ymm15,YMMWORD[r10] + vpaddd ymm13,ymm13,ymm7 + vpxor ymm3,ymm13,ymm3 + vpslld ymm14,ymm3,7 + vpsrld ymm3,ymm3,25 + vpor ymm3,ymm14,ymm3 + vpaddd ymm8,ymm8,ymm1 + vpxor ymm7,ymm8,ymm7 + vpshufb ymm7,ymm7,ymm15 + vpaddd ymm9,ymm9,ymm2 + vpxor ymm4,ymm9,ymm4 + vpshufb ymm4,ymm4,ymm15 + vpaddd ymm12,ymm12,ymm7 + vpxor ymm1,ymm12,ymm1 + vpslld ymm14,ymm1,12 + vpsrld ymm1,ymm1,20 + vpor ymm1,ymm14,ymm1 + vbroadcasti128 ymm14,YMMWORD[r11] + vpaddd ymm13,ymm13,ymm4 + vpxor ymm2,ymm13,ymm2 + vpslld ymm15,ymm2,12 + vpsrld ymm2,ymm2,20 + vpor ymm2,ymm15,ymm2 + vpaddd ymm8,ymm8,ymm1 + vpxor ymm7,ymm8,ymm7 + vpshufb ymm7,ymm7,ymm14 + vpaddd ymm9,ymm9,ymm2 + vpxor ymm4,ymm9,ymm4 + vpshufb ymm4,ymm4,ymm14 + vpaddd ymm12,ymm12,ymm7 + vpxor ymm1,ymm12,ymm1 + vpslld ymm15,ymm1,7 + vpsrld ymm1,ymm1,25 + vpor ymm1,ymm15,ymm1 + vbroadcasti128 ymm15,YMMWORD[r10] + vpaddd ymm13,ymm13,ymm4 + vpxor ymm2,ymm13,ymm2 + vpslld ymm14,ymm2,7 + vpsrld ymm2,ymm2,25 + vpor ymm2,ymm14,ymm2 + vmovdqa YMMWORD[64+rsp],ymm12 + vmovdqa YMMWORD[96+rsp],ymm13 + vmovdqa ymm12,YMMWORD[rsp] + vmovdqa ymm13,YMMWORD[32+rsp] + vpaddd ymm10,ymm10,ymm3 + vpxor ymm5,ymm10,ymm5 + vpshufb ymm5,ymm5,ymm15 + vpaddd ymm11,ymm11,ymm0 + vpxor ymm6,ymm11,ymm6 + vpshufb ymm6,ymm6,ymm15 + vpaddd ymm12,ymm12,ymm5 + vpxor ymm3,ymm12,ymm3 + vpslld ymm14,ymm3,12 + vpsrld ymm3,ymm3,20 + vpor ymm3,ymm14,ymm3 + vbroadcasti128 ymm14,YMMWORD[r11] + vpaddd ymm13,ymm13,ymm6 + vpxor ymm0,ymm13,ymm0 + vpslld ymm15,ymm0,12 + vpsrld ymm0,ymm0,20 + vpor ymm0,ymm15,ymm0 + vpaddd ymm10,ymm10,ymm3 + vpxor ymm5,ymm10,ymm5 + vpshufb ymm5,ymm5,ymm14 + vpaddd ymm11,ymm11,ymm0 + vpxor ymm6,ymm11,ymm6 + vpshufb ymm6,ymm6,ymm14 + vpaddd ymm12,ymm12,ymm5 + vpxor ymm3,ymm12,ymm3 + vpslld ymm15,ymm3,7 + vpsrld ymm3,ymm3,25 + vpor ymm3,ymm15,ymm3 + vbroadcasti128 ymm15,YMMWORD[r10] + vpaddd ymm13,ymm13,ymm6 + vpxor ymm0,ymm13,ymm0 + vpslld ymm14,ymm0,7 + vpsrld ymm0,ymm0,25 + vpor ymm0,ymm14,ymm0 + dec eax + jnz NEAR $L$oop8x + + lea rax,[512+rsp] + vpaddd ymm8,ymm8,YMMWORD[((128-256))+rcx] + vpaddd ymm9,ymm9,YMMWORD[((160-256))+rcx] + vpaddd ymm10,ymm10,YMMWORD[((192-256))+rcx] + vpaddd ymm11,ymm11,YMMWORD[((224-256))+rcx] + + vpunpckldq ymm14,ymm8,ymm9 + vpunpckldq ymm15,ymm10,ymm11 + vpunpckhdq ymm8,ymm8,ymm9 + vpunpckhdq ymm10,ymm10,ymm11 + vpunpcklqdq ymm9,ymm14,ymm15 + vpunpckhqdq ymm14,ymm14,ymm15 + vpunpcklqdq ymm11,ymm8,ymm10 + vpunpckhqdq ymm8,ymm8,ymm10 + vpaddd ymm0,ymm0,YMMWORD[((256-256))+rcx] + vpaddd ymm1,ymm1,YMMWORD[((288-256))+rcx] + vpaddd ymm2,ymm2,YMMWORD[((320-256))+rcx] + vpaddd ymm3,ymm3,YMMWORD[((352-256))+rcx] + + vpunpckldq ymm10,ymm0,ymm1 + vpunpckldq ymm15,ymm2,ymm3 + vpunpckhdq ymm0,ymm0,ymm1 + vpunpckhdq ymm2,ymm2,ymm3 + vpunpcklqdq ymm1,ymm10,ymm15 + vpunpckhqdq ymm10,ymm10,ymm15 + vpunpcklqdq ymm3,ymm0,ymm2 + vpunpckhqdq ymm0,ymm0,ymm2 + vperm2i128 ymm15,ymm9,ymm1,0x20 + vperm2i128 
ymm1,ymm9,ymm1,0x31 + vperm2i128 ymm9,ymm14,ymm10,0x20 + vperm2i128 ymm10,ymm14,ymm10,0x31 + vperm2i128 ymm14,ymm11,ymm3,0x20 + vperm2i128 ymm3,ymm11,ymm3,0x31 + vperm2i128 ymm11,ymm8,ymm0,0x20 + vperm2i128 ymm0,ymm8,ymm0,0x31 + vmovdqa YMMWORD[rsp],ymm15 + vmovdqa YMMWORD[32+rsp],ymm9 + vmovdqa ymm15,YMMWORD[64+rsp] + vmovdqa ymm9,YMMWORD[96+rsp] + + vpaddd ymm12,ymm12,YMMWORD[((384-512))+rax] + vpaddd ymm13,ymm13,YMMWORD[((416-512))+rax] + vpaddd ymm15,ymm15,YMMWORD[((448-512))+rax] + vpaddd ymm9,ymm9,YMMWORD[((480-512))+rax] + + vpunpckldq ymm2,ymm12,ymm13 + vpunpckldq ymm8,ymm15,ymm9 + vpunpckhdq ymm12,ymm12,ymm13 + vpunpckhdq ymm15,ymm15,ymm9 + vpunpcklqdq ymm13,ymm2,ymm8 + vpunpckhqdq ymm2,ymm2,ymm8 + vpunpcklqdq ymm9,ymm12,ymm15 + vpunpckhqdq ymm12,ymm12,ymm15 + vpaddd ymm4,ymm4,YMMWORD[((512-512))+rax] + vpaddd ymm5,ymm5,YMMWORD[((544-512))+rax] + vpaddd ymm6,ymm6,YMMWORD[((576-512))+rax] + vpaddd ymm7,ymm7,YMMWORD[((608-512))+rax] + + vpunpckldq ymm15,ymm4,ymm5 + vpunpckldq ymm8,ymm6,ymm7 + vpunpckhdq ymm4,ymm4,ymm5 + vpunpckhdq ymm6,ymm6,ymm7 + vpunpcklqdq ymm5,ymm15,ymm8 + vpunpckhqdq ymm15,ymm15,ymm8 + vpunpcklqdq ymm7,ymm4,ymm6 + vpunpckhqdq ymm4,ymm4,ymm6 + vperm2i128 ymm8,ymm13,ymm5,0x20 + vperm2i128 ymm5,ymm13,ymm5,0x31 + vperm2i128 ymm13,ymm2,ymm15,0x20 + vperm2i128 ymm15,ymm2,ymm15,0x31 + vperm2i128 ymm2,ymm9,ymm7,0x20 + vperm2i128 ymm7,ymm9,ymm7,0x31 + vperm2i128 ymm9,ymm12,ymm4,0x20 + vperm2i128 ymm4,ymm12,ymm4,0x31 + vmovdqa ymm6,YMMWORD[rsp] + vmovdqa ymm12,YMMWORD[32+rsp] + + cmp rdx,64*8 + jb NEAR $L$tail8x + + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + lea rsi,[128+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + lea rdi,[128+rdi] + + vpxor ymm12,ymm12,YMMWORD[rsi] + vpxor ymm13,ymm13,YMMWORD[32+rsi] + vpxor ymm10,ymm10,YMMWORD[64+rsi] + vpxor ymm15,ymm15,YMMWORD[96+rsi] + lea rsi,[128+rsi] + vmovdqu YMMWORD[rdi],ymm12 + vmovdqu YMMWORD[32+rdi],ymm13 + vmovdqu YMMWORD[64+rdi],ymm10 + vmovdqu YMMWORD[96+rdi],ymm15 + lea rdi,[128+rdi] + + vpxor ymm14,ymm14,YMMWORD[rsi] + vpxor ymm2,ymm2,YMMWORD[32+rsi] + vpxor ymm3,ymm3,YMMWORD[64+rsi] + vpxor ymm7,ymm7,YMMWORD[96+rsi] + lea rsi,[128+rsi] + vmovdqu YMMWORD[rdi],ymm14 + vmovdqu YMMWORD[32+rdi],ymm2 + vmovdqu YMMWORD[64+rdi],ymm3 + vmovdqu YMMWORD[96+rdi],ymm7 + lea rdi,[128+rdi] + + vpxor ymm11,ymm11,YMMWORD[rsi] + vpxor ymm9,ymm9,YMMWORD[32+rsi] + vpxor ymm0,ymm0,YMMWORD[64+rsi] + vpxor ymm4,ymm4,YMMWORD[96+rsi] + lea rsi,[128+rsi] + vmovdqu YMMWORD[rdi],ymm11 + vmovdqu YMMWORD[32+rdi],ymm9 + vmovdqu YMMWORD[64+rdi],ymm0 + vmovdqu YMMWORD[96+rdi],ymm4 + lea rdi,[128+rdi] + + sub rdx,64*8 + jnz NEAR $L$oop_outer8x + + jmp NEAR $L$done8x + +$L$tail8x: + cmp rdx,448 + jae NEAR $L$448_or_more8x + cmp rdx,384 + jae NEAR $L$384_or_more8x + cmp rdx,320 + jae NEAR $L$320_or_more8x + cmp rdx,256 + jae NEAR $L$256_or_more8x + cmp rdx,192 + jae NEAR $L$192_or_more8x + cmp rdx,128 + jae NEAR $L$128_or_more8x + cmp rdx,64 + jae NEAR $L$64_or_more8x + + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm6 + vmovdqa YMMWORD[32+rsp],ymm8 + jmp NEAR $L$oop_tail8x + +ALIGN 32 +$L$64_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + je NEAR $L$done8x + + lea rsi,[64+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm1 + lea rdi,[64+rdi] + sub rdx,64 + vmovdqa YMMWORD[32+rsp],ymm5 + jmp NEAR 
$L$oop_tail8x + +ALIGN 32 +$L$128_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + je NEAR $L$done8x + + lea rsi,[128+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm12 + lea rdi,[128+rdi] + sub rdx,128 + vmovdqa YMMWORD[32+rsp],ymm13 + jmp NEAR $L$oop_tail8x + +ALIGN 32 +$L$192_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + vpxor ymm12,ymm12,YMMWORD[128+rsi] + vpxor ymm13,ymm13,YMMWORD[160+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + vmovdqu YMMWORD[128+rdi],ymm12 + vmovdqu YMMWORD[160+rdi],ymm13 + je NEAR $L$done8x + + lea rsi,[192+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm10 + lea rdi,[192+rdi] + sub rdx,192 + vmovdqa YMMWORD[32+rsp],ymm15 + jmp NEAR $L$oop_tail8x + +ALIGN 32 +$L$256_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + vpxor ymm12,ymm12,YMMWORD[128+rsi] + vpxor ymm13,ymm13,YMMWORD[160+rsi] + vpxor ymm10,ymm10,YMMWORD[192+rsi] + vpxor ymm15,ymm15,YMMWORD[224+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + vmovdqu YMMWORD[128+rdi],ymm12 + vmovdqu YMMWORD[160+rdi],ymm13 + vmovdqu YMMWORD[192+rdi],ymm10 + vmovdqu YMMWORD[224+rdi],ymm15 + je NEAR $L$done8x + + lea rsi,[256+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm14 + lea rdi,[256+rdi] + sub rdx,256 + vmovdqa YMMWORD[32+rsp],ymm2 + jmp NEAR $L$oop_tail8x + +ALIGN 32 +$L$320_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + vpxor ymm12,ymm12,YMMWORD[128+rsi] + vpxor ymm13,ymm13,YMMWORD[160+rsi] + vpxor ymm10,ymm10,YMMWORD[192+rsi] + vpxor ymm15,ymm15,YMMWORD[224+rsi] + vpxor ymm14,ymm14,YMMWORD[256+rsi] + vpxor ymm2,ymm2,YMMWORD[288+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + vmovdqu YMMWORD[128+rdi],ymm12 + vmovdqu YMMWORD[160+rdi],ymm13 + vmovdqu YMMWORD[192+rdi],ymm10 + vmovdqu YMMWORD[224+rdi],ymm15 + vmovdqu YMMWORD[256+rdi],ymm14 + vmovdqu YMMWORD[288+rdi],ymm2 + je NEAR $L$done8x + + lea rsi,[320+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm3 + lea rdi,[320+rdi] + sub rdx,320 + vmovdqa YMMWORD[32+rsp],ymm7 + jmp NEAR $L$oop_tail8x + +ALIGN 32 +$L$384_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + vpxor ymm12,ymm12,YMMWORD[128+rsi] + vpxor ymm13,ymm13,YMMWORD[160+rsi] + vpxor ymm10,ymm10,YMMWORD[192+rsi] + vpxor ymm15,ymm15,YMMWORD[224+rsi] + vpxor ymm14,ymm14,YMMWORD[256+rsi] + vpxor ymm2,ymm2,YMMWORD[288+rsi] + vpxor ymm3,ymm3,YMMWORD[320+rsi] + vpxor ymm7,ymm7,YMMWORD[352+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + vmovdqu YMMWORD[128+rdi],ymm12 + vmovdqu YMMWORD[160+rdi],ymm13 + vmovdqu YMMWORD[192+rdi],ymm10 + vmovdqu YMMWORD[224+rdi],ymm15 + vmovdqu YMMWORD[256+rdi],ymm14 + vmovdqu YMMWORD[288+rdi],ymm2 + vmovdqu YMMWORD[320+rdi],ymm3 + vmovdqu YMMWORD[352+rdi],ymm7 + je NEAR $L$done8x + + 
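+; Tail handling: each $L$<n>_or_more8x arm above copies out as many whole
+; lanes as the remaining length (rdx) covers, then parks the next two unused
+; keystream registers at [rsp]/[32+rsp] so the byte-granular $L$oop_tail8x
+; loop can XOR whatever is left.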
lea rsi,[384+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm11 + lea rdi,[384+rdi] + sub rdx,384 + vmovdqa YMMWORD[32+rsp],ymm9 + jmp NEAR $L$oop_tail8x + +ALIGN 32 +$L$448_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + vpxor ymm12,ymm12,YMMWORD[128+rsi] + vpxor ymm13,ymm13,YMMWORD[160+rsi] + vpxor ymm10,ymm10,YMMWORD[192+rsi] + vpxor ymm15,ymm15,YMMWORD[224+rsi] + vpxor ymm14,ymm14,YMMWORD[256+rsi] + vpxor ymm2,ymm2,YMMWORD[288+rsi] + vpxor ymm3,ymm3,YMMWORD[320+rsi] + vpxor ymm7,ymm7,YMMWORD[352+rsi] + vpxor ymm11,ymm11,YMMWORD[384+rsi] + vpxor ymm9,ymm9,YMMWORD[416+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + vmovdqu YMMWORD[128+rdi],ymm12 + vmovdqu YMMWORD[160+rdi],ymm13 + vmovdqu YMMWORD[192+rdi],ymm10 + vmovdqu YMMWORD[224+rdi],ymm15 + vmovdqu YMMWORD[256+rdi],ymm14 + vmovdqu YMMWORD[288+rdi],ymm2 + vmovdqu YMMWORD[320+rdi],ymm3 + vmovdqu YMMWORD[352+rdi],ymm7 + vmovdqu YMMWORD[384+rdi],ymm11 + vmovdqu YMMWORD[416+rdi],ymm9 + je NEAR $L$done8x + + lea rsi,[448+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm0 + lea rdi,[448+rdi] + sub rdx,448 + vmovdqa YMMWORD[32+rsp],ymm4 + +$L$oop_tail8x: + movzx eax,BYTE[r10*1+rsi] + movzx ecx,BYTE[r10*1+rsp] + lea r10,[1+r10] + xor eax,ecx + mov BYTE[((-1))+r10*1+rdi],al + dec rdx + jnz NEAR $L$oop_tail8x + +$L$done8x: + vzeroall + lea r11,[((656+48))+rsp] + movaps xmm6,XMMWORD[((-48))+r11] + movaps xmm7,XMMWORD[((-32))+r11] + movaps xmm8,XMMWORD[((-16))+r11] + movaps xmm9,XMMWORD[r11] + movaps xmm10,XMMWORD[16+r11] + movaps xmm11,XMMWORD[32+r11] + movaps xmm12,XMMWORD[48+r11] + movaps xmm13,XMMWORD[64+r11] + movaps xmm14,XMMWORD[80+r11] + movaps xmm15,XMMWORD[96+r11] + mov rsp,QWORD[640+rsp] + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_ChaCha20_8x: diff --git a/packager/third_party/boringssl/win-x86_64/crypto/ec/p256-x86_64-asm.asm b/packager/third_party/boringssl/win-x86_64/crypto/ec/p256-x86_64-asm.asm new file mode 100644 index 0000000000..a2e4075819 --- /dev/null +++ b/packager/third_party/boringssl/win-x86_64/crypto/ec/p256-x86_64-asm.asm @@ -0,0 +1,1925 @@ +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +section .text code align=64 + +EXTERN OPENSSL_ia32cap_P + + +ALIGN 64 +$L$poly: + DQ 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 + +$L$One: + DD 1,1,1,1,1,1,1,1 +$L$Two: + DD 2,2,2,2,2,2,2,2 +$L$Three: + DD 3,3,3,3,3,3,3,3 +$L$ONE_mont: + DQ 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe + + +ALIGN 64 +ecp_nistz256_mul_by_2: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_mul_by_2: + mov rdi,rcx + mov rsi,rdx + + + push r12 + push r13 + + mov r8,QWORD[rsi] + mov r9,QWORD[8+rsi] + add r8,r8 + mov r10,QWORD[16+rsi] + adc r9,r9 + mov r11,QWORD[24+rsi] + lea rsi,[$L$poly] + mov rax,r8 + adc r10,r10 + adc r11,r11 + mov rdx,r9 + sbb r13,r13 + + sub r8,QWORD[rsi] + mov rcx,r10 + sbb r9,QWORD[8+rsi] + sbb r10,QWORD[16+rsi] + mov r12,r11 + sbb r11,QWORD[24+rsi] + test r13,r13 + + cmovz r8,rax + cmovz r9,rdx + mov QWORD[rdi],r8 + cmovz r10,rcx + mov QWORD[8+rdi],r9 + cmovz r11,r12 + mov QWORD[16+rdi],r10 + mov QWORD[24+rdi],r11 + + pop r13 + pop r12 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_ecp_nistz256_mul_by_2: + + + +global 
ecp_nistz256_neg + +ALIGN 32 +ecp_nistz256_neg: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_neg: + mov rdi,rcx + mov rsi,rdx + + + push r12 + push r13 + + xor r8,r8 + xor r9,r9 + xor r10,r10 + xor r11,r11 + xor r13,r13 + + sub r8,QWORD[rsi] + sbb r9,QWORD[8+rsi] + sbb r10,QWORD[16+rsi] + mov rax,r8 + sbb r11,QWORD[24+rsi] + lea rsi,[$L$poly] + mov rdx,r9 + sbb r13,0 + + add r8,QWORD[rsi] + mov rcx,r10 + adc r9,QWORD[8+rsi] + adc r10,QWORD[16+rsi] + mov r12,r11 + adc r11,QWORD[24+rsi] + test r13,r13 + + cmovz r8,rax + cmovz r9,rdx + mov QWORD[rdi],r8 + cmovz r10,rcx + mov QWORD[8+rdi],r9 + cmovz r11,r12 + mov QWORD[16+rdi],r10 + mov QWORD[24+rdi],r11 + + pop r13 + pop r12 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_ecp_nistz256_neg: + + + + + + +global ecp_nistz256_mul_mont + +ALIGN 32 +ecp_nistz256_mul_mont: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_mul_mont: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + +$L$mul_mont: + push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + mov rbx,rdx + mov rax,QWORD[rdx] + mov r9,QWORD[rsi] + mov r10,QWORD[8+rsi] + mov r11,QWORD[16+rsi] + mov r12,QWORD[24+rsi] + + call __ecp_nistz256_mul_montq +$L$mul_mont_done: + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + pop rbp + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_ecp_nistz256_mul_mont: + + +ALIGN 32 +__ecp_nistz256_mul_montq: + + + mov rbp,rax + mul r9 + mov r14,QWORD[(($L$poly+8))] + mov r8,rax + mov rax,rbp + mov r9,rdx + + mul r10 + mov r15,QWORD[(($L$poly+24))] + add r9,rax + mov rax,rbp + adc rdx,0 + mov r10,rdx + + mul r11 + add r10,rax + mov rax,rbp + adc rdx,0 + mov r11,rdx + + mul r12 + add r11,rax + mov rax,r8 + adc rdx,0 + xor r13,r13 + mov r12,rdx + + + + + + + + + + + mov rbp,r8 + shl r8,32 + mul r15 + shr rbp,32 + add r9,r8 + adc r10,rbp + adc r11,rax + mov rax,QWORD[8+rbx] + adc r12,rdx + adc r13,0 + xor r8,r8 + + + + mov rbp,rax + mul QWORD[rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[8+rsi] + add r10,rcx + adc rdx,0 + add r10,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[16+rsi] + add r11,rcx + adc rdx,0 + add r11,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[24+rsi] + add r12,rcx + adc rdx,0 + add r12,rax + mov rax,r9 + adc r13,rdx + adc r8,0 + + + + mov rbp,r9 + shl r9,32 + mul r15 + shr rbp,32 + add r10,r9 + adc r11,rbp + adc r12,rax + mov rax,QWORD[16+rbx] + adc r13,rdx + adc r8,0 + xor r9,r9 + + + + mov rbp,rax + mul QWORD[rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[8+rsi] + add r11,rcx + adc rdx,0 + add r11,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[16+rsi] + add r12,rcx + adc rdx,0 + add r12,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[24+rsi] + add r13,rcx + adc rdx,0 + add r13,rax + mov rax,r10 + adc r8,rdx + adc r9,0 + + + + mov rbp,r10 + shl r10,32 + mul r15 + shr rbp,32 + add r11,r10 + adc r12,rbp + adc r13,rax + mov rax,QWORD[24+rbx] + adc r8,rdx + adc r9,0 + xor r10,r10 + + + + mov rbp,rax + mul QWORD[rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[8+rsi] + add r12,rcx + adc rdx,0 + add r12,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[16+rsi] + add r13,rcx + adc rdx,0 + add r13,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[24+rsi] + add r8,rcx + adc rdx,0 + add r8,rax + mov rax,r11 
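+; Montgomery reduction here leans on the shape of the P-256 prime
+; (2^256 - 2^224 + 2^192 + 2^96 - 1): folding in the low limb needs only a
+; shl/shr pair plus a single mul by the top word of $L$poly (kept in r15),
+; rather than a full 4x1 multiply.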
+ adc r9,rdx + adc r10,0 + + + + mov rbp,r11 + shl r11,32 + mul r15 + shr rbp,32 + add r12,r11 + adc r13,rbp + mov rcx,r12 + adc r8,rax + adc r9,rdx + mov rbp,r13 + adc r10,0 + + + + sub r12,-1 + mov rbx,r8 + sbb r13,r14 + sbb r8,0 + mov rdx,r9 + sbb r9,r15 + sbb r10,0 + + cmovc r12,rcx + cmovc r13,rbp + mov QWORD[rdi],r12 + cmovc r8,rbx + mov QWORD[8+rdi],r13 + cmovc r9,rdx + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + DB 0F3h,0C3h ;repret + + + + + + + + + +global ecp_nistz256_sqr_mont + +ALIGN 32 +ecp_nistz256_sqr_mont: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_sqr_mont: + mov rdi,rcx + mov rsi,rdx + + + push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + mov rax,QWORD[rsi] + mov r14,QWORD[8+rsi] + mov r15,QWORD[16+rsi] + mov r8,QWORD[24+rsi] + + call __ecp_nistz256_sqr_montq +$L$sqr_mont_done: + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + pop rbp + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_ecp_nistz256_sqr_mont: + + +ALIGN 32 +__ecp_nistz256_sqr_montq: + mov r13,rax + mul r14 + mov r9,rax + mov rax,r15 + mov r10,rdx + + mul r13 + add r10,rax + mov rax,r8 + adc rdx,0 + mov r11,rdx + + mul r13 + add r11,rax + mov rax,r15 + adc rdx,0 + mov r12,rdx + + + mul r14 + add r11,rax + mov rax,r8 + adc rdx,0 + mov rbp,rdx + + mul r14 + add r12,rax + mov rax,r8 + adc rdx,0 + add r12,rbp + mov r13,rdx + adc r13,0 + + + mul r15 + xor r15,r15 + add r13,rax + mov rax,QWORD[rsi] + mov r14,rdx + adc r14,0 + + add r9,r9 + adc r10,r10 + adc r11,r11 + adc r12,r12 + adc r13,r13 + adc r14,r14 + adc r15,0 + + mul rax + mov r8,rax + mov rax,QWORD[8+rsi] + mov rcx,rdx + + mul rax + add r9,rcx + adc r10,rax + mov rax,QWORD[16+rsi] + adc rdx,0 + mov rcx,rdx + + mul rax + add r11,rcx + adc r12,rax + mov rax,QWORD[24+rsi] + adc rdx,0 + mov rcx,rdx + + mul rax + add r13,rcx + adc r14,rax + mov rax,r8 + adc r15,rdx + + mov rsi,QWORD[(($L$poly+8))] + mov rbp,QWORD[(($L$poly+24))] + + + + + mov rcx,r8 + shl r8,32 + mul rbp + shr rcx,32 + add r9,r8 + adc r10,rcx + adc r11,rax + mov rax,r9 + adc rdx,0 + + + + mov rcx,r9 + shl r9,32 + mov r8,rdx + mul rbp + shr rcx,32 + add r10,r9 + adc r11,rcx + adc r8,rax + mov rax,r10 + adc rdx,0 + + + + mov rcx,r10 + shl r10,32 + mov r9,rdx + mul rbp + shr rcx,32 + add r11,r10 + adc r8,rcx + adc r9,rax + mov rax,r11 + adc rdx,0 + + + + mov rcx,r11 + shl r11,32 + mov r10,rdx + mul rbp + shr rcx,32 + add r8,r11 + adc r9,rcx + adc r10,rax + adc rdx,0 + xor r11,r11 + + + + add r12,r8 + adc r13,r9 + mov r8,r12 + adc r14,r10 + adc r15,rdx + mov r9,r13 + adc r11,0 + + sub r12,-1 + mov r10,r14 + sbb r13,rsi + sbb r14,0 + mov rcx,r15 + sbb r15,rbp + sbb r11,0 + + cmovc r12,r8 + cmovc r13,r9 + mov QWORD[rdi],r12 + cmovc r14,r10 + mov QWORD[8+rdi],r13 + cmovc r15,rcx + mov QWORD[16+rdi],r14 + mov QWORD[24+rdi],r15 + + DB 0F3h,0C3h ;repret + + + + + + + +global ecp_nistz256_from_mont + +ALIGN 32 +ecp_nistz256_from_mont: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_from_mont: + mov rdi,rcx + mov rsi,rdx + + + push r12 + push r13 + + mov rax,QWORD[rsi] + mov r13,QWORD[(($L$poly+24))] + mov r9,QWORD[8+rsi] + mov r10,QWORD[16+rsi] + mov r11,QWORD[24+rsi] + mov r8,rax + mov r12,QWORD[(($L$poly+8))] + + + + mov rcx,rax + shl r8,32 + mul r13 + shr rcx,32 + add r9,r8 + adc r10,rcx + adc r11,rax + mov rax,r9 + adc rdx,0 + + + + mov rcx,r9 + shl r9,32 + mov r8,rdx + mul r13 + shr rcx,32 + add r10,r9 + adc 
r11,rcx + adc r8,rax + mov rax,r10 + adc rdx,0 + + + + mov rcx,r10 + shl r10,32 + mov r9,rdx + mul r13 + shr rcx,32 + add r11,r10 + adc r8,rcx + adc r9,rax + mov rax,r11 + adc rdx,0 + + + + mov rcx,r11 + shl r11,32 + mov r10,rdx + mul r13 + shr rcx,32 + add r8,r11 + adc r9,rcx + mov rcx,r8 + adc r10,rax + mov rsi,r9 + adc rdx,0 + + sub r8,-1 + mov rax,r10 + sbb r9,r12 + sbb r10,0 + mov r11,rdx + sbb rdx,r13 + sbb r13,r13 + + cmovnz r8,rcx + cmovnz r9,rsi + mov QWORD[rdi],r8 + cmovnz r10,rax + mov QWORD[8+rdi],r9 + cmovz r11,rdx + mov QWORD[16+rdi],r10 + mov QWORD[24+rdi],r11 + + pop r13 + pop r12 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_ecp_nistz256_from_mont: + + +global ecp_nistz256_select_w5 + +ALIGN 32 +ecp_nistz256_select_w5: + lea rax,[((-136))+rsp] +$L$SEH_begin_ecp_nistz256_select_w5: +DB 0x48,0x8d,0x60,0xe0 +DB 0x0f,0x29,0x70,0xe0 +DB 0x0f,0x29,0x78,0xf0 +DB 0x44,0x0f,0x29,0x00 +DB 0x44,0x0f,0x29,0x48,0x10 +DB 0x44,0x0f,0x29,0x50,0x20 +DB 0x44,0x0f,0x29,0x58,0x30 +DB 0x44,0x0f,0x29,0x60,0x40 +DB 0x44,0x0f,0x29,0x68,0x50 +DB 0x44,0x0f,0x29,0x70,0x60 +DB 0x44,0x0f,0x29,0x78,0x70 + movdqa xmm0,XMMWORD[$L$One] + movd xmm1,r8d + + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + pxor xmm7,xmm7 + + movdqa xmm8,xmm0 + pshufd xmm1,xmm1,0 + + mov rax,16 +$L$select_loop_sse_w5: + + movdqa xmm15,xmm8 + paddd xmm8,xmm0 + pcmpeqd xmm15,xmm1 + + movdqa xmm9,XMMWORD[rdx] + movdqa xmm10,XMMWORD[16+rdx] + movdqa xmm11,XMMWORD[32+rdx] + movdqa xmm12,XMMWORD[48+rdx] + movdqa xmm13,XMMWORD[64+rdx] + movdqa xmm14,XMMWORD[80+rdx] + lea rdx,[96+rdx] + + pand xmm9,xmm15 + pand xmm10,xmm15 + por xmm2,xmm9 + pand xmm11,xmm15 + por xmm3,xmm10 + pand xmm12,xmm15 + por xmm4,xmm11 + pand xmm13,xmm15 + por xmm5,xmm12 + pand xmm14,xmm15 + por xmm6,xmm13 + por xmm7,xmm14 + + dec rax + jnz NEAR $L$select_loop_sse_w5 + + movdqu XMMWORD[rcx],xmm2 + movdqu XMMWORD[16+rcx],xmm3 + movdqu XMMWORD[32+rcx],xmm4 + movdqu XMMWORD[48+rcx],xmm5 + movdqu XMMWORD[64+rcx],xmm6 + movdqu XMMWORD[80+rcx],xmm7 + movaps xmm6,XMMWORD[rsp] + movaps xmm7,XMMWORD[16+rsp] + movaps xmm8,XMMWORD[32+rsp] + movaps xmm9,XMMWORD[48+rsp] + movaps xmm10,XMMWORD[64+rsp] + movaps xmm11,XMMWORD[80+rsp] + movaps xmm12,XMMWORD[96+rsp] + movaps xmm13,XMMWORD[112+rsp] + movaps xmm14,XMMWORD[128+rsp] + movaps xmm15,XMMWORD[144+rsp] + lea rsp,[168+rsp] +$L$SEH_end_ecp_nistz256_select_w5: + DB 0F3h,0C3h ;repret + + + + +global ecp_nistz256_select_w7 + +ALIGN 32 +ecp_nistz256_select_w7: + lea rax,[((-136))+rsp] +$L$SEH_begin_ecp_nistz256_select_w7: +DB 0x48,0x8d,0x60,0xe0 +DB 0x0f,0x29,0x70,0xe0 +DB 0x0f,0x29,0x78,0xf0 +DB 0x44,0x0f,0x29,0x00 +DB 0x44,0x0f,0x29,0x48,0x10 +DB 0x44,0x0f,0x29,0x50,0x20 +DB 0x44,0x0f,0x29,0x58,0x30 +DB 0x44,0x0f,0x29,0x60,0x40 +DB 0x44,0x0f,0x29,0x68,0x50 +DB 0x44,0x0f,0x29,0x70,0x60 +DB 0x44,0x0f,0x29,0x78,0x70 + movdqa xmm8,XMMWORD[$L$One] + movd xmm1,r8d + + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + + movdqa xmm0,xmm8 + pshufd xmm1,xmm1,0 + mov rax,64 + +$L$select_loop_sse_w7: + movdqa xmm15,xmm8 + paddd xmm8,xmm0 + movdqa xmm9,XMMWORD[rdx] + movdqa xmm10,XMMWORD[16+rdx] + pcmpeqd xmm15,xmm1 + movdqa xmm11,XMMWORD[32+rdx] + movdqa xmm12,XMMWORD[48+rdx] + lea rdx,[64+rdx] + + pand xmm9,xmm15 + pand xmm10,xmm15 + por xmm2,xmm9 + pand xmm11,xmm15 + por xmm3,xmm10 + pand xmm12,xmm15 + por xmm4,xmm11 + prefetcht0 [255+rdx] + por xmm5,xmm12 + + dec rax + jnz NEAR $L$select_loop_sse_w7 + + movdqu XMMWORD[rcx],xmm2 
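+; ecp_nistz256_select_w5/_w7 are constant-time table lookups: every entry is
+; touched, pcmpeqd turns an index match into an all-ones mask, and pand/por
+; accumulate only the matching entry, so the memory access pattern is
+; independent of the secret index.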
+ movdqu XMMWORD[16+rcx],xmm3 + movdqu XMMWORD[32+rcx],xmm4 + movdqu XMMWORD[48+rcx],xmm5 + movaps xmm6,XMMWORD[rsp] + movaps xmm7,XMMWORD[16+rsp] + movaps xmm8,XMMWORD[32+rsp] + movaps xmm9,XMMWORD[48+rsp] + movaps xmm10,XMMWORD[64+rsp] + movaps xmm11,XMMWORD[80+rsp] + movaps xmm12,XMMWORD[96+rsp] + movaps xmm13,XMMWORD[112+rsp] + movaps xmm14,XMMWORD[128+rsp] + movaps xmm15,XMMWORD[144+rsp] + lea rsp,[168+rsp] +$L$SEH_end_ecp_nistz256_select_w7: + DB 0F3h,0C3h ;repret + +global ecp_nistz256_avx2_select_w7 + +ALIGN 32 +ecp_nistz256_avx2_select_w7: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_avx2_select_w7: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + +DB 0x0f,0x0b + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_ecp_nistz256_avx2_select_w7: + +ALIGN 32 +__ecp_nistz256_add_toq: + add r12,QWORD[rbx] + adc r13,QWORD[8+rbx] + mov rax,r12 + adc r8,QWORD[16+rbx] + adc r9,QWORD[24+rbx] + mov rbp,r13 + sbb r11,r11 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + test r11,r11 + + cmovz r12,rax + cmovz r13,rbp + mov QWORD[rdi],r12 + cmovz r8,rcx + mov QWORD[8+rdi],r13 + cmovz r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + DB 0F3h,0C3h ;repret + + + +ALIGN 32 +__ecp_nistz256_sub_fromq: + sub r12,QWORD[rbx] + sbb r13,QWORD[8+rbx] + mov rax,r12 + sbb r8,QWORD[16+rbx] + sbb r9,QWORD[24+rbx] + mov rbp,r13 + sbb r11,r11 + + add r12,-1 + mov rcx,r8 + adc r13,r14 + adc r8,0 + mov r10,r9 + adc r9,r15 + test r11,r11 + + cmovz r12,rax + cmovz r13,rbp + mov QWORD[rdi],r12 + cmovz r8,rcx + mov QWORD[8+rdi],r13 + cmovz r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + DB 0F3h,0C3h ;repret + + + +ALIGN 32 +__ecp_nistz256_subq: + sub rax,r12 + sbb rbp,r13 + mov r12,rax + sbb rcx,r8 + sbb r10,r9 + mov r13,rbp + sbb r11,r11 + + add rax,-1 + mov r8,rcx + adc rbp,r14 + adc rcx,0 + mov r9,r10 + adc r10,r15 + test r11,r11 + + cmovnz r12,rax + cmovnz r13,rbp + cmovnz r8,rcx + cmovnz r9,r10 + + DB 0F3h,0C3h ;repret + + + +ALIGN 32 +__ecp_nistz256_mul_by_2q: + add r12,r12 + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + sbb r11,r11 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + test r11,r11 + + cmovz r12,rax + cmovz r13,rbp + mov QWORD[rdi],r12 + cmovz r8,rcx + mov QWORD[8+rdi],r13 + cmovz r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + DB 0F3h,0C3h ;repret + +global ecp_nistz256_point_double + +ALIGN 32 +ecp_nistz256_point_double: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_point_double: + mov rdi,rcx + mov rsi,rdx + + + push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + sub rsp,32*5+8 + +$L$point_double_shortcutq: + movdqu xmm0,XMMWORD[rsi] + mov rbx,rsi + movdqu xmm1,XMMWORD[16+rsi] + mov r12,QWORD[((32+0))+rsi] + mov r13,QWORD[((32+8))+rsi] + mov r8,QWORD[((32+16))+rsi] + mov r9,QWORD[((32+24))+rsi] + mov r14,QWORD[(($L$poly+8))] + mov r15,QWORD[(($L$poly+24))] + movdqa XMMWORD[96+rsp],xmm0 + movdqa XMMWORD[(96+16)+rsp],xmm1 + lea r10,[32+rdi] + lea r11,[64+rdi] +DB 102,72,15,110,199 +DB 102,73,15,110,202 +DB 102,73,15,110,211 + + lea rdi,[rsp] + call __ecp_nistz256_mul_by_2q + + mov rax,QWORD[((64+0))+rsi] + mov r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + lea rsi,[((64-0))+rsi] + lea rdi,[64+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] 
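+; The __ecp_nistz256_*q helpers take their operands in registers rather than
+; per the platform ABI: limbs are pre-loaded (rax plus r9..r12, or
+; rax/r14/r15/r8 for the squaring path) with source pointers in rsi/rbx and
+; the destination in rdi, which is what the mov/lea marshalling around each
+; call is doing.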
+ lea rsi,[((0+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[32+rbx] + mov r9,QWORD[((64+0))+rbx] + mov r10,QWORD[((64+8))+rbx] + mov r11,QWORD[((64+16))+rbx] + mov r12,QWORD[((64+24))+rbx] + lea rsi,[((64-0))+rbx] + lea rbx,[32+rbx] +DB 102,72,15,126,215 + call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_by_2q + + mov r12,QWORD[((96+0))+rsp] + mov r13,QWORD[((96+8))+rsp] + lea rbx,[64+rsp] + mov r8,QWORD[((96+16))+rsp] + mov r9,QWORD[((96+24))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_add_toq + + mov r12,QWORD[((96+0))+rsp] + mov r13,QWORD[((96+8))+rsp] + lea rbx,[64+rsp] + mov r8,QWORD[((96+16))+rsp] + mov r9,QWORD[((96+24))+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((0+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] +DB 102,72,15,126,207 + call __ecp_nistz256_sqr_montq + xor r9,r9 + mov rax,r12 + add r12,-1 + mov r10,r13 + adc r13,rsi + mov rcx,r14 + adc r14,0 + mov r8,r15 + adc r15,rbp + adc r9,0 + xor rsi,rsi + test rax,1 + + cmovz r12,rax + cmovz r13,r10 + cmovz r14,rcx + cmovz r15,r8 + cmovz r9,rsi + + mov rax,r13 + shr r12,1 + shl rax,63 + mov r10,r14 + shr r13,1 + or r12,rax + shl r10,63 + mov rcx,r15 + shr r14,1 + or r13,r10 + shl rcx,63 + mov QWORD[rdi],r12 + shr r15,1 + mov QWORD[8+rdi],r13 + shl r9,63 + or r14,rcx + or r15,r9 + mov QWORD[16+rdi],r14 + mov QWORD[24+rdi],r15 + mov rax,QWORD[64+rsp] + lea rbx,[64+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montq + + lea rdi,[128+rsp] + call __ecp_nistz256_mul_by_2q + + lea rbx,[32+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_add_toq + + mov rax,QWORD[96+rsp] + lea rbx,[96+rsp] + mov r9,QWORD[((0+0))+rsp] + mov r10,QWORD[((8+0))+rsp] + lea rsi,[((0+0))+rsp] + mov r11,QWORD[((16+0))+rsp] + mov r12,QWORD[((24+0))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_mul_montq + + lea rdi,[128+rsp] + call __ecp_nistz256_mul_by_2q + + mov rax,QWORD[((0+32))+rsp] + mov r14,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r15,QWORD[((16+32))+rsp] + mov r8,QWORD[((24+32))+rsp] +DB 102,72,15,126,199 + call __ecp_nistz256_sqr_montq + + lea rbx,[128+rsp] + mov r8,r14 + mov r9,r15 + mov r14,rsi + mov r15,rbp + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[((0+0))+rsp] + mov rbp,QWORD[((0+8))+rsp] + mov rcx,QWORD[((0+16))+rsp] + mov r10,QWORD[((0+24))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_subq + + mov rax,QWORD[32+rsp] + lea rbx,[32+rsp] + mov r14,r12 + xor ecx,ecx + mov QWORD[((0+0))+rsp],r12 + mov r10,r13 + mov QWORD[((0+8))+rsp],r13 + cmovz r11,r8 + mov QWORD[((0+16))+rsp],r8 + lea rsi,[((0-0))+rsp] + cmovz r12,r9 + mov QWORD[((0+24))+rsp],r9 + mov r9,r14 + lea rdi,[rsp] + call __ecp_nistz256_mul_montq + +DB 102,72,15,126,203 +DB 102,72,15,126,207 + call __ecp_nistz256_sub_fromq + + add rsp,32*5+8 + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + pop rbp + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_ecp_nistz256_point_double: +global ecp_nistz256_point_add + +ALIGN 32 +ecp_nistz256_point_add: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_point_add: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + sub rsp,32*18+8 + + 
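+; The por/pshufd/pcmpeqd shuffling below folds each input point's X|Y words
+; into a single is-zero test, producing all-ones masks (xmm5 for the first
+; input, xmm4 for the second) that the pandn/pand/por blocks at the end use
+; to substitute the other operand whenever an input encodes infinity.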
movdqu xmm0,XMMWORD[rsi] + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm3,XMMWORD[48+rsi] + movdqu xmm4,XMMWORD[64+rsi] + movdqu xmm5,XMMWORD[80+rsi] + mov rbx,rsi + mov rsi,rdx + movdqa XMMWORD[384+rsp],xmm0 + movdqa XMMWORD[(384+16)+rsp],xmm1 + por xmm1,xmm0 + movdqa XMMWORD[416+rsp],xmm2 + movdqa XMMWORD[(416+16)+rsp],xmm3 + por xmm3,xmm2 + movdqa XMMWORD[448+rsp],xmm4 + movdqa XMMWORD[(448+16)+rsp],xmm5 + por xmm3,xmm1 + + movdqu xmm0,XMMWORD[rsi] + pshufd xmm5,xmm3,0xb1 + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + por xmm5,xmm3 + movdqu xmm3,XMMWORD[48+rsi] + mov rax,QWORD[((64+0))+rsi] + mov r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + movdqa XMMWORD[480+rsp],xmm0 + pshufd xmm4,xmm5,0x1e + movdqa XMMWORD[(480+16)+rsp],xmm1 + por xmm1,xmm0 +DB 102,72,15,110,199 + movdqa XMMWORD[512+rsp],xmm2 + movdqa XMMWORD[(512+16)+rsp],xmm3 + por xmm3,xmm2 + por xmm5,xmm4 + pxor xmm4,xmm4 + por xmm3,xmm1 + + lea rsi,[((64-0))+rsi] + mov QWORD[((544+0))+rsp],rax + mov QWORD[((544+8))+rsp],r14 + mov QWORD[((544+16))+rsp],r15 + mov QWORD[((544+24))+rsp],r8 + lea rdi,[96+rsp] + call __ecp_nistz256_sqr_montq + + pcmpeqd xmm5,xmm4 + pshufd xmm4,xmm3,0xb1 + por xmm4,xmm3 + pshufd xmm5,xmm5,0 + pshufd xmm3,xmm4,0x1e + por xmm4,xmm3 + pxor xmm3,xmm3 + pcmpeqd xmm4,xmm3 + pshufd xmm4,xmm4,0 + mov rax,QWORD[((64+0))+rbx] + mov r14,QWORD[((64+8))+rbx] + mov r15,QWORD[((64+16))+rbx] + mov r8,QWORD[((64+24))+rbx] +DB 102,72,15,110,203 + + lea rsi,[((64-0))+rbx] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[544+rsp] + lea rbx,[544+rsp] + mov r9,QWORD[((0+96))+rsp] + mov r10,QWORD[((8+96))+rsp] + lea rsi,[((0+96))+rsp] + mov r11,QWORD[((16+96))+rsp] + mov r12,QWORD[((24+96))+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[416+rsp] + lea rbx,[416+rsp] + mov r9,QWORD[((0+224))+rsp] + mov r10,QWORD[((8+224))+rsp] + lea rsi,[((0+224))+rsp] + mov r11,QWORD[((16+224))+rsp] + mov r12,QWORD[((24+224))+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[512+rsp] + lea rbx,[512+rsp] + mov r9,QWORD[((0+256))+rsp] + mov r10,QWORD[((8+256))+rsp] + lea rsi,[((0+256))+rsp] + mov r11,QWORD[((16+256))+rsp] + mov r12,QWORD[((24+256))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[224+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromq + + or r12,r13 + movdqa xmm2,xmm4 + or r12,r8 + or r12,r9 + por xmm2,xmm5 +DB 102,73,15,110,220 + + mov rax,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+96))+rsp] + mov r10,QWORD[((8+96))+rsp] + lea rsi,[((0+96))+rsp] + mov r11,QWORD[((16+96))+rsp] + mov r12,QWORD[((24+96))+rsp] + lea rdi,[160+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[480+rsp] + lea rbx,[480+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[160+rsp] + lea rdi,[rsp] + call __ecp_nistz256_sub_fromq + + or r12,r13 + or r12,r8 + or r12,r9 + +DB 0x3e + jnz NEAR $L$add_proceedq +DB 102,73,15,126,208 +DB 102,73,15,126,217 + test r8,r8 + jnz NEAR $L$add_proceedq + test r9,r9 + jz NEAR $L$add_doubleq + +DB 102,72,15,126,199 + pxor 
xmm0,xmm0 + movdqu XMMWORD[rdi],xmm0 + movdqu XMMWORD[16+rdi],xmm0 + movdqu XMMWORD[32+rdi],xmm0 + movdqu XMMWORD[48+rdi],xmm0 + movdqu XMMWORD[64+rdi],xmm0 + movdqu XMMWORD[80+rdi],xmm0 + jmp NEAR $L$add_doneq + +ALIGN 32 +$L$add_doubleq: +DB 102,72,15,126,206 +DB 102,72,15,126,199 + add rsp,416 + jmp NEAR $L$point_double_shortcutq + +ALIGN 32 +$L$add_proceedq: + mov rax,QWORD[((0+64))+rsp] + mov r14,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r15,QWORD[((16+64))+rsp] + mov r8,QWORD[((24+64))+rsp] + lea rdi,[96+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+0))+rsp] + mov r10,QWORD[((8+0))+rsp] + lea rsi,[((0+0))+rsp] + mov r11,QWORD[((16+0))+rsp] + mov r12,QWORD[((24+0))+rsp] + lea rdi,[352+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((0+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[544+rsp] + lea rbx,[544+rsp] + mov r9,QWORD[((0+352))+rsp] + mov r10,QWORD[((8+352))+rsp] + lea rsi,[((0+352))+rsp] + mov r11,QWORD[((16+352))+rsp] + mov r12,QWORD[((24+352))+rsp] + lea rdi,[352+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[rsp] + lea rbx,[rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[128+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[160+rsp] + lea rbx,[160+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_mul_montq + + + + + add r12,r12 + lea rsi,[96+rsp] + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + sbb r11,r11 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + test r11,r11 + + cmovz r12,rax + mov rax,QWORD[rsi] + cmovz r13,rbp + mov rbp,QWORD[8+rsi] + cmovz r8,rcx + mov rcx,QWORD[16+rsi] + cmovz r9,r10 + mov r10,QWORD[24+rsi] + + call __ecp_nistz256_subq + + lea rbx,[128+rsp] + lea rdi,[288+rsp] + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[((192+0))+rsp] + mov rbp,QWORD[((192+8))+rsp] + mov rcx,QWORD[((192+16))+rsp] + mov r10,QWORD[((192+24))+rsp] + lea rdi,[320+rsp] + + call __ecp_nistz256_subq + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + mov rax,QWORD[128+rsp] + lea rbx,[128+rsp] + mov r9,QWORD[((0+224))+rsp] + mov r10,QWORD[((8+224))+rsp] + lea rsi,[((0+224))+rsp] + mov r11,QWORD[((16+224))+rsp] + mov r12,QWORD[((24+224))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[320+rsp] + lea rbx,[320+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[320+rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[256+rsp] + lea rdi,[320+rsp] + call __ecp_nistz256_sub_fromq + +DB 102,72,15,126,199 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[352+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((352+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[544+rsp] + pand xmm3,XMMWORD[((544+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[448+rsp] + pand xmm3,XMMWORD[((448+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[64+rdi],xmm2 + movdqu 
XMMWORD[80+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[288+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((288+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[480+rsp] + pand xmm3,XMMWORD[((480+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[384+rsp] + pand xmm3,XMMWORD[((384+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[rdi],xmm2 + movdqu XMMWORD[16+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[320+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((320+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[512+rsp] + pand xmm3,XMMWORD[((512+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[416+rsp] + pand xmm3,XMMWORD[((416+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm3 + +$L$add_doneq: + add rsp,32*18+8 + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + pop rbp + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_ecp_nistz256_point_add: +global ecp_nistz256_point_add_affine + +ALIGN 32 +ecp_nistz256_point_add_affine: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_point_add_affine: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + sub rsp,32*15+8 + + movdqu xmm0,XMMWORD[rsi] + mov rbx,rdx + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm3,XMMWORD[48+rsi] + movdqu xmm4,XMMWORD[64+rsi] + movdqu xmm5,XMMWORD[80+rsi] + mov rax,QWORD[((64+0))+rsi] + mov r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + movdqa XMMWORD[320+rsp],xmm0 + movdqa XMMWORD[(320+16)+rsp],xmm1 + por xmm1,xmm0 + movdqa XMMWORD[352+rsp],xmm2 + movdqa XMMWORD[(352+16)+rsp],xmm3 + por xmm3,xmm2 + movdqa XMMWORD[384+rsp],xmm4 + movdqa XMMWORD[(384+16)+rsp],xmm5 + por xmm3,xmm1 + + movdqu xmm0,XMMWORD[rbx] + pshufd xmm5,xmm3,0xb1 + movdqu xmm1,XMMWORD[16+rbx] + movdqu xmm2,XMMWORD[32+rbx] + por xmm5,xmm3 + movdqu xmm3,XMMWORD[48+rbx] + movdqa XMMWORD[416+rsp],xmm0 + pshufd xmm4,xmm5,0x1e + movdqa XMMWORD[(416+16)+rsp],xmm1 + por xmm1,xmm0 +DB 102,72,15,110,199 + movdqa XMMWORD[448+rsp],xmm2 + movdqa XMMWORD[(448+16)+rsp],xmm3 + por xmm3,xmm2 + por xmm5,xmm4 + pxor xmm4,xmm4 + por xmm3,xmm1 + + lea rsi,[((64-0))+rsi] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montq + + pcmpeqd xmm5,xmm4 + pshufd xmm4,xmm3,0xb1 + mov rax,QWORD[rbx] + + mov r9,r12 + por xmm4,xmm3 + pshufd xmm5,xmm5,0 + pshufd xmm3,xmm4,0x1e + mov r10,r13 + por xmm4,xmm3 + pxor xmm3,xmm3 + mov r11,r14 + pcmpeqd xmm4,xmm3 + pshufd xmm4,xmm4,0 + + lea rsi,[((32-0))+rsp] + mov r12,r15 + lea rdi,[rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[320+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[288+rsp] + call __ecp_nistz256_mul_montq + + 
mov rax,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[352+rsp] + lea rdi,[96+rsp] + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[((0+64))+rsp] + mov r14,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r15,QWORD[((16+64))+rsp] + mov r8,QWORD[((24+64))+rsp] + lea rdi,[128+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[((0+96))+rsp] + mov r14,QWORD[((8+96))+rsp] + lea rsi,[((0+96))+rsp] + mov r15,QWORD[((16+96))+rsp] + mov r8,QWORD[((24+96))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[128+rsp] + lea rbx,[128+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[160+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[320+rsp] + lea rbx,[320+rsp] + mov r9,QWORD[((0+128))+rsp] + mov r10,QWORD[((8+128))+rsp] + lea rsi,[((0+128))+rsp] + mov r11,QWORD[((16+128))+rsp] + mov r12,QWORD[((24+128))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_mul_montq + + + + + add r12,r12 + lea rsi,[192+rsp] + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + sbb r11,r11 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + test r11,r11 + + cmovz r12,rax + mov rax,QWORD[rsi] + cmovz r13,rbp + mov rbp,QWORD[8+rsi] + cmovz r8,rcx + mov rcx,QWORD[16+rsi] + cmovz r9,r10 + mov r10,QWORD[24+rsi] + + call __ecp_nistz256_subq + + lea rbx,[160+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[((0+0))+rsp] + mov rbp,QWORD[((0+8))+rsp] + mov rcx,QWORD[((0+16))+rsp] + mov r10,QWORD[((0+24))+rsp] + lea rdi,[64+rsp] + + call __ecp_nistz256_subq + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + mov rax,QWORD[352+rsp] + lea rbx,[352+rsp] + mov r9,QWORD[((0+160))+rsp] + mov r10,QWORD[((8+160))+rsp] + lea rsi,[((0+160))+rsp] + mov r11,QWORD[((16+160))+rsp] + mov r12,QWORD[((24+160))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[96+rsp] + lea rbx,[96+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[32+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_sub_fromq + +DB 102,72,15,126,199 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[288+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((288+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[$L$ONE_mont] + pand xmm3,XMMWORD[(($L$ONE_mont+16))] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[384+rsp] + pand xmm3,XMMWORD[((384+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[64+rdi],xmm2 + movdqu XMMWORD[80+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[224+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((224+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[416+rsp] + pand xmm3,XMMWORD[((416+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[320+rsp] + pand xmm3,XMMWORD[((320+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[rdi],xmm2 + movdqu 
XMMWORD[16+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[256+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((256+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[448+rsp] + pand xmm3,XMMWORD[((448+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[352+rsp] + pand xmm3,XMMWORD[((352+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm3 + + add rsp,32*15+8 + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + pop rbp + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_ecp_nistz256_point_add_affine: diff --git a/packager/third_party/boringssl/win-x86_64/crypto/sha/sha1-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/sha/sha1-x86_64.asm index 0f5361a1b5..168f78db3f 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/sha/sha1-x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/sha/sha1-x86_64.asm @@ -24,6 +24,11 @@ $L$SEH_begin_sha1_block_data_order: mov r10d,DWORD[((OPENSSL_ia32cap_P+8))] test r8d,512 jz NEAR $L$ialu + and r8d,268435456 + and r9d,1073741824 + or r8d,r9d + cmp r8d,1342177280 + je NEAR _avx_shortcut jmp NEAR _ssse3_shortcut ALIGN 16 @@ -2445,6 +2450,1146 @@ $L$epilogue_ssse3: mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret $L$SEH_end_sha1_block_data_order_ssse3: + +ALIGN 16 +sha1_block_data_order_avx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha1_block_data_order_avx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + +_avx_shortcut: + mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + lea rsp,[((-160))+rsp] + vzeroupper + vmovaps XMMWORD[(-40-96)+rax],xmm6 + vmovaps XMMWORD[(-40-80)+rax],xmm7 + vmovaps XMMWORD[(-40-64)+rax],xmm8 + vmovaps XMMWORD[(-40-48)+rax],xmm9 + vmovaps XMMWORD[(-40-32)+rax],xmm10 + vmovaps XMMWORD[(-40-16)+rax],xmm11 +$L$prologue_avx: + mov r14,rax + and rsp,-64 + mov r8,rdi + mov r9,rsi + mov r10,rdx + + shl r10,6 + add r10,r9 + lea r11,[((K_XX_XX+64))] + + mov eax,DWORD[r8] + mov ebx,DWORD[4+r8] + mov ecx,DWORD[8+r8] + mov edx,DWORD[12+r8] + mov esi,ebx + mov ebp,DWORD[16+r8] + mov edi,ecx + xor edi,edx + and esi,edi + + vmovdqa xmm6,XMMWORD[64+r11] + vmovdqa xmm11,XMMWORD[((-64))+r11] + vmovdqu xmm0,XMMWORD[r9] + vmovdqu xmm1,XMMWORD[16+r9] + vmovdqu xmm2,XMMWORD[32+r9] + vmovdqu xmm3,XMMWORD[48+r9] + vpshufb xmm0,xmm0,xmm6 + add r9,64 + vpshufb xmm1,xmm1,xmm6 + vpshufb xmm2,xmm2,xmm6 + vpshufb xmm3,xmm3,xmm6 + vpaddd xmm4,xmm0,xmm11 + vpaddd xmm5,xmm1,xmm11 + vpaddd xmm6,xmm2,xmm11 + vmovdqa XMMWORD[rsp],xmm4 + vmovdqa XMMWORD[16+rsp],xmm5 + vmovdqa XMMWORD[32+rsp],xmm6 + jmp NEAR $L$oop_avx +ALIGN 16 +$L$oop_avx: + shrd ebx,ebx,2 + xor esi,edx + vpalignr xmm4,xmm1,xmm0,8 + mov edi,eax + add ebp,DWORD[rsp] + vpaddd xmm9,xmm11,xmm3 + xor ebx,ecx + shld eax,eax,5 + vpsrldq xmm8,xmm3,4 + add ebp,esi + and edi,ebx + vpxor xmm4,xmm4,xmm0 + xor ebx,ecx + add ebp,eax + vpxor xmm8,xmm8,xmm2 + shrd eax,eax,7 + xor edi,ecx + mov esi,ebp + add edx,DWORD[4+rsp] + vpxor xmm4,xmm4,xmm8 + xor eax,ebx + shld ebp,ebp,5 + vmovdqa XMMWORD[48+rsp],xmm9 + add edx,edi + and esi,eax + vpsrld xmm8,xmm4,31 + xor eax,ebx + add edx,ebp + shrd ebp,ebp,7 + xor esi,ebx + vpslldq xmm10,xmm4,12 + vpaddd xmm4,xmm4,xmm4 + mov edi,edx + add ecx,DWORD[8+rsp] + xor ebp,eax + shld edx,edx,5 + vpsrld xmm9,xmm10,30 + vpor xmm4,xmm4,xmm8 + add ecx,esi + and 
edi,ebp + xor ebp,eax + add ecx,edx + vpslld xmm10,xmm10,2 + vpxor xmm4,xmm4,xmm9 + shrd edx,edx,7 + xor edi,eax + mov esi,ecx + add ebx,DWORD[12+rsp] + vpxor xmm4,xmm4,xmm10 + xor edx,ebp + shld ecx,ecx,5 + add ebx,edi + and esi,edx + xor edx,ebp + add ebx,ecx + shrd ecx,ecx,7 + xor esi,ebp + vpalignr xmm5,xmm2,xmm1,8 + mov edi,ebx + add eax,DWORD[16+rsp] + vpaddd xmm9,xmm11,xmm4 + xor ecx,edx + shld ebx,ebx,5 + vpsrldq xmm8,xmm4,4 + add eax,esi + and edi,ecx + vpxor xmm5,xmm5,xmm1 + xor ecx,edx + add eax,ebx + vpxor xmm8,xmm8,xmm3 + shrd ebx,ebx,7 + xor edi,edx + mov esi,eax + add ebp,DWORD[20+rsp] + vpxor xmm5,xmm5,xmm8 + xor ebx,ecx + shld eax,eax,5 + vmovdqa XMMWORD[rsp],xmm9 + add ebp,edi + and esi,ebx + vpsrld xmm8,xmm5,31 + xor ebx,ecx + add ebp,eax + shrd eax,eax,7 + xor esi,ecx + vpslldq xmm10,xmm5,12 + vpaddd xmm5,xmm5,xmm5 + mov edi,ebp + add edx,DWORD[24+rsp] + xor eax,ebx + shld ebp,ebp,5 + vpsrld xmm9,xmm10,30 + vpor xmm5,xmm5,xmm8 + add edx,esi + and edi,eax + xor eax,ebx + add edx,ebp + vpslld xmm10,xmm10,2 + vpxor xmm5,xmm5,xmm9 + shrd ebp,ebp,7 + xor edi,ebx + mov esi,edx + add ecx,DWORD[28+rsp] + vpxor xmm5,xmm5,xmm10 + xor ebp,eax + shld edx,edx,5 + vmovdqa xmm11,XMMWORD[((-32))+r11] + add ecx,edi + and esi,ebp + xor ebp,eax + add ecx,edx + shrd edx,edx,7 + xor esi,eax + vpalignr xmm6,xmm3,xmm2,8 + mov edi,ecx + add ebx,DWORD[32+rsp] + vpaddd xmm9,xmm11,xmm5 + xor edx,ebp + shld ecx,ecx,5 + vpsrldq xmm8,xmm5,4 + add ebx,esi + and edi,edx + vpxor xmm6,xmm6,xmm2 + xor edx,ebp + add ebx,ecx + vpxor xmm8,xmm8,xmm4 + shrd ecx,ecx,7 + xor edi,ebp + mov esi,ebx + add eax,DWORD[36+rsp] + vpxor xmm6,xmm6,xmm8 + xor ecx,edx + shld ebx,ebx,5 + vmovdqa XMMWORD[16+rsp],xmm9 + add eax,edi + and esi,ecx + vpsrld xmm8,xmm6,31 + xor ecx,edx + add eax,ebx + shrd ebx,ebx,7 + xor esi,edx + vpslldq xmm10,xmm6,12 + vpaddd xmm6,xmm6,xmm6 + mov edi,eax + add ebp,DWORD[40+rsp] + xor ebx,ecx + shld eax,eax,5 + vpsrld xmm9,xmm10,30 + vpor xmm6,xmm6,xmm8 + add ebp,esi + and edi,ebx + xor ebx,ecx + add ebp,eax + vpslld xmm10,xmm10,2 + vpxor xmm6,xmm6,xmm9 + shrd eax,eax,7 + xor edi,ecx + mov esi,ebp + add edx,DWORD[44+rsp] + vpxor xmm6,xmm6,xmm10 + xor eax,ebx + shld ebp,ebp,5 + add edx,edi + and esi,eax + xor eax,ebx + add edx,ebp + shrd ebp,ebp,7 + xor esi,ebx + vpalignr xmm7,xmm4,xmm3,8 + mov edi,edx + add ecx,DWORD[48+rsp] + vpaddd xmm9,xmm11,xmm6 + xor ebp,eax + shld edx,edx,5 + vpsrldq xmm8,xmm6,4 + add ecx,esi + and edi,ebp + vpxor xmm7,xmm7,xmm3 + xor ebp,eax + add ecx,edx + vpxor xmm8,xmm8,xmm5 + shrd edx,edx,7 + xor edi,eax + mov esi,ecx + add ebx,DWORD[52+rsp] + vpxor xmm7,xmm7,xmm8 + xor edx,ebp + shld ecx,ecx,5 + vmovdqa XMMWORD[32+rsp],xmm9 + add ebx,edi + and esi,edx + vpsrld xmm8,xmm7,31 + xor edx,ebp + add ebx,ecx + shrd ecx,ecx,7 + xor esi,ebp + vpslldq xmm10,xmm7,12 + vpaddd xmm7,xmm7,xmm7 + mov edi,ebx + add eax,DWORD[56+rsp] + xor ecx,edx + shld ebx,ebx,5 + vpsrld xmm9,xmm10,30 + vpor xmm7,xmm7,xmm8 + add eax,esi + and edi,ecx + xor ecx,edx + add eax,ebx + vpslld xmm10,xmm10,2 + vpxor xmm7,xmm7,xmm9 + shrd ebx,ebx,7 + xor edi,edx + mov esi,eax + add ebp,DWORD[60+rsp] + vpxor xmm7,xmm7,xmm10 + xor ebx,ecx + shld eax,eax,5 + add ebp,edi + and esi,ebx + xor ebx,ecx + add ebp,eax + vpalignr xmm8,xmm7,xmm6,8 + vpxor xmm0,xmm0,xmm4 + shrd eax,eax,7 + xor esi,ecx + mov edi,ebp + add edx,DWORD[rsp] + vpxor xmm0,xmm0,xmm1 + xor eax,ebx + shld ebp,ebp,5 + vpaddd xmm9,xmm11,xmm7 + add edx,esi + and edi,eax + vpxor xmm0,xmm0,xmm8 + xor eax,ebx + add edx,ebp + shrd ebp,ebp,7 + xor edi,ebx 
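+; The AVX path interleaves the scalar SHA-1 round function (shld/shrd, add,
+; xor over eax..ebp) with vector message-schedule updates
+; (vpalignr/vpxor/vpslld/vpsrld), preparing W[t] for upcoming rounds while
+; the current ones retire; vpaddd pre-adds the K_XX_XX round constant and
+; the sum is spilled to [rsp] for the scalar adds.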
+ vpsrld xmm8,xmm0,30 + vmovdqa XMMWORD[48+rsp],xmm9 + mov esi,edx + add ecx,DWORD[4+rsp] + xor ebp,eax + shld edx,edx,5 + vpslld xmm0,xmm0,2 + add ecx,edi + and esi,ebp + xor ebp,eax + add ecx,edx + shrd edx,edx,7 + xor esi,eax + mov edi,ecx + add ebx,DWORD[8+rsp] + vpor xmm0,xmm0,xmm8 + xor edx,ebp + shld ecx,ecx,5 + add ebx,esi + and edi,edx + xor edx,ebp + add ebx,ecx + add eax,DWORD[12+rsp] + xor edi,ebp + mov esi,ebx + shld ebx,ebx,5 + add eax,edi + xor esi,edx + shrd ecx,ecx,7 + add eax,ebx + vpalignr xmm8,xmm0,xmm7,8 + vpxor xmm1,xmm1,xmm5 + add ebp,DWORD[16+rsp] + xor esi,ecx + mov edi,eax + shld eax,eax,5 + vpxor xmm1,xmm1,xmm2 + add ebp,esi + xor edi,ecx + vpaddd xmm9,xmm11,xmm0 + shrd ebx,ebx,7 + add ebp,eax + vpxor xmm1,xmm1,xmm8 + add edx,DWORD[20+rsp] + xor edi,ebx + mov esi,ebp + shld ebp,ebp,5 + vpsrld xmm8,xmm1,30 + vmovdqa XMMWORD[rsp],xmm9 + add edx,edi + xor esi,ebx + shrd eax,eax,7 + add edx,ebp + vpslld xmm1,xmm1,2 + add ecx,DWORD[24+rsp] + xor esi,eax + mov edi,edx + shld edx,edx,5 + add ecx,esi + xor edi,eax + shrd ebp,ebp,7 + add ecx,edx + vpor xmm1,xmm1,xmm8 + add ebx,DWORD[28+rsp] + xor edi,ebp + mov esi,ecx + shld ecx,ecx,5 + add ebx,edi + xor esi,ebp + shrd edx,edx,7 + add ebx,ecx + vpalignr xmm8,xmm1,xmm0,8 + vpxor xmm2,xmm2,xmm6 + add eax,DWORD[32+rsp] + xor esi,edx + mov edi,ebx + shld ebx,ebx,5 + vpxor xmm2,xmm2,xmm3 + add eax,esi + xor edi,edx + vpaddd xmm9,xmm11,xmm1 + vmovdqa xmm11,XMMWORD[r11] + shrd ecx,ecx,7 + add eax,ebx + vpxor xmm2,xmm2,xmm8 + add ebp,DWORD[36+rsp] + xor edi,ecx + mov esi,eax + shld eax,eax,5 + vpsrld xmm8,xmm2,30 + vmovdqa XMMWORD[16+rsp],xmm9 + add ebp,edi + xor esi,ecx + shrd ebx,ebx,7 + add ebp,eax + vpslld xmm2,xmm2,2 + add edx,DWORD[40+rsp] + xor esi,ebx + mov edi,ebp + shld ebp,ebp,5 + add edx,esi + xor edi,ebx + shrd eax,eax,7 + add edx,ebp + vpor xmm2,xmm2,xmm8 + add ecx,DWORD[44+rsp] + xor edi,eax + mov esi,edx + shld edx,edx,5 + add ecx,edi + xor esi,eax + shrd ebp,ebp,7 + add ecx,edx + vpalignr xmm8,xmm2,xmm1,8 + vpxor xmm3,xmm3,xmm7 + add ebx,DWORD[48+rsp] + xor esi,ebp + mov edi,ecx + shld ecx,ecx,5 + vpxor xmm3,xmm3,xmm4 + add ebx,esi + xor edi,ebp + vpaddd xmm9,xmm11,xmm2 + shrd edx,edx,7 + add ebx,ecx + vpxor xmm3,xmm3,xmm8 + add eax,DWORD[52+rsp] + xor edi,edx + mov esi,ebx + shld ebx,ebx,5 + vpsrld xmm8,xmm3,30 + vmovdqa XMMWORD[32+rsp],xmm9 + add eax,edi + xor esi,edx + shrd ecx,ecx,7 + add eax,ebx + vpslld xmm3,xmm3,2 + add ebp,DWORD[56+rsp] + xor esi,ecx + mov edi,eax + shld eax,eax,5 + add ebp,esi + xor edi,ecx + shrd ebx,ebx,7 + add ebp,eax + vpor xmm3,xmm3,xmm8 + add edx,DWORD[60+rsp] + xor edi,ebx + mov esi,ebp + shld ebp,ebp,5 + add edx,edi + xor esi,ebx + shrd eax,eax,7 + add edx,ebp + vpalignr xmm8,xmm3,xmm2,8 + vpxor xmm4,xmm4,xmm0 + add ecx,DWORD[rsp] + xor esi,eax + mov edi,edx + shld edx,edx,5 + vpxor xmm4,xmm4,xmm5 + add ecx,esi + xor edi,eax + vpaddd xmm9,xmm11,xmm3 + shrd ebp,ebp,7 + add ecx,edx + vpxor xmm4,xmm4,xmm8 + add ebx,DWORD[4+rsp] + xor edi,ebp + mov esi,ecx + shld ecx,ecx,5 + vpsrld xmm8,xmm4,30 + vmovdqa XMMWORD[48+rsp],xmm9 + add ebx,edi + xor esi,ebp + shrd edx,edx,7 + add ebx,ecx + vpslld xmm4,xmm4,2 + add eax,DWORD[8+rsp] + xor esi,edx + mov edi,ebx + shld ebx,ebx,5 + add eax,esi + xor edi,edx + shrd ecx,ecx,7 + add eax,ebx + vpor xmm4,xmm4,xmm8 + add ebp,DWORD[12+rsp] + xor edi,ecx + mov esi,eax + shld eax,eax,5 + add ebp,edi + xor esi,ecx + shrd ebx,ebx,7 + add ebp,eax + vpalignr xmm8,xmm4,xmm3,8 + vpxor xmm5,xmm5,xmm1 + add edx,DWORD[16+rsp] + xor esi,ebx + mov edi,ebp + shld 
ebp,ebp,5 + vpxor xmm5,xmm5,xmm6 + add edx,esi + xor edi,ebx + vpaddd xmm9,xmm11,xmm4 + shrd eax,eax,7 + add edx,ebp + vpxor xmm5,xmm5,xmm8 + add ecx,DWORD[20+rsp] + xor edi,eax + mov esi,edx + shld edx,edx,5 + vpsrld xmm8,xmm5,30 + vmovdqa XMMWORD[rsp],xmm9 + add ecx,edi + xor esi,eax + shrd ebp,ebp,7 + add ecx,edx + vpslld xmm5,xmm5,2 + add ebx,DWORD[24+rsp] + xor esi,ebp + mov edi,ecx + shld ecx,ecx,5 + add ebx,esi + xor edi,ebp + shrd edx,edx,7 + add ebx,ecx + vpor xmm5,xmm5,xmm8 + add eax,DWORD[28+rsp] + shrd ecx,ecx,7 + mov esi,ebx + xor edi,edx + shld ebx,ebx,5 + add eax,edi + xor esi,ecx + xor ecx,edx + add eax,ebx + vpalignr xmm8,xmm5,xmm4,8 + vpxor xmm6,xmm6,xmm2 + add ebp,DWORD[32+rsp] + and esi,ecx + xor ecx,edx + shrd ebx,ebx,7 + vpxor xmm6,xmm6,xmm7 + mov edi,eax + xor esi,ecx + vpaddd xmm9,xmm11,xmm5 + shld eax,eax,5 + add ebp,esi + vpxor xmm6,xmm6,xmm8 + xor edi,ebx + xor ebx,ecx + add ebp,eax + add edx,DWORD[36+rsp] + vpsrld xmm8,xmm6,30 + vmovdqa XMMWORD[16+rsp],xmm9 + and edi,ebx + xor ebx,ecx + shrd eax,eax,7 + mov esi,ebp + vpslld xmm6,xmm6,2 + xor edi,ebx + shld ebp,ebp,5 + add edx,edi + xor esi,eax + xor eax,ebx + add edx,ebp + add ecx,DWORD[40+rsp] + and esi,eax + vpor xmm6,xmm6,xmm8 + xor eax,ebx + shrd ebp,ebp,7 + mov edi,edx + xor esi,eax + shld edx,edx,5 + add ecx,esi + xor edi,ebp + xor ebp,eax + add ecx,edx + add ebx,DWORD[44+rsp] + and edi,ebp + xor ebp,eax + shrd edx,edx,7 + mov esi,ecx + xor edi,ebp + shld ecx,ecx,5 + add ebx,edi + xor esi,edx + xor edx,ebp + add ebx,ecx + vpalignr xmm8,xmm6,xmm5,8 + vpxor xmm7,xmm7,xmm3 + add eax,DWORD[48+rsp] + and esi,edx + xor edx,ebp + shrd ecx,ecx,7 + vpxor xmm7,xmm7,xmm0 + mov edi,ebx + xor esi,edx + vpaddd xmm9,xmm11,xmm6 + vmovdqa xmm11,XMMWORD[32+r11] + shld ebx,ebx,5 + add eax,esi + vpxor xmm7,xmm7,xmm8 + xor edi,ecx + xor ecx,edx + add eax,ebx + add ebp,DWORD[52+rsp] + vpsrld xmm8,xmm7,30 + vmovdqa XMMWORD[32+rsp],xmm9 + and edi,ecx + xor ecx,edx + shrd ebx,ebx,7 + mov esi,eax + vpslld xmm7,xmm7,2 + xor edi,ecx + shld eax,eax,5 + add ebp,edi + xor esi,ebx + xor ebx,ecx + add ebp,eax + add edx,DWORD[56+rsp] + and esi,ebx + vpor xmm7,xmm7,xmm8 + xor ebx,ecx + shrd eax,eax,7 + mov edi,ebp + xor esi,ebx + shld ebp,ebp,5 + add edx,esi + xor edi,eax + xor eax,ebx + add edx,ebp + add ecx,DWORD[60+rsp] + and edi,eax + xor eax,ebx + shrd ebp,ebp,7 + mov esi,edx + xor edi,eax + shld edx,edx,5 + add ecx,edi + xor esi,ebp + xor ebp,eax + add ecx,edx + vpalignr xmm8,xmm7,xmm6,8 + vpxor xmm0,xmm0,xmm4 + add ebx,DWORD[rsp] + and esi,ebp + xor ebp,eax + shrd edx,edx,7 + vpxor xmm0,xmm0,xmm1 + mov edi,ecx + xor esi,ebp + vpaddd xmm9,xmm11,xmm7 + shld ecx,ecx,5 + add ebx,esi + vpxor xmm0,xmm0,xmm8 + xor edi,edx + xor edx,ebp + add ebx,ecx + add eax,DWORD[4+rsp] + vpsrld xmm8,xmm0,30 + vmovdqa XMMWORD[48+rsp],xmm9 + and edi,edx + xor edx,ebp + shrd ecx,ecx,7 + mov esi,ebx + vpslld xmm0,xmm0,2 + xor edi,edx + shld ebx,ebx,5 + add eax,edi + xor esi,ecx + xor ecx,edx + add eax,ebx + add ebp,DWORD[8+rsp] + and esi,ecx + vpor xmm0,xmm0,xmm8 + xor ecx,edx + shrd ebx,ebx,7 + mov edi,eax + xor esi,ecx + shld eax,eax,5 + add ebp,esi + xor edi,ebx + xor ebx,ecx + add ebp,eax + add edx,DWORD[12+rsp] + and edi,ebx + xor ebx,ecx + shrd eax,eax,7 + mov esi,ebp + xor edi,ebx + shld ebp,ebp,5 + add edx,edi + xor esi,eax + xor eax,ebx + add edx,ebp + vpalignr xmm8,xmm0,xmm7,8 + vpxor xmm1,xmm1,xmm5 + add ecx,DWORD[16+rsp] + and esi,eax + xor eax,ebx + shrd ebp,ebp,7 + vpxor xmm1,xmm1,xmm2 + mov edi,edx + xor esi,eax + vpaddd xmm9,xmm11,xmm0 + shld 
edx,edx,5 + add ecx,esi + vpxor xmm1,xmm1,xmm8 + xor edi,ebp + xor ebp,eax + add ecx,edx + add ebx,DWORD[20+rsp] + vpsrld xmm8,xmm1,30 + vmovdqa XMMWORD[rsp],xmm9 + and edi,ebp + xor ebp,eax + shrd edx,edx,7 + mov esi,ecx + vpslld xmm1,xmm1,2 + xor edi,ebp + shld ecx,ecx,5 + add ebx,edi + xor esi,edx + xor edx,ebp + add ebx,ecx + add eax,DWORD[24+rsp] + and esi,edx + vpor xmm1,xmm1,xmm8 + xor edx,ebp + shrd ecx,ecx,7 + mov edi,ebx + xor esi,edx + shld ebx,ebx,5 + add eax,esi + xor edi,ecx + xor ecx,edx + add eax,ebx + add ebp,DWORD[28+rsp] + and edi,ecx + xor ecx,edx + shrd ebx,ebx,7 + mov esi,eax + xor edi,ecx + shld eax,eax,5 + add ebp,edi + xor esi,ebx + xor ebx,ecx + add ebp,eax + vpalignr xmm8,xmm1,xmm0,8 + vpxor xmm2,xmm2,xmm6 + add edx,DWORD[32+rsp] + and esi,ebx + xor ebx,ecx + shrd eax,eax,7 + vpxor xmm2,xmm2,xmm3 + mov edi,ebp + xor esi,ebx + vpaddd xmm9,xmm11,xmm1 + shld ebp,ebp,5 + add edx,esi + vpxor xmm2,xmm2,xmm8 + xor edi,eax + xor eax,ebx + add edx,ebp + add ecx,DWORD[36+rsp] + vpsrld xmm8,xmm2,30 + vmovdqa XMMWORD[16+rsp],xmm9 + and edi,eax + xor eax,ebx + shrd ebp,ebp,7 + mov esi,edx + vpslld xmm2,xmm2,2 + xor edi,eax + shld edx,edx,5 + add ecx,edi + xor esi,ebp + xor ebp,eax + add ecx,edx + add ebx,DWORD[40+rsp] + and esi,ebp + vpor xmm2,xmm2,xmm8 + xor ebp,eax + shrd edx,edx,7 + mov edi,ecx + xor esi,ebp + shld ecx,ecx,5 + add ebx,esi + xor edi,edx + xor edx,ebp + add ebx,ecx + add eax,DWORD[44+rsp] + and edi,edx + xor edx,ebp + shrd ecx,ecx,7 + mov esi,ebx + xor edi,edx + shld ebx,ebx,5 + add eax,edi + xor esi,edx + add eax,ebx + vpalignr xmm8,xmm2,xmm1,8 + vpxor xmm3,xmm3,xmm7 + add ebp,DWORD[48+rsp] + xor esi,ecx + mov edi,eax + shld eax,eax,5 + vpxor xmm3,xmm3,xmm4 + add ebp,esi + xor edi,ecx + vpaddd xmm9,xmm11,xmm2 + shrd ebx,ebx,7 + add ebp,eax + vpxor xmm3,xmm3,xmm8 + add edx,DWORD[52+rsp] + xor edi,ebx + mov esi,ebp + shld ebp,ebp,5 + vpsrld xmm8,xmm3,30 + vmovdqa XMMWORD[32+rsp],xmm9 + add edx,edi + xor esi,ebx + shrd eax,eax,7 + add edx,ebp + vpslld xmm3,xmm3,2 + add ecx,DWORD[56+rsp] + xor esi,eax + mov edi,edx + shld edx,edx,5 + add ecx,esi + xor edi,eax + shrd ebp,ebp,7 + add ecx,edx + vpor xmm3,xmm3,xmm8 + add ebx,DWORD[60+rsp] + xor edi,ebp + mov esi,ecx + shld ecx,ecx,5 + add ebx,edi + xor esi,ebp + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD[rsp] + vpaddd xmm9,xmm11,xmm3 + xor esi,edx + mov edi,ebx + shld ebx,ebx,5 + add eax,esi + vmovdqa XMMWORD[48+rsp],xmm9 + xor edi,edx + shrd ecx,ecx,7 + add eax,ebx + add ebp,DWORD[4+rsp] + xor edi,ecx + mov esi,eax + shld eax,eax,5 + add ebp,edi + xor esi,ecx + shrd ebx,ebx,7 + add ebp,eax + add edx,DWORD[8+rsp] + xor esi,ebx + mov edi,ebp + shld ebp,ebp,5 + add edx,esi + xor edi,ebx + shrd eax,eax,7 + add edx,ebp + add ecx,DWORD[12+rsp] + xor edi,eax + mov esi,edx + shld edx,edx,5 + add ecx,edi + xor esi,eax + shrd ebp,ebp,7 + add ecx,edx + cmp r9,r10 + je NEAR $L$done_avx + vmovdqa xmm6,XMMWORD[64+r11] + vmovdqa xmm11,XMMWORD[((-64))+r11] + vmovdqu xmm0,XMMWORD[r9] + vmovdqu xmm1,XMMWORD[16+r9] + vmovdqu xmm2,XMMWORD[32+r9] + vmovdqu xmm3,XMMWORD[48+r9] + vpshufb xmm0,xmm0,xmm6 + add r9,64 + add ebx,DWORD[16+rsp] + xor esi,ebp + vpshufb xmm1,xmm1,xmm6 + mov edi,ecx + shld ecx,ecx,5 + vpaddd xmm4,xmm0,xmm11 + add ebx,esi + xor edi,ebp + shrd edx,edx,7 + add ebx,ecx + vmovdqa XMMWORD[rsp],xmm4 + add eax,DWORD[20+rsp] + xor edi,edx + mov esi,ebx + shld ebx,ebx,5 + add eax,edi + xor esi,edx + shrd ecx,ecx,7 + add eax,ebx + add ebp,DWORD[24+rsp] + xor esi,ecx + mov edi,eax + shld eax,eax,5 + add ebp,esi + xor 
edi,ecx + shrd ebx,ebx,7 + add ebp,eax + add edx,DWORD[28+rsp] + xor edi,ebx + mov esi,ebp + shld ebp,ebp,5 + add edx,edi + xor esi,ebx + shrd eax,eax,7 + add edx,ebp + add ecx,DWORD[32+rsp] + xor esi,eax + vpshufb xmm2,xmm2,xmm6 + mov edi,edx + shld edx,edx,5 + vpaddd xmm5,xmm1,xmm11 + add ecx,esi + xor edi,eax + shrd ebp,ebp,7 + add ecx,edx + vmovdqa XMMWORD[16+rsp],xmm5 + add ebx,DWORD[36+rsp] + xor edi,ebp + mov esi,ecx + shld ecx,ecx,5 + add ebx,edi + xor esi,ebp + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD[40+rsp] + xor esi,edx + mov edi,ebx + shld ebx,ebx,5 + add eax,esi + xor edi,edx + shrd ecx,ecx,7 + add eax,ebx + add ebp,DWORD[44+rsp] + xor edi,ecx + mov esi,eax + shld eax,eax,5 + add ebp,edi + xor esi,ecx + shrd ebx,ebx,7 + add ebp,eax + add edx,DWORD[48+rsp] + xor esi,ebx + vpshufb xmm3,xmm3,xmm6 + mov edi,ebp + shld ebp,ebp,5 + vpaddd xmm6,xmm2,xmm11 + add edx,esi + xor edi,ebx + shrd eax,eax,7 + add edx,ebp + vmovdqa XMMWORD[32+rsp],xmm6 + add ecx,DWORD[52+rsp] + xor edi,eax + mov esi,edx + shld edx,edx,5 + add ecx,edi + xor esi,eax + shrd ebp,ebp,7 + add ecx,edx + add ebx,DWORD[56+rsp] + xor esi,ebp + mov edi,ecx + shld ecx,ecx,5 + add ebx,esi + xor edi,ebp + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD[60+rsp] + xor edi,edx + mov esi,ebx + shld ebx,ebx,5 + add eax,edi + shrd ecx,ecx,7 + add eax,ebx + add eax,DWORD[r8] + add esi,DWORD[4+r8] + add ecx,DWORD[8+r8] + add edx,DWORD[12+r8] + mov DWORD[r8],eax + add ebp,DWORD[16+r8] + mov DWORD[4+r8],esi + mov ebx,esi + mov DWORD[8+r8],ecx + mov edi,ecx + mov DWORD[12+r8],edx + xor edi,edx + mov DWORD[16+r8],ebp + and esi,edi + jmp NEAR $L$oop_avx + +ALIGN 16 +$L$done_avx: + add ebx,DWORD[16+rsp] + xor esi,ebp + mov edi,ecx + shld ecx,ecx,5 + add ebx,esi + xor edi,ebp + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD[20+rsp] + xor edi,edx + mov esi,ebx + shld ebx,ebx,5 + add eax,edi + xor esi,edx + shrd ecx,ecx,7 + add eax,ebx + add ebp,DWORD[24+rsp] + xor esi,ecx + mov edi,eax + shld eax,eax,5 + add ebp,esi + xor edi,ecx + shrd ebx,ebx,7 + add ebp,eax + add edx,DWORD[28+rsp] + xor edi,ebx + mov esi,ebp + shld ebp,ebp,5 + add edx,edi + xor esi,ebx + shrd eax,eax,7 + add edx,ebp + add ecx,DWORD[32+rsp] + xor esi,eax + mov edi,edx + shld edx,edx,5 + add ecx,esi + xor edi,eax + shrd ebp,ebp,7 + add ecx,edx + add ebx,DWORD[36+rsp] + xor edi,ebp + mov esi,ecx + shld ecx,ecx,5 + add ebx,edi + xor esi,ebp + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD[40+rsp] + xor esi,edx + mov edi,ebx + shld ebx,ebx,5 + add eax,esi + xor edi,edx + shrd ecx,ecx,7 + add eax,ebx + add ebp,DWORD[44+rsp] + xor edi,ecx + mov esi,eax + shld eax,eax,5 + add ebp,edi + xor esi,ecx + shrd ebx,ebx,7 + add ebp,eax + add edx,DWORD[48+rsp] + xor esi,ebx + mov edi,ebp + shld ebp,ebp,5 + add edx,esi + xor edi,ebx + shrd eax,eax,7 + add edx,ebp + add ecx,DWORD[52+rsp] + xor edi,eax + mov esi,edx + shld edx,edx,5 + add ecx,edi + xor esi,eax + shrd ebp,ebp,7 + add ecx,edx + add ebx,DWORD[56+rsp] + xor esi,ebp + mov edi,ecx + shld ecx,ecx,5 + add ebx,esi + xor edi,ebp + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD[60+rsp] + xor edi,edx + mov esi,ebx + shld ebx,ebx,5 + add eax,edi + shrd ecx,ecx,7 + add eax,ebx + vzeroupper + + add eax,DWORD[r8] + add esi,DWORD[4+r8] + add ecx,DWORD[8+r8] + mov DWORD[r8],eax + add edx,DWORD[12+r8] + mov DWORD[4+r8],esi + add ebp,DWORD[16+r8] + mov DWORD[8+r8],ecx + mov DWORD[12+r8],edx + mov DWORD[16+r8],ebp + movaps xmm6,XMMWORD[((-40-96))+r14] + movaps xmm7,XMMWORD[((-40-80))+r14] + movaps xmm8,XMMWORD[((-40-64))+r14] + movaps 
xmm9,XMMWORD[((-40-48))+r14]
+ movaps xmm10,XMMWORD[((-40-32))+r14]
+ movaps xmm11,XMMWORD[((-40-16))+r14]
+ lea rsi,[r14]
+ mov r14,QWORD[((-40))+rsi]
+ mov r13,QWORD[((-32))+rsi]
+ mov r12,QWORD[((-24))+rsi]
+ mov rbp,QWORD[((-16))+rsi]
+ mov rbx,QWORD[((-8))+rsi]
+ lea rsp,[rsi]
+$L$epilogue_avx:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+$L$SEH_end_sha1_block_data_order_avx:
 ALIGN 64
 K_XX_XX:
 DD 0x5a827999,0x5a827999,0x5a827999,0x5a827999
@@ -2605,6 +3750,9 @@ ALIGN 4
 DD $L$SEH_begin_sha1_block_data_order_ssse3 wrt ..imagebase
 DD $L$SEH_end_sha1_block_data_order_ssse3 wrt ..imagebase
 DD $L$SEH_info_sha1_block_data_order_ssse3 wrt ..imagebase
+ DD $L$SEH_begin_sha1_block_data_order_avx wrt ..imagebase
+ DD $L$SEH_end_sha1_block_data_order_avx wrt ..imagebase
+ DD $L$SEH_info_sha1_block_data_order_avx wrt ..imagebase
 section .xdata rdata align=8
 ALIGN 8
 $L$SEH_info_sha1_block_data_order:
@@ -2614,3 +3762,7 @@ $L$SEH_info_sha1_block_data_order_ssse3:
 DB 9,0,0,0
 DD ssse3_handler wrt ..imagebase
 DD $L$prologue_ssse3 wrt ..imagebase,$L$epilogue_ssse3 wrt ..imagebase
+$L$SEH_info_sha1_block_data_order_avx:
+DB 9,0,0,0
+ DD ssse3_handler wrt ..imagebase
+ DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase
diff --git a/packager/third_party/boringssl/win-x86_64/crypto/sha/sha256-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/sha/sha256-x86_64.asm
index e6193c5b9d..efaf9b55fc 100644
--- a/packager/third_party/boringssl/win-x86_64/crypto/sha/sha256-x86_64.asm
+++ b/packager/third_party/boringssl/win-x86_64/crypto/sha/sha256-x86_64.asm
@@ -23,6 +23,11 @@ $L$SEH_begin_sha256_block_data_order:
 mov r9d,DWORD[r11]
 mov r10d,DWORD[4+r11]
 mov r11d,DWORD[8+r11]
+ and r9d,1073741824
+ and r10d,268435968
+ or r10d,r9d
+ cmp r10d,1342177792
+ je NEAR $L$avx_shortcut
 test r10d,512
 jnz NEAR $L$ssse3_shortcut
 push rbx
@@ -2877,6 +2882,1082 @@ $L$epilogue_ssse3:
 mov rsi,QWORD[16+rsp]
 DB 0F3h,0C3h ;repret
 $L$SEH_end_sha256_block_data_order_ssse3:
+
+ALIGN 64
+sha256_block_data_order_avx:
+ mov QWORD[8+rsp],rdi ;WIN64 prologue
+ mov QWORD[16+rsp],rsi
+ mov rax,rsp
+$L$SEH_begin_sha256_block_data_order_avx:
+ mov rdi,rcx
+ mov rsi,rdx
+ mov rdx,r8
+
+
+$L$avx_shortcut:
+ push rbx
+ push rbp
+ push r12
+ push r13
+ push r14
+ push r15
+ mov r11,rsp
+ shl rdx,4
+ sub rsp,160
+ lea rdx,[rdx*4+rsi]
+ and rsp,-64
+ mov QWORD[((64+0))+rsp],rdi
+ mov QWORD[((64+8))+rsp],rsi
+ mov QWORD[((64+16))+rsp],rdx
+ mov QWORD[((64+24))+rsp],r11
+ movaps XMMWORD[(64+32)+rsp],xmm6
+ movaps XMMWORD[(64+48)+rsp],xmm7
+ movaps XMMWORD[(64+64)+rsp],xmm8
+ movaps XMMWORD[(64+80)+rsp],xmm9
+$L$prologue_avx:
+
+ vzeroupper
+ mov eax,DWORD[rdi]
+ mov ebx,DWORD[4+rdi]
+ mov ecx,DWORD[8+rdi]
+ mov edx,DWORD[12+rdi]
+ mov r8d,DWORD[16+rdi]
+ mov r9d,DWORD[20+rdi]
+ mov r10d,DWORD[24+rdi]
+ mov r11d,DWORD[28+rdi]
+ vmovdqa xmm8,XMMWORD[((K256+512+32))]
+ vmovdqa xmm9,XMMWORD[((K256+512+64))]
+ jmp NEAR $L$loop_avx
+ALIGN 16
+$L$loop_avx:
+ vmovdqa xmm7,XMMWORD[((K256+512))]
+ vmovdqu xmm0,XMMWORD[rsi]
+ vmovdqu xmm1,XMMWORD[16+rsi]
+ vmovdqu xmm2,XMMWORD[32+rsi]
+ vmovdqu xmm3,XMMWORD[48+rsi]
+ vpshufb xmm0,xmm0,xmm7
+ lea rbp,[K256]
+ vpshufb xmm1,xmm1,xmm7
+ vpshufb xmm2,xmm2,xmm7
+ vpaddd xmm4,xmm0,XMMWORD[rbp]
+ vpshufb xmm3,xmm3,xmm7
+ vpaddd xmm5,xmm1,XMMWORD[32+rbp]
+ vpaddd xmm6,xmm2,XMMWORD[64+rbp]
+ vpaddd xmm7,xmm3,XMMWORD[96+rbp]
+ vmovdqa XMMWORD[rsp],xmm4
+ mov r14d,eax
+ vmovdqa XMMWORD[16+rsp],xmm5
+ mov edi,ebx
+ vmovdqa XMMWORD[32+rsp],xmm6
+ xor edi,ecx
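+; Per OpenSSL's OPENSSL_ia32cap_P convention, the dispatch added above masks
+; word 0 bit 30 (0x40000000, "Intel CPU") and word 1 bits 9 and 28
+; (0x10000200, SSSE3 and AVX); comparing against 0x50000200 takes
+; $L$avx_shortcut only on Intel parts with both features. At this point
+; a..h live in eax,ebx,ecx,edx,r8d..r11d, and the byte-swapped block with
+; K256 pre-added is being staged at [rsp..63+rsp]; the last store follows.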
+ vmovdqa XMMWORD[48+rsp],xmm7 + mov r13d,r8d + jmp NEAR $L$avx_00_47 + +ALIGN 16 +$L$avx_00_47: + sub rbp,-128 + vpalignr xmm4,xmm1,xmm0,4 + shrd r13d,r13d,14 + mov eax,r14d + mov r12d,r9d + vpalignr xmm7,xmm3,xmm2,4 + shrd r14d,r14d,9 + xor r13d,r8d + xor r12d,r10d + vpsrld xmm6,xmm4,7 + shrd r13d,r13d,5 + xor r14d,eax + and r12d,r8d + vpaddd xmm0,xmm0,xmm7 + xor r13d,r8d + add r11d,DWORD[rsp] + mov r15d,eax + vpsrld xmm7,xmm4,3 + xor r12d,r10d + shrd r14d,r14d,11 + xor r15d,ebx + vpslld xmm5,xmm4,14 + add r11d,r12d + shrd r13d,r13d,6 + and edi,r15d + vpxor xmm4,xmm7,xmm6 + xor r14d,eax + add r11d,r13d + xor edi,ebx + vpshufd xmm7,xmm3,250 + shrd r14d,r14d,2 + add edx,r11d + add r11d,edi + vpsrld xmm6,xmm6,11 + mov r13d,edx + add r14d,r11d + shrd r13d,r13d,14 + vpxor xmm4,xmm4,xmm5 + mov r11d,r14d + mov r12d,r8d + shrd r14d,r14d,9 + vpslld xmm5,xmm5,11 + xor r13d,edx + xor r12d,r9d + shrd r13d,r13d,5 + vpxor xmm4,xmm4,xmm6 + xor r14d,r11d + and r12d,edx + xor r13d,edx + vpsrld xmm6,xmm7,10 + add r10d,DWORD[4+rsp] + mov edi,r11d + xor r12d,r9d + vpxor xmm4,xmm4,xmm5 + shrd r14d,r14d,11 + xor edi,eax + add r10d,r12d + vpsrlq xmm7,xmm7,17 + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r11d + vpaddd xmm0,xmm0,xmm4 + add r10d,r13d + xor r15d,eax + shrd r14d,r14d,2 + vpxor xmm6,xmm6,xmm7 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + vpsrlq xmm7,xmm7,2 + add r14d,r10d + shrd r13d,r13d,14 + mov r10d,r14d + vpxor xmm6,xmm6,xmm7 + mov r12d,edx + shrd r14d,r14d,9 + xor r13d,ecx + vpshufb xmm6,xmm6,xmm8 + xor r12d,r8d + shrd r13d,r13d,5 + xor r14d,r10d + vpaddd xmm0,xmm0,xmm6 + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[8+rsp] + vpshufd xmm7,xmm0,80 + mov r15d,r10d + xor r12d,r8d + shrd r14d,r14d,11 + vpsrld xmm6,xmm7,10 + xor r15d,r11d + add r9d,r12d + shrd r13d,r13d,6 + vpsrlq xmm7,xmm7,17 + and edi,r15d + xor r14d,r10d + add r9d,r13d + vpxor xmm6,xmm6,xmm7 + xor edi,r11d + shrd r14d,r14d,2 + add ebx,r9d + vpsrlq xmm7,xmm7,2 + add r9d,edi + mov r13d,ebx + add r14d,r9d + vpxor xmm6,xmm6,xmm7 + shrd r13d,r13d,14 + mov r9d,r14d + mov r12d,ecx + vpshufb xmm6,xmm6,xmm9 + shrd r14d,r14d,9 + xor r13d,ebx + xor r12d,edx + vpaddd xmm0,xmm0,xmm6 + shrd r13d,r13d,5 + xor r14d,r9d + and r12d,ebx + vpaddd xmm6,xmm0,XMMWORD[rbp] + xor r13d,ebx + add r8d,DWORD[12+rsp] + mov edi,r9d + xor r12d,edx + shrd r14d,r14d,11 + xor edi,r10d + add r8d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + shrd r14d,r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + vmovdqa XMMWORD[rsp],xmm6 + vpalignr xmm4,xmm2,xmm1,4 + shrd r13d,r13d,14 + mov r8d,r14d + mov r12d,ebx + vpalignr xmm7,xmm0,xmm3,4 + shrd r14d,r14d,9 + xor r13d,eax + xor r12d,ecx + vpsrld xmm6,xmm4,7 + shrd r13d,r13d,5 + xor r14d,r8d + and r12d,eax + vpaddd xmm1,xmm1,xmm7 + xor r13d,eax + add edx,DWORD[16+rsp] + mov r15d,r8d + vpsrld xmm7,xmm4,3 + xor r12d,ecx + shrd r14d,r14d,11 + xor r15d,r9d + vpslld xmm5,xmm4,14 + add edx,r12d + shrd r13d,r13d,6 + and edi,r15d + vpxor xmm4,xmm7,xmm6 + xor r14d,r8d + add edx,r13d + xor edi,r9d + vpshufd xmm7,xmm0,250 + shrd r14d,r14d,2 + add r11d,edx + add edx,edi + vpsrld xmm6,xmm6,11 + mov r13d,r11d + add r14d,edx + shrd r13d,r13d,14 + vpxor xmm4,xmm4,xmm5 + mov edx,r14d + mov r12d,eax + shrd r14d,r14d,9 + vpslld xmm5,xmm5,11 + xor r13d,r11d + xor r12d,ebx + shrd r13d,r13d,5 + vpxor xmm4,xmm4,xmm6 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + vpsrld xmm6,xmm7,10 + add ecx,DWORD[20+rsp] + mov edi,edx + xor r12d,ebx + vpxor xmm4,xmm4,xmm5 + shrd r14d,r14d,11 + xor edi,r8d 
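+; The shrd-built rotate chains above evaluate the SHA-256 Sigma1/Ch and
+; Sigma0/Maj terms one round at a time, while the interleaved
+; vpalignr/vpsrld/vpslld/vpshufb ops advance the sigma0/sigma1
+; message-schedule update for the next four words of W.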
+ add ecx,r12d + vpsrlq xmm7,xmm7,17 + shrd r13d,r13d,6 + and r15d,edi + xor r14d,edx + vpaddd xmm1,xmm1,xmm4 + add ecx,r13d + xor r15d,r8d + shrd r14d,r14d,2 + vpxor xmm6,xmm6,xmm7 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + vpsrlq xmm7,xmm7,2 + add r14d,ecx + shrd r13d,r13d,14 + mov ecx,r14d + vpxor xmm6,xmm6,xmm7 + mov r12d,r11d + shrd r14d,r14d,9 + xor r13d,r10d + vpshufb xmm6,xmm6,xmm8 + xor r12d,eax + shrd r13d,r13d,5 + xor r14d,ecx + vpaddd xmm1,xmm1,xmm6 + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[24+rsp] + vpshufd xmm7,xmm1,80 + mov r15d,ecx + xor r12d,eax + shrd r14d,r14d,11 + vpsrld xmm6,xmm7,10 + xor r15d,edx + add ebx,r12d + shrd r13d,r13d,6 + vpsrlq xmm7,xmm7,17 + and edi,r15d + xor r14d,ecx + add ebx,r13d + vpxor xmm6,xmm6,xmm7 + xor edi,edx + shrd r14d,r14d,2 + add r9d,ebx + vpsrlq xmm7,xmm7,2 + add ebx,edi + mov r13d,r9d + add r14d,ebx + vpxor xmm6,xmm6,xmm7 + shrd r13d,r13d,14 + mov ebx,r14d + mov r12d,r10d + vpshufb xmm6,xmm6,xmm9 + shrd r14d,r14d,9 + xor r13d,r9d + xor r12d,r11d + vpaddd xmm1,xmm1,xmm6 + shrd r13d,r13d,5 + xor r14d,ebx + and r12d,r9d + vpaddd xmm6,xmm1,XMMWORD[32+rbp] + xor r13d,r9d + add eax,DWORD[28+rsp] + mov edi,ebx + xor r12d,r11d + shrd r14d,r14d,11 + xor edi,ecx + add eax,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + shrd r14d,r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + vmovdqa XMMWORD[16+rsp],xmm6 + vpalignr xmm4,xmm3,xmm2,4 + shrd r13d,r13d,14 + mov eax,r14d + mov r12d,r9d + vpalignr xmm7,xmm1,xmm0,4 + shrd r14d,r14d,9 + xor r13d,r8d + xor r12d,r10d + vpsrld xmm6,xmm4,7 + shrd r13d,r13d,5 + xor r14d,eax + and r12d,r8d + vpaddd xmm2,xmm2,xmm7 + xor r13d,r8d + add r11d,DWORD[32+rsp] + mov r15d,eax + vpsrld xmm7,xmm4,3 + xor r12d,r10d + shrd r14d,r14d,11 + xor r15d,ebx + vpslld xmm5,xmm4,14 + add r11d,r12d + shrd r13d,r13d,6 + and edi,r15d + vpxor xmm4,xmm7,xmm6 + xor r14d,eax + add r11d,r13d + xor edi,ebx + vpshufd xmm7,xmm1,250 + shrd r14d,r14d,2 + add edx,r11d + add r11d,edi + vpsrld xmm6,xmm6,11 + mov r13d,edx + add r14d,r11d + shrd r13d,r13d,14 + vpxor xmm4,xmm4,xmm5 + mov r11d,r14d + mov r12d,r8d + shrd r14d,r14d,9 + vpslld xmm5,xmm5,11 + xor r13d,edx + xor r12d,r9d + shrd r13d,r13d,5 + vpxor xmm4,xmm4,xmm6 + xor r14d,r11d + and r12d,edx + xor r13d,edx + vpsrld xmm6,xmm7,10 + add r10d,DWORD[36+rsp] + mov edi,r11d + xor r12d,r9d + vpxor xmm4,xmm4,xmm5 + shrd r14d,r14d,11 + xor edi,eax + add r10d,r12d + vpsrlq xmm7,xmm7,17 + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r11d + vpaddd xmm2,xmm2,xmm4 + add r10d,r13d + xor r15d,eax + shrd r14d,r14d,2 + vpxor xmm6,xmm6,xmm7 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + vpsrlq xmm7,xmm7,2 + add r14d,r10d + shrd r13d,r13d,14 + mov r10d,r14d + vpxor xmm6,xmm6,xmm7 + mov r12d,edx + shrd r14d,r14d,9 + xor r13d,ecx + vpshufb xmm6,xmm6,xmm8 + xor r12d,r8d + shrd r13d,r13d,5 + xor r14d,r10d + vpaddd xmm2,xmm2,xmm6 + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[40+rsp] + vpshufd xmm7,xmm2,80 + mov r15d,r10d + xor r12d,r8d + shrd r14d,r14d,11 + vpsrld xmm6,xmm7,10 + xor r15d,r11d + add r9d,r12d + shrd r13d,r13d,6 + vpsrlq xmm7,xmm7,17 + and edi,r15d + xor r14d,r10d + add r9d,r13d + vpxor xmm6,xmm6,xmm7 + xor edi,r11d + shrd r14d,r14d,2 + add ebx,r9d + vpsrlq xmm7,xmm7,2 + add r9d,edi + mov r13d,ebx + add r14d,r9d + vpxor xmm6,xmm6,xmm7 + shrd r13d,r13d,14 + mov r9d,r14d + mov r12d,ecx + vpshufb xmm6,xmm6,xmm9 + shrd r14d,r14d,9 + xor r13d,ebx + xor r12d,edx + vpaddd xmm2,xmm2,xmm6 + shrd r13d,r13d,5 + xor r14d,r9d + and r12d,ebx + vpaddd 
xmm6,xmm2,XMMWORD[64+rbp] + xor r13d,ebx + add r8d,DWORD[44+rsp] + mov edi,r9d + xor r12d,edx + shrd r14d,r14d,11 + xor edi,r10d + add r8d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + shrd r14d,r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + vmovdqa XMMWORD[32+rsp],xmm6 + vpalignr xmm4,xmm0,xmm3,4 + shrd r13d,r13d,14 + mov r8d,r14d + mov r12d,ebx + vpalignr xmm7,xmm2,xmm1,4 + shrd r14d,r14d,9 + xor r13d,eax + xor r12d,ecx + vpsrld xmm6,xmm4,7 + shrd r13d,r13d,5 + xor r14d,r8d + and r12d,eax + vpaddd xmm3,xmm3,xmm7 + xor r13d,eax + add edx,DWORD[48+rsp] + mov r15d,r8d + vpsrld xmm7,xmm4,3 + xor r12d,ecx + shrd r14d,r14d,11 + xor r15d,r9d + vpslld xmm5,xmm4,14 + add edx,r12d + shrd r13d,r13d,6 + and edi,r15d + vpxor xmm4,xmm7,xmm6 + xor r14d,r8d + add edx,r13d + xor edi,r9d + vpshufd xmm7,xmm2,250 + shrd r14d,r14d,2 + add r11d,edx + add edx,edi + vpsrld xmm6,xmm6,11 + mov r13d,r11d + add r14d,edx + shrd r13d,r13d,14 + vpxor xmm4,xmm4,xmm5 + mov edx,r14d + mov r12d,eax + shrd r14d,r14d,9 + vpslld xmm5,xmm5,11 + xor r13d,r11d + xor r12d,ebx + shrd r13d,r13d,5 + vpxor xmm4,xmm4,xmm6 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + vpsrld xmm6,xmm7,10 + add ecx,DWORD[52+rsp] + mov edi,edx + xor r12d,ebx + vpxor xmm4,xmm4,xmm5 + shrd r14d,r14d,11 + xor edi,r8d + add ecx,r12d + vpsrlq xmm7,xmm7,17 + shrd r13d,r13d,6 + and r15d,edi + xor r14d,edx + vpaddd xmm3,xmm3,xmm4 + add ecx,r13d + xor r15d,r8d + shrd r14d,r14d,2 + vpxor xmm6,xmm6,xmm7 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + vpsrlq xmm7,xmm7,2 + add r14d,ecx + shrd r13d,r13d,14 + mov ecx,r14d + vpxor xmm6,xmm6,xmm7 + mov r12d,r11d + shrd r14d,r14d,9 + xor r13d,r10d + vpshufb xmm6,xmm6,xmm8 + xor r12d,eax + shrd r13d,r13d,5 + xor r14d,ecx + vpaddd xmm3,xmm3,xmm6 + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[56+rsp] + vpshufd xmm7,xmm3,80 + mov r15d,ecx + xor r12d,eax + shrd r14d,r14d,11 + vpsrld xmm6,xmm7,10 + xor r15d,edx + add ebx,r12d + shrd r13d,r13d,6 + vpsrlq xmm7,xmm7,17 + and edi,r15d + xor r14d,ecx + add ebx,r13d + vpxor xmm6,xmm6,xmm7 + xor edi,edx + shrd r14d,r14d,2 + add r9d,ebx + vpsrlq xmm7,xmm7,2 + add ebx,edi + mov r13d,r9d + add r14d,ebx + vpxor xmm6,xmm6,xmm7 + shrd r13d,r13d,14 + mov ebx,r14d + mov r12d,r10d + vpshufb xmm6,xmm6,xmm9 + shrd r14d,r14d,9 + xor r13d,r9d + xor r12d,r11d + vpaddd xmm3,xmm3,xmm6 + shrd r13d,r13d,5 + xor r14d,ebx + and r12d,r9d + vpaddd xmm6,xmm3,XMMWORD[96+rbp] + xor r13d,r9d + add eax,DWORD[60+rsp] + mov edi,ebx + xor r12d,r11d + shrd r14d,r14d,11 + xor edi,ecx + add eax,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + shrd r14d,r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + vmovdqa XMMWORD[48+rsp],xmm6 + cmp BYTE[131+rbp],0 + jne NEAR $L$avx_00_47 + shrd r13d,r13d,14 + mov eax,r14d + mov r12d,r9d + shrd r14d,r14d,9 + xor r13d,r8d + xor r12d,r10d + shrd r13d,r13d,5 + xor r14d,eax + and r12d,r8d + xor r13d,r8d + add r11d,DWORD[rsp] + mov r15d,eax + xor r12d,r10d + shrd r14d,r14d,11 + xor r15d,ebx + add r11d,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,eax + add r11d,r13d + xor edi,ebx + shrd r14d,r14d,2 + add edx,r11d + add r11d,edi + mov r13d,edx + add r14d,r11d + shrd r13d,r13d,14 + mov r11d,r14d + mov r12d,r8d + shrd r14d,r14d,9 + xor r13d,edx + xor r12d,r9d + shrd r13d,r13d,5 + xor r14d,r11d + and r12d,edx + xor r13d,edx + add r10d,DWORD[4+rsp] + mov edi,r11d + xor r12d,r9d + shrd r14d,r14d,11 + xor edi,eax + add r10d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor 
r14d,r11d + add r10d,r13d + xor r15d,eax + shrd r14d,r14d,2 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + add r14d,r10d + shrd r13d,r13d,14 + mov r10d,r14d + mov r12d,edx + shrd r14d,r14d,9 + xor r13d,ecx + xor r12d,r8d + shrd r13d,r13d,5 + xor r14d,r10d + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[8+rsp] + mov r15d,r10d + xor r12d,r8d + shrd r14d,r14d,11 + xor r15d,r11d + add r9d,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,r10d + add r9d,r13d + xor edi,r11d + shrd r14d,r14d,2 + add ebx,r9d + add r9d,edi + mov r13d,ebx + add r14d,r9d + shrd r13d,r13d,14 + mov r9d,r14d + mov r12d,ecx + shrd r14d,r14d,9 + xor r13d,ebx + xor r12d,edx + shrd r13d,r13d,5 + xor r14d,r9d + and r12d,ebx + xor r13d,ebx + add r8d,DWORD[12+rsp] + mov edi,r9d + xor r12d,edx + shrd r14d,r14d,11 + xor edi,r10d + add r8d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + shrd r14d,r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + shrd r13d,r13d,14 + mov r8d,r14d + mov r12d,ebx + shrd r14d,r14d,9 + xor r13d,eax + xor r12d,ecx + shrd r13d,r13d,5 + xor r14d,r8d + and r12d,eax + xor r13d,eax + add edx,DWORD[16+rsp] + mov r15d,r8d + xor r12d,ecx + shrd r14d,r14d,11 + xor r15d,r9d + add edx,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,r8d + add edx,r13d + xor edi,r9d + shrd r14d,r14d,2 + add r11d,edx + add edx,edi + mov r13d,r11d + add r14d,edx + shrd r13d,r13d,14 + mov edx,r14d + mov r12d,eax + shrd r14d,r14d,9 + xor r13d,r11d + xor r12d,ebx + shrd r13d,r13d,5 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + add ecx,DWORD[20+rsp] + mov edi,edx + xor r12d,ebx + shrd r14d,r14d,11 + xor edi,r8d + add ecx,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,edx + add ecx,r13d + xor r15d,r8d + shrd r14d,r14d,2 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + add r14d,ecx + shrd r13d,r13d,14 + mov ecx,r14d + mov r12d,r11d + shrd r14d,r14d,9 + xor r13d,r10d + xor r12d,eax + shrd r13d,r13d,5 + xor r14d,ecx + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[24+rsp] + mov r15d,ecx + xor r12d,eax + shrd r14d,r14d,11 + xor r15d,edx + add ebx,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,ecx + add ebx,r13d + xor edi,edx + shrd r14d,r14d,2 + add r9d,ebx + add ebx,edi + mov r13d,r9d + add r14d,ebx + shrd r13d,r13d,14 + mov ebx,r14d + mov r12d,r10d + shrd r14d,r14d,9 + xor r13d,r9d + xor r12d,r11d + shrd r13d,r13d,5 + xor r14d,ebx + and r12d,r9d + xor r13d,r9d + add eax,DWORD[28+rsp] + mov edi,ebx + xor r12d,r11d + shrd r14d,r14d,11 + xor edi,ecx + add eax,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + shrd r14d,r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + shrd r13d,r13d,14 + mov eax,r14d + mov r12d,r9d + shrd r14d,r14d,9 + xor r13d,r8d + xor r12d,r10d + shrd r13d,r13d,5 + xor r14d,eax + and r12d,r8d + xor r13d,r8d + add r11d,DWORD[32+rsp] + mov r15d,eax + xor r12d,r10d + shrd r14d,r14d,11 + xor r15d,ebx + add r11d,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,eax + add r11d,r13d + xor edi,ebx + shrd r14d,r14d,2 + add edx,r11d + add r11d,edi + mov r13d,edx + add r14d,r11d + shrd r13d,r13d,14 + mov r11d,r14d + mov r12d,r8d + shrd r14d,r14d,9 + xor r13d,edx + xor r12d,r9d + shrd r13d,r13d,5 + xor r14d,r11d + and r12d,edx + xor r13d,edx + add r10d,DWORD[36+rsp] + mov edi,r11d + xor r12d,r9d + shrd r14d,r14d,11 + xor edi,eax + add r10d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r11d + add r10d,r13d + xor r15d,eax + shrd r14d,r14d,2 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + add r14d,r10d + shrd 
r13d,r13d,14 + mov r10d,r14d + mov r12d,edx + shrd r14d,r14d,9 + xor r13d,ecx + xor r12d,r8d + shrd r13d,r13d,5 + xor r14d,r10d + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[40+rsp] + mov r15d,r10d + xor r12d,r8d + shrd r14d,r14d,11 + xor r15d,r11d + add r9d,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,r10d + add r9d,r13d + xor edi,r11d + shrd r14d,r14d,2 + add ebx,r9d + add r9d,edi + mov r13d,ebx + add r14d,r9d + shrd r13d,r13d,14 + mov r9d,r14d + mov r12d,ecx + shrd r14d,r14d,9 + xor r13d,ebx + xor r12d,edx + shrd r13d,r13d,5 + xor r14d,r9d + and r12d,ebx + xor r13d,ebx + add r8d,DWORD[44+rsp] + mov edi,r9d + xor r12d,edx + shrd r14d,r14d,11 + xor edi,r10d + add r8d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + shrd r14d,r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + shrd r13d,r13d,14 + mov r8d,r14d + mov r12d,ebx + shrd r14d,r14d,9 + xor r13d,eax + xor r12d,ecx + shrd r13d,r13d,5 + xor r14d,r8d + and r12d,eax + xor r13d,eax + add edx,DWORD[48+rsp] + mov r15d,r8d + xor r12d,ecx + shrd r14d,r14d,11 + xor r15d,r9d + add edx,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,r8d + add edx,r13d + xor edi,r9d + shrd r14d,r14d,2 + add r11d,edx + add edx,edi + mov r13d,r11d + add r14d,edx + shrd r13d,r13d,14 + mov edx,r14d + mov r12d,eax + shrd r14d,r14d,9 + xor r13d,r11d + xor r12d,ebx + shrd r13d,r13d,5 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + add ecx,DWORD[52+rsp] + mov edi,edx + xor r12d,ebx + shrd r14d,r14d,11 + xor edi,r8d + add ecx,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,edx + add ecx,r13d + xor r15d,r8d + shrd r14d,r14d,2 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + add r14d,ecx + shrd r13d,r13d,14 + mov ecx,r14d + mov r12d,r11d + shrd r14d,r14d,9 + xor r13d,r10d + xor r12d,eax + shrd r13d,r13d,5 + xor r14d,ecx + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[56+rsp] + mov r15d,ecx + xor r12d,eax + shrd r14d,r14d,11 + xor r15d,edx + add ebx,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,ecx + add ebx,r13d + xor edi,edx + shrd r14d,r14d,2 + add r9d,ebx + add ebx,edi + mov r13d,r9d + add r14d,ebx + shrd r13d,r13d,14 + mov ebx,r14d + mov r12d,r10d + shrd r14d,r14d,9 + xor r13d,r9d + xor r12d,r11d + shrd r13d,r13d,5 + xor r14d,ebx + and r12d,r9d + xor r13d,r9d + add eax,DWORD[60+rsp] + mov edi,ebx + xor r12d,r11d + shrd r14d,r14d,11 + xor edi,ecx + add eax,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + shrd r14d,r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + mov rdi,QWORD[((64+0))+rsp] + mov eax,r14d + + add eax,DWORD[rdi] + lea rsi,[64+rsi] + add ebx,DWORD[4+rdi] + add ecx,DWORD[8+rdi] + add edx,DWORD[12+rdi] + add r8d,DWORD[16+rdi] + add r9d,DWORD[20+rdi] + add r10d,DWORD[24+rdi] + add r11d,DWORD[28+rdi] + + cmp rsi,QWORD[((64+16))+rsp] + + mov DWORD[rdi],eax + mov DWORD[4+rdi],ebx + mov DWORD[8+rdi],ecx + mov DWORD[12+rdi],edx + mov DWORD[16+rdi],r8d + mov DWORD[20+rdi],r9d + mov DWORD[24+rdi],r10d + mov DWORD[28+rdi],r11d + jb NEAR $L$loop_avx + + mov rsi,QWORD[((64+24))+rsp] + vzeroupper + movaps xmm6,XMMWORD[((64+32))+rsp] + movaps xmm7,XMMWORD[((64+48))+rsp] + movaps xmm8,XMMWORD[((64+64))+rsp] + movaps xmm9,XMMWORD[((64+80))+rsp] + mov r15,QWORD[rsi] + mov r14,QWORD[8+rsi] + mov r13,QWORD[16+rsi] + mov r12,QWORD[24+rsi] + mov rbp,QWORD[32+rsi] + mov rbx,QWORD[40+rsi] + lea rsp,[48+rsi] +$L$epilogue_avx: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_sha256_block_data_order_avx: EXTERN 
__imp_RtlVirtualUnwind ALIGN 16 @@ -2982,6 +4063,9 @@ ALIGN 4 DD $L$SEH_begin_sha256_block_data_order_ssse3 wrt ..imagebase DD $L$SEH_end_sha256_block_data_order_ssse3 wrt ..imagebase DD $L$SEH_info_sha256_block_data_order_ssse3 wrt ..imagebase + DD $L$SEH_begin_sha256_block_data_order_avx wrt ..imagebase + DD $L$SEH_end_sha256_block_data_order_avx wrt ..imagebase + DD $L$SEH_info_sha256_block_data_order_avx wrt ..imagebase section .xdata rdata align=8 ALIGN 8 $L$SEH_info_sha256_block_data_order: @@ -2992,3 +4076,7 @@ $L$SEH_info_sha256_block_data_order_ssse3: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$prologue_ssse3 wrt ..imagebase,$L$epilogue_ssse3 wrt ..imagebase +$L$SEH_info_sha256_block_data_order_avx: +DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase diff --git a/packager/third_party/boringssl/win-x86_64/crypto/sha/sha512-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/sha/sha512-x86_64.asm index b76cc0edb9..71449cd24f 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/sha/sha512-x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/sha/sha512-x86_64.asm @@ -19,6 +19,17 @@ $L$SEH_begin_sha512_block_data_order: mov rdx,r8 + lea r11,[OPENSSL_ia32cap_P] + mov r9d,DWORD[r11] + mov r10d,DWORD[4+r11] + mov r11d,DWORD[8+r11] + test r10d,2048 + jnz NEAR $L$xop_shortcut + and r9d,1073741824 + and r10d,268435968 + or r10d,r9d + cmp r10d,1342177792 + je NEAR $L$avx_shortcut push rbx push rbp push r12 @@ -1801,6 +1812,2282 @@ DB 110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54 DB 52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 DB 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 DB 111,114,103,62,0 + +ALIGN 64 +sha512_block_data_order_xop: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha512_block_data_order_xop: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + +$L$xop_shortcut: + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + mov r11,rsp + shl rdx,4 + sub rsp,256 + lea rdx,[rdx*8+rsi] + and rsp,-64 + mov QWORD[((128+0))+rsp],rdi + mov QWORD[((128+8))+rsp],rsi + mov QWORD[((128+16))+rsp],rdx + mov QWORD[((128+24))+rsp],r11 + movaps XMMWORD[(128+32)+rsp],xmm6 + movaps XMMWORD[(128+48)+rsp],xmm7 + movaps XMMWORD[(128+64)+rsp],xmm8 + movaps XMMWORD[(128+80)+rsp],xmm9 + movaps XMMWORD[(128+96)+rsp],xmm10 + movaps XMMWORD[(128+112)+rsp],xmm11 +$L$prologue_xop: + + vzeroupper + mov rax,QWORD[rdi] + mov rbx,QWORD[8+rdi] + mov rcx,QWORD[16+rdi] + mov rdx,QWORD[24+rdi] + mov r8,QWORD[32+rdi] + mov r9,QWORD[40+rdi] + mov r10,QWORD[48+rdi] + mov r11,QWORD[56+rdi] + jmp NEAR $L$loop_xop +ALIGN 16 +$L$loop_xop: + vmovdqa xmm11,XMMWORD[((K512+1280))] + vmovdqu xmm0,XMMWORD[rsi] + lea rbp,[((K512+128))] + vmovdqu xmm1,XMMWORD[16+rsi] + vmovdqu xmm2,XMMWORD[32+rsi] + vpshufb xmm0,xmm0,xmm11 + vmovdqu xmm3,XMMWORD[48+rsi] + vpshufb xmm1,xmm1,xmm11 + vmovdqu xmm4,XMMWORD[64+rsi] + vpshufb xmm2,xmm2,xmm11 + vmovdqu xmm5,XMMWORD[80+rsi] + vpshufb xmm3,xmm3,xmm11 + vmovdqu xmm6,XMMWORD[96+rsi] + vpshufb xmm4,xmm4,xmm11 + vmovdqu xmm7,XMMWORD[112+rsi] + vpshufb xmm5,xmm5,xmm11 + vpaddq xmm8,xmm0,XMMWORD[((-128))+rbp] + vpshufb xmm6,xmm6,xmm11 + vpaddq xmm9,xmm1,XMMWORD[((-96))+rbp] + vpshufb xmm7,xmm7,xmm11 + vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp] + vpaddq xmm11,xmm3,XMMWORD[((-32))+rbp] + vmovdqa XMMWORD[rsp],xmm8 + vpaddq xmm8,xmm4,XMMWORD[rbp] + vmovdqa XMMWORD[16+rsp],xmm9 + vpaddq xmm9,xmm5,XMMWORD[32+rbp] + vmovdqa 
XMMWORD[32+rsp],xmm10 + vpaddq xmm10,xmm6,XMMWORD[64+rbp] + vmovdqa XMMWORD[48+rsp],xmm11 + vpaddq xmm11,xmm7,XMMWORD[96+rbp] + vmovdqa XMMWORD[64+rsp],xmm8 + mov r14,rax + vmovdqa XMMWORD[80+rsp],xmm9 + mov rdi,rbx + vmovdqa XMMWORD[96+rsp],xmm10 + xor rdi,rcx + vmovdqa XMMWORD[112+rsp],xmm11 + mov r13,r8 + jmp NEAR $L$xop_00_47 + +ALIGN 16 +$L$xop_00_47: + add rbp,256 + vpalignr xmm8,xmm1,xmm0,8 + ror r13,23 + mov rax,r14 + vpalignr xmm11,xmm5,xmm4,8 + mov r12,r9 + ror r14,5 +DB 143,72,120,195,200,56 + xor r13,r8 + xor r12,r10 + vpsrlq xmm8,xmm8,7 + ror r13,4 + xor r14,rax + vpaddq xmm0,xmm0,xmm11 + and r12,r8 + xor r13,r8 + add r11,QWORD[rsp] + mov r15,rax +DB 143,72,120,195,209,7 + xor r12,r10 + ror r14,6 + vpxor xmm8,xmm8,xmm9 + xor r15,rbx + add r11,r12 + ror r13,14 + and rdi,r15 +DB 143,104,120,195,223,3 + xor r14,rax + add r11,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rbx + ror r14,28 + vpsrlq xmm10,xmm7,6 + add rdx,r11 + add r11,rdi + vpaddq xmm0,xmm0,xmm8 + mov r13,rdx + add r14,r11 +DB 143,72,120,195,203,42 + ror r13,23 + mov r11,r14 + vpxor xmm11,xmm11,xmm10 + mov r12,r8 + ror r14,5 + xor r13,rdx + xor r12,r9 + vpxor xmm11,xmm11,xmm9 + ror r13,4 + xor r14,r11 + and r12,rdx + xor r13,rdx + vpaddq xmm0,xmm0,xmm11 + add r10,QWORD[8+rsp] + mov rdi,r11 + xor r12,r9 + ror r14,6 + vpaddq xmm10,xmm0,XMMWORD[((-128))+rbp] + xor rdi,rax + add r10,r12 + ror r13,14 + and r15,rdi + xor r14,r11 + add r10,r13 + xor r15,rax + ror r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + vmovdqa XMMWORD[rsp],xmm10 + vpalignr xmm8,xmm2,xmm1,8 + ror r13,23 + mov r10,r14 + vpalignr xmm11,xmm6,xmm5,8 + mov r12,rdx + ror r14,5 +DB 143,72,120,195,200,56 + xor r13,rcx + xor r12,r8 + vpsrlq xmm8,xmm8,7 + ror r13,4 + xor r14,r10 + vpaddq xmm1,xmm1,xmm11 + and r12,rcx + xor r13,rcx + add r9,QWORD[16+rsp] + mov r15,r10 +DB 143,72,120,195,209,7 + xor r12,r8 + ror r14,6 + vpxor xmm8,xmm8,xmm9 + xor r15,r11 + add r9,r12 + ror r13,14 + and rdi,r15 +DB 143,104,120,195,216,3 + xor r14,r10 + add r9,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r11 + ror r14,28 + vpsrlq xmm10,xmm0,6 + add rbx,r9 + add r9,rdi + vpaddq xmm1,xmm1,xmm8 + mov r13,rbx + add r14,r9 +DB 143,72,120,195,203,42 + ror r13,23 + mov r9,r14 + vpxor xmm11,xmm11,xmm10 + mov r12,rcx + ror r14,5 + xor r13,rbx + xor r12,rdx + vpxor xmm11,xmm11,xmm9 + ror r13,4 + xor r14,r9 + and r12,rbx + xor r13,rbx + vpaddq xmm1,xmm1,xmm11 + add r8,QWORD[24+rsp] + mov rdi,r9 + xor r12,rdx + ror r14,6 + vpaddq xmm10,xmm1,XMMWORD[((-96))+rbp] + xor rdi,r10 + add r8,r12 + ror r13,14 + and r15,rdi + xor r14,r9 + add r8,r13 + xor r15,r10 + ror r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + vmovdqa XMMWORD[16+rsp],xmm10 + vpalignr xmm8,xmm3,xmm2,8 + ror r13,23 + mov r8,r14 + vpalignr xmm11,xmm7,xmm6,8 + mov r12,rbx + ror r14,5 +DB 143,72,120,195,200,56 + xor r13,rax + xor r12,rcx + vpsrlq xmm8,xmm8,7 + ror r13,4 + xor r14,r8 + vpaddq xmm2,xmm2,xmm11 + and r12,rax + xor r13,rax + add rdx,QWORD[32+rsp] + mov r15,r8 +DB 143,72,120,195,209,7 + xor r12,rcx + ror r14,6 + vpxor xmm8,xmm8,xmm9 + xor r15,r9 + add rdx,r12 + ror r13,14 + and rdi,r15 +DB 143,104,120,195,217,3 + xor r14,r8 + add rdx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r9 + ror r14,28 + vpsrlq xmm10,xmm1,6 + add r11,rdx + add rdx,rdi + vpaddq xmm2,xmm2,xmm8 + mov r13,r11 + add r14,rdx +DB 143,72,120,195,203,42 + ror r13,23 + mov rdx,r14 + vpxor xmm11,xmm11,xmm10 + mov r12,rax + ror r14,5 + xor r13,r11 + xor r12,rbx + vpxor xmm11,xmm11,xmm9 + ror r13,4 + xor r14,rdx + and r12,r11 + xor r13,r11 + vpaddq 
xmm2,xmm2,xmm11 + add rcx,QWORD[40+rsp] + mov rdi,rdx + xor r12,rbx + ror r14,6 + vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp] + xor rdi,r8 + add rcx,r12 + ror r13,14 + and r15,rdi + xor r14,rdx + add rcx,r13 + xor r15,r8 + ror r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + vmovdqa XMMWORD[32+rsp],xmm10 + vpalignr xmm8,xmm4,xmm3,8 + ror r13,23 + mov rcx,r14 + vpalignr xmm11,xmm0,xmm7,8 + mov r12,r11 + ror r14,5 +DB 143,72,120,195,200,56 + xor r13,r10 + xor r12,rax + vpsrlq xmm8,xmm8,7 + ror r13,4 + xor r14,rcx + vpaddq xmm3,xmm3,xmm11 + and r12,r10 + xor r13,r10 + add rbx,QWORD[48+rsp] + mov r15,rcx +DB 143,72,120,195,209,7 + xor r12,rax + ror r14,6 + vpxor xmm8,xmm8,xmm9 + xor r15,rdx + add rbx,r12 + ror r13,14 + and rdi,r15 +DB 143,104,120,195,218,3 + xor r14,rcx + add rbx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rdx + ror r14,28 + vpsrlq xmm10,xmm2,6 + add r9,rbx + add rbx,rdi + vpaddq xmm3,xmm3,xmm8 + mov r13,r9 + add r14,rbx +DB 143,72,120,195,203,42 + ror r13,23 + mov rbx,r14 + vpxor xmm11,xmm11,xmm10 + mov r12,r10 + ror r14,5 + xor r13,r9 + xor r12,r11 + vpxor xmm11,xmm11,xmm9 + ror r13,4 + xor r14,rbx + and r12,r9 + xor r13,r9 + vpaddq xmm3,xmm3,xmm11 + add rax,QWORD[56+rsp] + mov rdi,rbx + xor r12,r11 + ror r14,6 + vpaddq xmm10,xmm3,XMMWORD[((-32))+rbp] + xor rdi,rcx + add rax,r12 + ror r13,14 + and r15,rdi + xor r14,rbx + add rax,r13 + xor r15,rcx + ror r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + vmovdqa XMMWORD[48+rsp],xmm10 + vpalignr xmm8,xmm5,xmm4,8 + ror r13,23 + mov rax,r14 + vpalignr xmm11,xmm1,xmm0,8 + mov r12,r9 + ror r14,5 +DB 143,72,120,195,200,56 + xor r13,r8 + xor r12,r10 + vpsrlq xmm8,xmm8,7 + ror r13,4 + xor r14,rax + vpaddq xmm4,xmm4,xmm11 + and r12,r8 + xor r13,r8 + add r11,QWORD[64+rsp] + mov r15,rax +DB 143,72,120,195,209,7 + xor r12,r10 + ror r14,6 + vpxor xmm8,xmm8,xmm9 + xor r15,rbx + add r11,r12 + ror r13,14 + and rdi,r15 +DB 143,104,120,195,219,3 + xor r14,rax + add r11,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rbx + ror r14,28 + vpsrlq xmm10,xmm3,6 + add rdx,r11 + add r11,rdi + vpaddq xmm4,xmm4,xmm8 + mov r13,rdx + add r14,r11 +DB 143,72,120,195,203,42 + ror r13,23 + mov r11,r14 + vpxor xmm11,xmm11,xmm10 + mov r12,r8 + ror r14,5 + xor r13,rdx + xor r12,r9 + vpxor xmm11,xmm11,xmm9 + ror r13,4 + xor r14,r11 + and r12,rdx + xor r13,rdx + vpaddq xmm4,xmm4,xmm11 + add r10,QWORD[72+rsp] + mov rdi,r11 + xor r12,r9 + ror r14,6 + vpaddq xmm10,xmm4,XMMWORD[rbp] + xor rdi,rax + add r10,r12 + ror r13,14 + and r15,rdi + xor r14,r11 + add r10,r13 + xor r15,rax + ror r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + vmovdqa XMMWORD[64+rsp],xmm10 + vpalignr xmm8,xmm6,xmm5,8 + ror r13,23 + mov r10,r14 + vpalignr xmm11,xmm2,xmm1,8 + mov r12,rdx + ror r14,5 +DB 143,72,120,195,200,56 + xor r13,rcx + xor r12,r8 + vpsrlq xmm8,xmm8,7 + ror r13,4 + xor r14,r10 + vpaddq xmm5,xmm5,xmm11 + and r12,rcx + xor r13,rcx + add r9,QWORD[80+rsp] + mov r15,r10 +DB 143,72,120,195,209,7 + xor r12,r8 + ror r14,6 + vpxor xmm8,xmm8,xmm9 + xor r15,r11 + add r9,r12 + ror r13,14 + and rdi,r15 +DB 143,104,120,195,220,3 + xor r14,r10 + add r9,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r11 + ror r14,28 + vpsrlq xmm10,xmm4,6 + add rbx,r9 + add r9,rdi + vpaddq xmm5,xmm5,xmm8 + mov r13,rbx + add r14,r9 +DB 143,72,120,195,203,42 + ror r13,23 + mov r9,r14 + vpxor xmm11,xmm11,xmm10 + mov r12,rcx + ror r14,5 + xor r13,rbx + xor r12,rdx + vpxor xmm11,xmm11,xmm9 + ror r13,4 + xor r14,r9 + and r12,rbx + xor r13,rbx + vpaddq xmm5,xmm5,xmm11 + add r8,QWORD[88+rsp] + mov rdi,r9 + 
xor r12,rdx + ror r14,6 + vpaddq xmm10,xmm5,XMMWORD[32+rbp] + xor rdi,r10 + add r8,r12 + ror r13,14 + and r15,rdi + xor r14,r9 + add r8,r13 + xor r15,r10 + ror r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + vmovdqa XMMWORD[80+rsp],xmm10 + vpalignr xmm8,xmm7,xmm6,8 + ror r13,23 + mov r8,r14 + vpalignr xmm11,xmm3,xmm2,8 + mov r12,rbx + ror r14,5 +DB 143,72,120,195,200,56 + xor r13,rax + xor r12,rcx + vpsrlq xmm8,xmm8,7 + ror r13,4 + xor r14,r8 + vpaddq xmm6,xmm6,xmm11 + and r12,rax + xor r13,rax + add rdx,QWORD[96+rsp] + mov r15,r8 +DB 143,72,120,195,209,7 + xor r12,rcx + ror r14,6 + vpxor xmm8,xmm8,xmm9 + xor r15,r9 + add rdx,r12 + ror r13,14 + and rdi,r15 +DB 143,104,120,195,221,3 + xor r14,r8 + add rdx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r9 + ror r14,28 + vpsrlq xmm10,xmm5,6 + add r11,rdx + add rdx,rdi + vpaddq xmm6,xmm6,xmm8 + mov r13,r11 + add r14,rdx +DB 143,72,120,195,203,42 + ror r13,23 + mov rdx,r14 + vpxor xmm11,xmm11,xmm10 + mov r12,rax + ror r14,5 + xor r13,r11 + xor r12,rbx + vpxor xmm11,xmm11,xmm9 + ror r13,4 + xor r14,rdx + and r12,r11 + xor r13,r11 + vpaddq xmm6,xmm6,xmm11 + add rcx,QWORD[104+rsp] + mov rdi,rdx + xor r12,rbx + ror r14,6 + vpaddq xmm10,xmm6,XMMWORD[64+rbp] + xor rdi,r8 + add rcx,r12 + ror r13,14 + and r15,rdi + xor r14,rdx + add rcx,r13 + xor r15,r8 + ror r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + vmovdqa XMMWORD[96+rsp],xmm10 + vpalignr xmm8,xmm0,xmm7,8 + ror r13,23 + mov rcx,r14 + vpalignr xmm11,xmm4,xmm3,8 + mov r12,r11 + ror r14,5 +DB 143,72,120,195,200,56 + xor r13,r10 + xor r12,rax + vpsrlq xmm8,xmm8,7 + ror r13,4 + xor r14,rcx + vpaddq xmm7,xmm7,xmm11 + and r12,r10 + xor r13,r10 + add rbx,QWORD[112+rsp] + mov r15,rcx +DB 143,72,120,195,209,7 + xor r12,rax + ror r14,6 + vpxor xmm8,xmm8,xmm9 + xor r15,rdx + add rbx,r12 + ror r13,14 + and rdi,r15 +DB 143,104,120,195,222,3 + xor r14,rcx + add rbx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rdx + ror r14,28 + vpsrlq xmm10,xmm6,6 + add r9,rbx + add rbx,rdi + vpaddq xmm7,xmm7,xmm8 + mov r13,r9 + add r14,rbx +DB 143,72,120,195,203,42 + ror r13,23 + mov rbx,r14 + vpxor xmm11,xmm11,xmm10 + mov r12,r10 + ror r14,5 + xor r13,r9 + xor r12,r11 + vpxor xmm11,xmm11,xmm9 + ror r13,4 + xor r14,rbx + and r12,r9 + xor r13,r9 + vpaddq xmm7,xmm7,xmm11 + add rax,QWORD[120+rsp] + mov rdi,rbx + xor r12,r11 + ror r14,6 + vpaddq xmm10,xmm7,XMMWORD[96+rbp] + xor rdi,rcx + add rax,r12 + ror r13,14 + and r15,rdi + xor r14,rbx + add rax,r13 + xor r15,rcx + ror r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + vmovdqa XMMWORD[112+rsp],xmm10 + cmp BYTE[135+rbp],0 + jne NEAR $L$xop_00_47 + ror r13,23 + mov rax,r14 + mov r12,r9 + ror r14,5 + xor r13,r8 + xor r12,r10 + ror r13,4 + xor r14,rax + and r12,r8 + xor r13,r8 + add r11,QWORD[rsp] + mov r15,rax + xor r12,r10 + ror r14,6 + xor r15,rbx + add r11,r12 + ror r13,14 + and rdi,r15 + xor r14,rax + add r11,r13 + xor rdi,rbx + ror r14,28 + add rdx,r11 + add r11,rdi + mov r13,rdx + add r14,r11 + ror r13,23 + mov r11,r14 + mov r12,r8 + ror r14,5 + xor r13,rdx + xor r12,r9 + ror r13,4 + xor r14,r11 + and r12,rdx + xor r13,rdx + add r10,QWORD[8+rsp] + mov rdi,r11 + xor r12,r9 + ror r14,6 + xor rdi,rax + add r10,r12 + ror r13,14 + and r15,rdi + xor r14,r11 + add r10,r13 + xor r15,rax + ror r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + ror r13,23 + mov r10,r14 + mov r12,rdx + ror r14,5 + xor r13,rcx + xor r12,r8 + ror r13,4 + xor r14,r10 + and r12,rcx + xor r13,rcx + add r9,QWORD[16+rsp] + mov r15,r10 + xor r12,r8 + ror r14,6 + xor 
r15,r11 + add r9,r12 + ror r13,14 + and rdi,r15 + xor r14,r10 + add r9,r13 + xor rdi,r11 + ror r14,28 + add rbx,r9 + add r9,rdi + mov r13,rbx + add r14,r9 + ror r13,23 + mov r9,r14 + mov r12,rcx + ror r14,5 + xor r13,rbx + xor r12,rdx + ror r13,4 + xor r14,r9 + and r12,rbx + xor r13,rbx + add r8,QWORD[24+rsp] + mov rdi,r9 + xor r12,rdx + ror r14,6 + xor rdi,r10 + add r8,r12 + ror r13,14 + and r15,rdi + xor r14,r9 + add r8,r13 + xor r15,r10 + ror r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + ror r13,23 + mov r8,r14 + mov r12,rbx + ror r14,5 + xor r13,rax + xor r12,rcx + ror r13,4 + xor r14,r8 + and r12,rax + xor r13,rax + add rdx,QWORD[32+rsp] + mov r15,r8 + xor r12,rcx + ror r14,6 + xor r15,r9 + add rdx,r12 + ror r13,14 + and rdi,r15 + xor r14,r8 + add rdx,r13 + xor rdi,r9 + ror r14,28 + add r11,rdx + add rdx,rdi + mov r13,r11 + add r14,rdx + ror r13,23 + mov rdx,r14 + mov r12,rax + ror r14,5 + xor r13,r11 + xor r12,rbx + ror r13,4 + xor r14,rdx + and r12,r11 + xor r13,r11 + add rcx,QWORD[40+rsp] + mov rdi,rdx + xor r12,rbx + ror r14,6 + xor rdi,r8 + add rcx,r12 + ror r13,14 + and r15,rdi + xor r14,rdx + add rcx,r13 + xor r15,r8 + ror r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + ror r13,23 + mov rcx,r14 + mov r12,r11 + ror r14,5 + xor r13,r10 + xor r12,rax + ror r13,4 + xor r14,rcx + and r12,r10 + xor r13,r10 + add rbx,QWORD[48+rsp] + mov r15,rcx + xor r12,rax + ror r14,6 + xor r15,rdx + add rbx,r12 + ror r13,14 + and rdi,r15 + xor r14,rcx + add rbx,r13 + xor rdi,rdx + ror r14,28 + add r9,rbx + add rbx,rdi + mov r13,r9 + add r14,rbx + ror r13,23 + mov rbx,r14 + mov r12,r10 + ror r14,5 + xor r13,r9 + xor r12,r11 + ror r13,4 + xor r14,rbx + and r12,r9 + xor r13,r9 + add rax,QWORD[56+rsp] + mov rdi,rbx + xor r12,r11 + ror r14,6 + xor rdi,rcx + add rax,r12 + ror r13,14 + and r15,rdi + xor r14,rbx + add rax,r13 + xor r15,rcx + ror r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + ror r13,23 + mov rax,r14 + mov r12,r9 + ror r14,5 + xor r13,r8 + xor r12,r10 + ror r13,4 + xor r14,rax + and r12,r8 + xor r13,r8 + add r11,QWORD[64+rsp] + mov r15,rax + xor r12,r10 + ror r14,6 + xor r15,rbx + add r11,r12 + ror r13,14 + and rdi,r15 + xor r14,rax + add r11,r13 + xor rdi,rbx + ror r14,28 + add rdx,r11 + add r11,rdi + mov r13,rdx + add r14,r11 + ror r13,23 + mov r11,r14 + mov r12,r8 + ror r14,5 + xor r13,rdx + xor r12,r9 + ror r13,4 + xor r14,r11 + and r12,rdx + xor r13,rdx + add r10,QWORD[72+rsp] + mov rdi,r11 + xor r12,r9 + ror r14,6 + xor rdi,rax + add r10,r12 + ror r13,14 + and r15,rdi + xor r14,r11 + add r10,r13 + xor r15,rax + ror r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + ror r13,23 + mov r10,r14 + mov r12,rdx + ror r14,5 + xor r13,rcx + xor r12,r8 + ror r13,4 + xor r14,r10 + and r12,rcx + xor r13,rcx + add r9,QWORD[80+rsp] + mov r15,r10 + xor r12,r8 + ror r14,6 + xor r15,r11 + add r9,r12 + ror r13,14 + and rdi,r15 + xor r14,r10 + add r9,r13 + xor rdi,r11 + ror r14,28 + add rbx,r9 + add r9,rdi + mov r13,rbx + add r14,r9 + ror r13,23 + mov r9,r14 + mov r12,rcx + ror r14,5 + xor r13,rbx + xor r12,rdx + ror r13,4 + xor r14,r9 + and r12,rbx + xor r13,rbx + add r8,QWORD[88+rsp] + mov rdi,r9 + xor r12,rdx + ror r14,6 + xor rdi,r10 + add r8,r12 + ror r13,14 + and r15,rdi + xor r14,r9 + add r8,r13 + xor r15,r10 + ror r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + ror r13,23 + mov r8,r14 + mov r12,rbx + ror r14,5 + xor r13,rax + xor r12,rcx + ror r13,4 + xor r14,r8 + and r12,rax + xor r13,rax + add rdx,QWORD[96+rsp] + mov 
r15,r8 + xor r12,rcx + ror r14,6 + xor r15,r9 + add rdx,r12 + ror r13,14 + and rdi,r15 + xor r14,r8 + add rdx,r13 + xor rdi,r9 + ror r14,28 + add r11,rdx + add rdx,rdi + mov r13,r11 + add r14,rdx + ror r13,23 + mov rdx,r14 + mov r12,rax + ror r14,5 + xor r13,r11 + xor r12,rbx + ror r13,4 + xor r14,rdx + and r12,r11 + xor r13,r11 + add rcx,QWORD[104+rsp] + mov rdi,rdx + xor r12,rbx + ror r14,6 + xor rdi,r8 + add rcx,r12 + ror r13,14 + and r15,rdi + xor r14,rdx + add rcx,r13 + xor r15,r8 + ror r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + ror r13,23 + mov rcx,r14 + mov r12,r11 + ror r14,5 + xor r13,r10 + xor r12,rax + ror r13,4 + xor r14,rcx + and r12,r10 + xor r13,r10 + add rbx,QWORD[112+rsp] + mov r15,rcx + xor r12,rax + ror r14,6 + xor r15,rdx + add rbx,r12 + ror r13,14 + and rdi,r15 + xor r14,rcx + add rbx,r13 + xor rdi,rdx + ror r14,28 + add r9,rbx + add rbx,rdi + mov r13,r9 + add r14,rbx + ror r13,23 + mov rbx,r14 + mov r12,r10 + ror r14,5 + xor r13,r9 + xor r12,r11 + ror r13,4 + xor r14,rbx + and r12,r9 + xor r13,r9 + add rax,QWORD[120+rsp] + mov rdi,rbx + xor r12,r11 + ror r14,6 + xor rdi,rcx + add rax,r12 + ror r13,14 + and r15,rdi + xor r14,rbx + add rax,r13 + xor r15,rcx + ror r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + mov rdi,QWORD[((128+0))+rsp] + mov rax,r14 + + add rax,QWORD[rdi] + lea rsi,[128+rsi] + add rbx,QWORD[8+rdi] + add rcx,QWORD[16+rdi] + add rdx,QWORD[24+rdi] + add r8,QWORD[32+rdi] + add r9,QWORD[40+rdi] + add r10,QWORD[48+rdi] + add r11,QWORD[56+rdi] + + cmp rsi,QWORD[((128+16))+rsp] + + mov QWORD[rdi],rax + mov QWORD[8+rdi],rbx + mov QWORD[16+rdi],rcx + mov QWORD[24+rdi],rdx + mov QWORD[32+rdi],r8 + mov QWORD[40+rdi],r9 + mov QWORD[48+rdi],r10 + mov QWORD[56+rdi],r11 + jb NEAR $L$loop_xop + + mov rsi,QWORD[((128+24))+rsp] + vzeroupper + movaps xmm6,XMMWORD[((128+32))+rsp] + movaps xmm7,XMMWORD[((128+48))+rsp] + movaps xmm8,XMMWORD[((128+64))+rsp] + movaps xmm9,XMMWORD[((128+80))+rsp] + movaps xmm10,XMMWORD[((128+96))+rsp] + movaps xmm11,XMMWORD[((128+112))+rsp] + mov r15,QWORD[rsi] + mov r14,QWORD[8+rsi] + mov r13,QWORD[16+rsi] + mov r12,QWORD[24+rsi] + mov rbp,QWORD[32+rsi] + mov rbx,QWORD[40+rsi] + lea rsp,[48+rsi] +$L$epilogue_xop: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_sha512_block_data_order_xop: + +ALIGN 64 +sha512_block_data_order_avx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha512_block_data_order_avx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + +$L$avx_shortcut: + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + mov r11,rsp + shl rdx,4 + sub rsp,256 + lea rdx,[rdx*8+rsi] + and rsp,-64 + mov QWORD[((128+0))+rsp],rdi + mov QWORD[((128+8))+rsp],rsi + mov QWORD[((128+16))+rsp],rdx + mov QWORD[((128+24))+rsp],r11 + movaps XMMWORD[(128+32)+rsp],xmm6 + movaps XMMWORD[(128+48)+rsp],xmm7 + movaps XMMWORD[(128+64)+rsp],xmm8 + movaps XMMWORD[(128+80)+rsp],xmm9 + movaps XMMWORD[(128+96)+rsp],xmm10 + movaps XMMWORD[(128+112)+rsp],xmm11 +$L$prologue_avx: + + vzeroupper + mov rax,QWORD[rdi] + mov rbx,QWORD[8+rdi] + mov rcx,QWORD[16+rdi] + mov rdx,QWORD[24+rdi] + mov r8,QWORD[32+rdi] + mov r9,QWORD[40+rdi] + mov r10,QWORD[48+rdi] + mov r11,QWORD[56+rdi] + jmp NEAR $L$loop_avx +ALIGN 16 +$L$loop_avx: + vmovdqa xmm11,XMMWORD[((K512+1280))] + vmovdqu xmm0,XMMWORD[rsi] + lea rbp,[((K512+128))] + vmovdqu xmm1,XMMWORD[16+rsi] + vmovdqu xmm2,XMMWORD[32+rsi] + vpshufb xmm0,xmm0,xmm11 + vmovdqu xmm3,XMMWORD[48+rsi] + 
vpshufb xmm1,xmm1,xmm11 + vmovdqu xmm4,XMMWORD[64+rsi] + vpshufb xmm2,xmm2,xmm11 + vmovdqu xmm5,XMMWORD[80+rsi] + vpshufb xmm3,xmm3,xmm11 + vmovdqu xmm6,XMMWORD[96+rsi] + vpshufb xmm4,xmm4,xmm11 + vmovdqu xmm7,XMMWORD[112+rsi] + vpshufb xmm5,xmm5,xmm11 + vpaddq xmm8,xmm0,XMMWORD[((-128))+rbp] + vpshufb xmm6,xmm6,xmm11 + vpaddq xmm9,xmm1,XMMWORD[((-96))+rbp] + vpshufb xmm7,xmm7,xmm11 + vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp] + vpaddq xmm11,xmm3,XMMWORD[((-32))+rbp] + vmovdqa XMMWORD[rsp],xmm8 + vpaddq xmm8,xmm4,XMMWORD[rbp] + vmovdqa XMMWORD[16+rsp],xmm9 + vpaddq xmm9,xmm5,XMMWORD[32+rbp] + vmovdqa XMMWORD[32+rsp],xmm10 + vpaddq xmm10,xmm6,XMMWORD[64+rbp] + vmovdqa XMMWORD[48+rsp],xmm11 + vpaddq xmm11,xmm7,XMMWORD[96+rbp] + vmovdqa XMMWORD[64+rsp],xmm8 + mov r14,rax + vmovdqa XMMWORD[80+rsp],xmm9 + mov rdi,rbx + vmovdqa XMMWORD[96+rsp],xmm10 + xor rdi,rcx + vmovdqa XMMWORD[112+rsp],xmm11 + mov r13,r8 + jmp NEAR $L$avx_00_47 + +ALIGN 16 +$L$avx_00_47: + add rbp,256 + vpalignr xmm8,xmm1,xmm0,8 + shrd r13,r13,23 + mov rax,r14 + vpalignr xmm11,xmm5,xmm4,8 + mov r12,r9 + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,r8 + xor r12,r10 + vpaddq xmm0,xmm0,xmm11 + shrd r13,r13,4 + xor r14,rax + vpsrlq xmm11,xmm8,7 + and r12,r8 + xor r13,r8 + vpsllq xmm9,xmm8,56 + add r11,QWORD[rsp] + mov r15,rax + vpxor xmm8,xmm11,xmm10 + xor r12,r10 + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,rbx + add r11,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,rax + add r11,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rbx + shrd r14,r14,28 + vpsrlq xmm11,xmm7,6 + add rdx,r11 + add r11,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,rdx + add r14,r11 + vpsllq xmm10,xmm7,3 + shrd r13,r13,23 + mov r11,r14 + vpaddq xmm0,xmm0,xmm8 + mov r12,r8 + shrd r14,r14,5 + vpsrlq xmm9,xmm7,19 + xor r13,rdx + xor r12,r9 + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,r11 + vpsllq xmm10,xmm10,42 + and r12,rdx + xor r13,rdx + vpxor xmm11,xmm11,xmm9 + add r10,QWORD[8+rsp] + mov rdi,r11 + vpsrlq xmm9,xmm9,42 + xor r12,r9 + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,rax + add r10,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm0,xmm0,xmm11 + xor r14,r11 + add r10,r13 + vpaddq xmm10,xmm0,XMMWORD[((-128))+rbp] + xor r15,rax + shrd r14,r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + vmovdqa XMMWORD[rsp],xmm10 + vpalignr xmm8,xmm2,xmm1,8 + shrd r13,r13,23 + mov r10,r14 + vpalignr xmm11,xmm6,xmm5,8 + mov r12,rdx + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,rcx + xor r12,r8 + vpaddq xmm1,xmm1,xmm11 + shrd r13,r13,4 + xor r14,r10 + vpsrlq xmm11,xmm8,7 + and r12,rcx + xor r13,rcx + vpsllq xmm9,xmm8,56 + add r9,QWORD[16+rsp] + mov r15,r10 + vpxor xmm8,xmm11,xmm10 + xor r12,r8 + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,r11 + add r9,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,r10 + add r9,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r11 + shrd r14,r14,28 + vpsrlq xmm11,xmm0,6 + add rbx,r9 + add r9,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,rbx + add r14,r9 + vpsllq xmm10,xmm0,3 + shrd r13,r13,23 + mov r9,r14 + vpaddq xmm1,xmm1,xmm8 + mov r12,rcx + shrd r14,r14,5 + vpsrlq xmm9,xmm0,19 + xor r13,rbx + xor r12,rdx + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,r9 + vpsllq xmm10,xmm10,42 + and r12,rbx + xor r13,rbx + vpxor xmm11,xmm11,xmm9 + add r8,QWORD[24+rsp] + mov rdi,r9 + vpsrlq xmm9,xmm9,42 + xor r12,rdx + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,r10 + add r8,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and 
+ vpaddq xmm1,xmm1,xmm11
+ xor r14,r9
+ add r8,r13
+ vpaddq xmm10,xmm1,XMMWORD[((-96))+rbp]
+ xor r15,r10
+ shrd r14,r14,28
+ add rax,r8
+ add r8,r15
+ mov r13,rax
+ add r14,r8
+ vmovdqa XMMWORD[16+rsp],xmm10
+ vpalignr xmm8,xmm3,xmm2,8
+ shrd r13,r13,23
+ mov r8,r14
+ vpalignr xmm11,xmm7,xmm6,8
+ mov r12,rbx
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,rax
+ xor r12,rcx
+ vpaddq xmm2,xmm2,xmm11
+ shrd r13,r13,4
+ xor r14,r8
+ vpsrlq xmm11,xmm8,7
+ and r12,rax
+ xor r13,rax
+ vpsllq xmm9,xmm8,56
+ add rdx,QWORD[32+rsp]
+ mov r15,r8
+ vpxor xmm8,xmm11,xmm10
+ xor r12,rcx
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,r9
+ add rdx,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,r8
+ add rdx,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,r9
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm1,6
+ add r11,rdx
+ add rdx,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,r11
+ add r14,rdx
+ vpsllq xmm10,xmm1,3
+ shrd r13,r13,23
+ mov rdx,r14
+ vpaddq xmm2,xmm2,xmm8
+ mov r12,rax
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm1,19
+ xor r13,r11
+ xor r12,rbx
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,rdx
+ vpsllq xmm10,xmm10,42
+ and r12,r11
+ xor r13,r11
+ vpxor xmm11,xmm11,xmm9
+ add rcx,QWORD[40+rsp]
+ mov rdi,rdx
+ vpsrlq xmm9,xmm9,42
+ xor r12,rbx
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,r8
+ add rcx,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm2,xmm2,xmm11
+ xor r14,rdx
+ add rcx,r13
+ vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp]
+ xor r15,r8
+ shrd r14,r14,28
+ add r10,rcx
+ add rcx,r15
+ mov r13,r10
+ add r14,rcx
+ vmovdqa XMMWORD[32+rsp],xmm10
+ vpalignr xmm8,xmm4,xmm3,8
+ shrd r13,r13,23
+ mov rcx,r14
+ vpalignr xmm11,xmm0,xmm7,8
+ mov r12,r11
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,r10
+ xor r12,rax
+ vpaddq xmm3,xmm3,xmm11
+ shrd r13,r13,4
+ xor r14,rcx
+ vpsrlq xmm11,xmm8,7
+ and r12,r10
+ xor r13,r10
+ vpsllq xmm9,xmm8,56
+ add rbx,QWORD[48+rsp]
+ mov r15,rcx
+ vpxor xmm8,xmm11,xmm10
+ xor r12,rax
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,rdx
+ add rbx,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,rcx
+ add rbx,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,rdx
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm2,6
+ add r9,rbx
+ add rbx,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,r9
+ add r14,rbx
+ vpsllq xmm10,xmm2,3
+ shrd r13,r13,23
+ mov rbx,r14
+ vpaddq xmm3,xmm3,xmm8
+ mov r12,r10
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm2,19
+ xor r13,r9
+ xor r12,r11
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,rbx
+ vpsllq xmm10,xmm10,42
+ and r12,r9
+ xor r13,r9
+ vpxor xmm11,xmm11,xmm9
+ add rax,QWORD[56+rsp]
+ mov rdi,rbx
+ vpsrlq xmm9,xmm9,42
+ xor r12,r11
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,rcx
+ add rax,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm3,xmm3,xmm11
+ xor r14,rbx
+ add rax,r13
+ vpaddq xmm10,xmm3,XMMWORD[((-32))+rbp]
+ xor r15,rcx
+ shrd r14,r14,28
+ add r8,rax
+ add rax,r15
+ mov r13,r8
+ add r14,rax
+ vmovdqa XMMWORD[48+rsp],xmm10
+ vpalignr xmm8,xmm5,xmm4,8
+ shrd r13,r13,23
+ mov rax,r14
+ vpalignr xmm11,xmm1,xmm0,8
+ mov r12,r9
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,r8
+ xor r12,r10
+ vpaddq xmm4,xmm4,xmm11
+ shrd r13,r13,4
+ xor r14,rax
+ vpsrlq xmm11,xmm8,7
+ and r12,r8
+ xor r13,r8
+ vpsllq xmm9,xmm8,56
+ add r11,QWORD[64+rsp]
+ mov r15,rax
+ vpxor xmm8,xmm11,xmm10
+ xor r12,r10
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,rbx
+ add r11,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,rax
+ add r11,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,rbx
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm3,6
+ add rdx,r11
+ add r11,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,rdx
+ add r14,r11
+ vpsllq xmm10,xmm3,3
+ shrd r13,r13,23
+ mov r11,r14
+ vpaddq xmm4,xmm4,xmm8
+ mov r12,r8
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm3,19
+ xor r13,rdx
+ xor r12,r9
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,r11
+ vpsllq xmm10,xmm10,42
+ and r12,rdx
+ xor r13,rdx
+ vpxor xmm11,xmm11,xmm9
+ add r10,QWORD[72+rsp]
+ mov rdi,r11
+ vpsrlq xmm9,xmm9,42
+ xor r12,r9
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,rax
+ add r10,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm4,xmm4,xmm11
+ xor r14,r11
+ add r10,r13
+ vpaddq xmm10,xmm4,XMMWORD[rbp]
+ xor r15,rax
+ shrd r14,r14,28
+ add rcx,r10
+ add r10,r15
+ mov r13,rcx
+ add r14,r10
+ vmovdqa XMMWORD[64+rsp],xmm10
+ vpalignr xmm8,xmm6,xmm5,8
+ shrd r13,r13,23
+ mov r10,r14
+ vpalignr xmm11,xmm2,xmm1,8
+ mov r12,rdx
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,rcx
+ xor r12,r8
+ vpaddq xmm5,xmm5,xmm11
+ shrd r13,r13,4
+ xor r14,r10
+ vpsrlq xmm11,xmm8,7
+ and r12,rcx
+ xor r13,rcx
+ vpsllq xmm9,xmm8,56
+ add r9,QWORD[80+rsp]
+ mov r15,r10
+ vpxor xmm8,xmm11,xmm10
+ xor r12,r8
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,r11
+ add r9,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,r10
+ add r9,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,r11
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm4,6
+ add rbx,r9
+ add r9,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,rbx
+ add r14,r9
+ vpsllq xmm10,xmm4,3
+ shrd r13,r13,23
+ mov r9,r14
+ vpaddq xmm5,xmm5,xmm8
+ mov r12,rcx
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm4,19
+ xor r13,rbx
+ xor r12,rdx
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,r9
+ vpsllq xmm10,xmm10,42
+ and r12,rbx
+ xor r13,rbx
+ vpxor xmm11,xmm11,xmm9
+ add r8,QWORD[88+rsp]
+ mov rdi,r9
+ vpsrlq xmm9,xmm9,42
+ xor r12,rdx
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,r10
+ add r8,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm5,xmm5,xmm11
+ xor r14,r9
+ add r8,r13
+ vpaddq xmm10,xmm5,XMMWORD[32+rbp]
+ xor r15,r10
+ shrd r14,r14,28
+ add rax,r8
+ add r8,r15
+ mov r13,rax
+ add r14,r8
+ vmovdqa XMMWORD[80+rsp],xmm10
+ vpalignr xmm8,xmm7,xmm6,8
+ shrd r13,r13,23
+ mov r8,r14
+ vpalignr xmm11,xmm3,xmm2,8
+ mov r12,rbx
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,rax
+ xor r12,rcx
+ vpaddq xmm6,xmm6,xmm11
+ shrd r13,r13,4
+ xor r14,r8
+ vpsrlq xmm11,xmm8,7
+ and r12,rax
+ xor r13,rax
+ vpsllq xmm9,xmm8,56
+ add rdx,QWORD[96+rsp]
+ mov r15,r8
+ vpxor xmm8,xmm11,xmm10
+ xor r12,rcx
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,r9
+ add rdx,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,r8
+ add rdx,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,r9
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm5,6
+ add r11,rdx
+ add rdx,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,r11
+ add r14,rdx
+ vpsllq xmm10,xmm5,3
+ shrd r13,r13,23
+ mov rdx,r14
+ vpaddq xmm6,xmm6,xmm8
+ mov r12,rax
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm5,19
+ xor r13,r11
+ xor r12,rbx
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,rdx
+ vpsllq xmm10,xmm10,42
+ and r12,r11
+ xor r13,r11
+ vpxor xmm11,xmm11,xmm9
+ add rcx,QWORD[104+rsp]
+ mov rdi,rdx
+ vpsrlq xmm9,xmm9,42
+ xor r12,rbx
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,r8
+ add rcx,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm6,xmm6,xmm11
+ xor r14,rdx
+ add rcx,r13
+ vpaddq xmm10,xmm6,XMMWORD[64+rbp]
+ xor r15,r8
+ shrd r14,r14,28
+ add r10,rcx
+ add rcx,r15
+ mov r13,r10
+ add r14,rcx
+ vmovdqa XMMWORD[96+rsp],xmm10
+ vpalignr xmm8,xmm0,xmm7,8
+ shrd r13,r13,23
+ mov rcx,r14
+ vpalignr xmm11,xmm4,xmm3,8
+ mov r12,r11
+ shrd r14,r14,5
+ vpsrlq xmm10,xmm8,1
+ xor r13,r10
+ xor r12,rax
+ vpaddq xmm7,xmm7,xmm11
+ shrd r13,r13,4
+ xor r14,rcx
+ vpsrlq xmm11,xmm8,7
+ and r12,r10
+ xor r13,r10
+ vpsllq xmm9,xmm8,56
+ add rbx,QWORD[112+rsp]
+ mov r15,rcx
+ vpxor xmm8,xmm11,xmm10
+ xor r12,rax
+ shrd r14,r14,6
+ vpsrlq xmm10,xmm10,7
+ xor r15,rdx
+ add rbx,r12
+ vpxor xmm8,xmm8,xmm9
+ shrd r13,r13,14
+ and rdi,r15
+ vpsllq xmm9,xmm9,7
+ xor r14,rcx
+ add rbx,r13
+ vpxor xmm8,xmm8,xmm10
+ xor rdi,rdx
+ shrd r14,r14,28
+ vpsrlq xmm11,xmm6,6
+ add r9,rbx
+ add rbx,rdi
+ vpxor xmm8,xmm8,xmm9
+ mov r13,r9
+ add r14,rbx
+ vpsllq xmm10,xmm6,3
+ shrd r13,r13,23
+ mov rbx,r14
+ vpaddq xmm7,xmm7,xmm8
+ mov r12,r10
+ shrd r14,r14,5
+ vpsrlq xmm9,xmm6,19
+ xor r13,r9
+ xor r12,r11
+ vpxor xmm11,xmm11,xmm10
+ shrd r13,r13,4
+ xor r14,rbx
+ vpsllq xmm10,xmm10,42
+ and r12,r9
+ xor r13,r9
+ vpxor xmm11,xmm11,xmm9
+ add rax,QWORD[120+rsp]
+ mov rdi,rbx
+ vpsrlq xmm9,xmm9,42
+ xor r12,r11
+ shrd r14,r14,6
+ vpxor xmm11,xmm11,xmm10
+ xor rdi,rcx
+ add rax,r12
+ vpxor xmm11,xmm11,xmm9
+ shrd r13,r13,14
+ and r15,rdi
+ vpaddq xmm7,xmm7,xmm11
+ xor r14,rbx
+ add rax,r13
+ vpaddq xmm10,xmm7,XMMWORD[96+rbp]
+ xor r15,rcx
+ shrd r14,r14,28
+ add r8,rax
+ add rax,r15
+ mov r13,r8
+ add r14,rax
+ vmovdqa XMMWORD[112+rsp],xmm10
+ cmp BYTE[135+rbp],0
+ jne NEAR $L$avx_00_47
+ shrd r13,r13,23
+ mov rax,r14
+ mov r12,r9
+ shrd r14,r14,5
+ xor r13,r8
+ xor r12,r10
+ shrd r13,r13,4
+ xor r14,rax
+ and r12,r8
+ xor r13,r8
+ add r11,QWORD[rsp]
+ mov r15,rax
+ xor r12,r10
+ shrd r14,r14,6
+ xor r15,rbx
+ add r11,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,rax
+ add r11,r13
+ xor rdi,rbx
+ shrd r14,r14,28
+ add rdx,r11
+ add r11,rdi
+ mov r13,rdx
+ add r14,r11
+ shrd r13,r13,23
+ mov r11,r14
+ mov r12,r8
+ shrd r14,r14,5
+ xor r13,rdx
+ xor r12,r9
+ shrd r13,r13,4
+ xor r14,r11
+ and r12,rdx
+ xor r13,rdx
+ add r10,QWORD[8+rsp]
+ mov rdi,r11
+ xor r12,r9
+ shrd r14,r14,6
+ xor rdi,rax
+ add r10,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,r11
+ add r10,r13
+ xor r15,rax
+ shrd r14,r14,28
+ add rcx,r10
+ add r10,r15
+ mov r13,rcx
+ add r14,r10
+ shrd r13,r13,23
+ mov r10,r14
+ mov r12,rdx
+ shrd r14,r14,5
+ xor r13,rcx
+ xor r12,r8
+ shrd r13,r13,4
+ xor r14,r10
+ and r12,rcx
+ xor r13,rcx
+ add r9,QWORD[16+rsp]
+ mov r15,r10
+ xor r12,r8
+ shrd r14,r14,6
+ xor r15,r11
+ add r9,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,r10
+ add r9,r13
+ xor rdi,r11
+ shrd r14,r14,28
+ add rbx,r9
+ add r9,rdi
+ mov r13,rbx
+ add r14,r9
+ shrd r13,r13,23
+ mov r9,r14
+ mov r12,rcx
+ shrd r14,r14,5
+ xor r13,rbx
+ xor r12,rdx
+ shrd r13,r13,4
+ xor r14,r9
+ and r12,rbx
+ xor r13,rbx
+ add r8,QWORD[24+rsp]
+ mov rdi,r9
+ xor r12,rdx
+ shrd r14,r14,6
+ xor rdi,r10
+ add r8,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,r9
+ add r8,r13
+ xor r15,r10
+ shrd r14,r14,28
+ add rax,r8
+ add r8,r15
+ mov r13,rax
+ add r14,r8
+ shrd r13,r13,23
+ mov r8,r14
+ mov r12,rbx
+ shrd r14,r14,5
+ xor r13,rax
+ xor r12,rcx
+ shrd r13,r13,4
+ xor r14,r8
+ and r12,rax
+ xor r13,rax
+ add rdx,QWORD[32+rsp]
+ mov r15,r8
+ xor r12,rcx
+ shrd r14,r14,6
+ xor r15,r9
+ add rdx,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,r8
+ add rdx,r13
+ xor rdi,r9
+ shrd r14,r14,28
+ add r11,rdx
+ add rdx,rdi
+ mov r13,r11
+ add r14,rdx
+ shrd r13,r13,23
+ mov rdx,r14
+ mov r12,rax
+ shrd r14,r14,5
+ xor r13,r11
+ xor r12,rbx
+ shrd r13,r13,4
+ xor r14,rdx
+ and r12,r11
+ xor r13,r11
+ add rcx,QWORD[40+rsp]
+ mov rdi,rdx
+ xor r12,rbx
+ shrd r14,r14,6
+ xor rdi,r8
+ add rcx,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,rdx
+ add rcx,r13
+ xor r15,r8
+ shrd r14,r14,28
+ add r10,rcx
+ add rcx,r15
+ mov r13,r10
+ add r14,rcx
+ shrd r13,r13,23
+ mov rcx,r14
+ mov r12,r11
+ shrd r14,r14,5
+ xor r13,r10
+ xor r12,rax
+ shrd r13,r13,4
+ xor r14,rcx
+ and r12,r10
+ xor r13,r10
+ add rbx,QWORD[48+rsp]
+ mov r15,rcx
+ xor r12,rax
+ shrd r14,r14,6
+ xor r15,rdx
+ add rbx,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,rcx
+ add rbx,r13
+ xor rdi,rdx
+ shrd r14,r14,28
+ add r9,rbx
+ add rbx,rdi
+ mov r13,r9
+ add r14,rbx
+ shrd r13,r13,23
+ mov rbx,r14
+ mov r12,r10
+ shrd r14,r14,5
+ xor r13,r9
+ xor r12,r11
+ shrd r13,r13,4
+ xor r14,rbx
+ and r12,r9
+ xor r13,r9
+ add rax,QWORD[56+rsp]
+ mov rdi,rbx
+ xor r12,r11
+ shrd r14,r14,6
+ xor rdi,rcx
+ add rax,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,rbx
+ add rax,r13
+ xor r15,rcx
+ shrd r14,r14,28
+ add r8,rax
+ add rax,r15
+ mov r13,r8
+ add r14,rax
+ shrd r13,r13,23
+ mov rax,r14
+ mov r12,r9
+ shrd r14,r14,5
+ xor r13,r8
+ xor r12,r10
+ shrd r13,r13,4
+ xor r14,rax
+ and r12,r8
+ xor r13,r8
+ add r11,QWORD[64+rsp]
+ mov r15,rax
+ xor r12,r10
+ shrd r14,r14,6
+ xor r15,rbx
+ add r11,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,rax
+ add r11,r13
+ xor rdi,rbx
+ shrd r14,r14,28
+ add rdx,r11
+ add r11,rdi
+ mov r13,rdx
+ add r14,r11
+ shrd r13,r13,23
+ mov r11,r14
+ mov r12,r8
+ shrd r14,r14,5
+ xor r13,rdx
+ xor r12,r9
+ shrd r13,r13,4
+ xor r14,r11
+ and r12,rdx
+ xor r13,rdx
+ add r10,QWORD[72+rsp]
+ mov rdi,r11
+ xor r12,r9
+ shrd r14,r14,6
+ xor rdi,rax
+ add r10,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,r11
+ add r10,r13
+ xor r15,rax
+ shrd r14,r14,28
+ add rcx,r10
+ add r10,r15
+ mov r13,rcx
+ add r14,r10
+ shrd r13,r13,23
+ mov r10,r14
+ mov r12,rdx
+ shrd r14,r14,5
+ xor r13,rcx
+ xor r12,r8
+ shrd r13,r13,4
+ xor r14,r10
+ and r12,rcx
+ xor r13,rcx
+ add r9,QWORD[80+rsp]
+ mov r15,r10
+ xor r12,r8
+ shrd r14,r14,6
+ xor r15,r11
+ add r9,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,r10
+ add r9,r13
+ xor rdi,r11
+ shrd r14,r14,28
+ add rbx,r9
+ add r9,rdi
+ mov r13,rbx
+ add r14,r9
+ shrd r13,r13,23
+ mov r9,r14
+ mov r12,rcx
+ shrd r14,r14,5
+ xor r13,rbx
+ xor r12,rdx
+ shrd r13,r13,4
+ xor r14,r9
+ and r12,rbx
+ xor r13,rbx
+ add r8,QWORD[88+rsp]
+ mov rdi,r9
+ xor r12,rdx
+ shrd r14,r14,6
+ xor rdi,r10
+ add r8,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,r9
+ add r8,r13
+ xor r15,r10
+ shrd r14,r14,28
+ add rax,r8
+ add r8,r15
+ mov r13,rax
+ add r14,r8
+ shrd r13,r13,23
+ mov r8,r14
+ mov r12,rbx
+ shrd r14,r14,5
+ xor r13,rax
+ xor r12,rcx
+ shrd r13,r13,4
+ xor r14,r8
+ and r12,rax
+ xor r13,rax
+ add rdx,QWORD[96+rsp]
+ mov r15,r8
+ xor r12,rcx
+ shrd r14,r14,6
+ xor r15,r9
+ add rdx,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,r8
+ add rdx,r13
+ xor rdi,r9
+ shrd r14,r14,28
+ add r11,rdx
+ add rdx,rdi
+ mov r13,r11
+ add r14,rdx
+ shrd r13,r13,23
+ mov rdx,r14
+ mov r12,rax
+ shrd r14,r14,5
+ xor r13,r11
+ xor r12,rbx
+ shrd r13,r13,4
+ xor r14,rdx
+ and r12,r11
+ xor r13,r11
+ add rcx,QWORD[104+rsp]
+ mov rdi,rdx
+ xor r12,rbx
+ shrd r14,r14,6
+ xor rdi,r8
+ add rcx,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,rdx
+ add rcx,r13
+ xor r15,r8
+ shrd r14,r14,28
+ add r10,rcx
+ add rcx,r15
+ mov r13,r10
+ add r14,rcx
+ shrd r13,r13,23
+ mov rcx,r14
+ mov r12,r11
+ shrd r14,r14,5
+ xor r13,r10
+ xor r12,rax
+ shrd r13,r13,4
+ xor r14,rcx
+ and r12,r10
+ xor r13,r10
+ add rbx,QWORD[112+rsp]
+ mov r15,rcx
+ xor r12,rax
+ shrd r14,r14,6
+ xor r15,rdx
+ add rbx,r12
+ shrd r13,r13,14
+ and rdi,r15
+ xor r14,rcx
+ add rbx,r13
+ xor rdi,rdx
+ shrd r14,r14,28
+ add r9,rbx
+ add rbx,rdi
+ mov r13,r9
+ add r14,rbx
+ shrd r13,r13,23
+ mov rbx,r14
+ mov r12,r10
+ shrd r14,r14,5
+ xor r13,r9
+ xor r12,r11
+ shrd r13,r13,4
+ xor r14,rbx
+ and r12,r9
+ xor r13,r9
+ add rax,QWORD[120+rsp]
+ mov rdi,rbx
+ xor r12,r11
+ shrd r14,r14,6
+ xor rdi,rcx
+ add rax,r12
+ shrd r13,r13,14
+ and r15,rdi
+ xor r14,rbx
+ add rax,r13
+ xor r15,rcx
+ shrd r14,r14,28
+ add r8,rax
+ add rax,r15
+ mov r13,r8
+ add r14,rax
+ mov rdi,QWORD[((128+0))+rsp]
+ mov rax,r14
+
+ add rax,QWORD[rdi]
+ lea rsi,[128+rsi]
+ add rbx,QWORD[8+rdi]
+ add rcx,QWORD[16+rdi]
+ add rdx,QWORD[24+rdi]
+ add r8,QWORD[32+rdi]
+ add r9,QWORD[40+rdi]
+ add r10,QWORD[48+rdi]
+ add r11,QWORD[56+rdi]
+
+ cmp rsi,QWORD[((128+16))+rsp]
+
+ mov QWORD[rdi],rax
+ mov QWORD[8+rdi],rbx
+ mov QWORD[16+rdi],rcx
+ mov QWORD[24+rdi],rdx
+ mov QWORD[32+rdi],r8
+ mov QWORD[40+rdi],r9
+ mov QWORD[48+rdi],r10
+ mov QWORD[56+rdi],r11
+ jb NEAR $L$loop_avx
+
+ mov rsi,QWORD[((128+24))+rsp]
+ vzeroupper
+ movaps xmm6,XMMWORD[((128+32))+rsp]
+ movaps xmm7,XMMWORD[((128+48))+rsp]
+ movaps xmm8,XMMWORD[((128+64))+rsp]
+ movaps xmm9,XMMWORD[((128+80))+rsp]
+ movaps xmm10,XMMWORD[((128+96))+rsp]
+ movaps xmm11,XMMWORD[((128+112))+rsp]
+ mov r15,QWORD[rsi]
+ mov r14,QWORD[8+rsi]
+ mov r13,QWORD[16+rsi]
+ mov r12,QWORD[24+rsi]
+ mov rbp,QWORD[32+rsi]
+ mov rbx,QWORD[40+rsi]
+ lea rsp,[48+rsi]
+$L$epilogue_avx:
+ mov rdi,QWORD[8+rsp] ;WIN64 epilogue
+ mov rsi,QWORD[16+rsp]
+ DB 0F3h,0C3h ;repret
+$L$SEH_end_sha512_block_data_order_avx:
 EXTERN __imp_RtlVirtualUnwind
 ALIGN 16
@@ -1903,9 +4190,23 @@ ALIGN 4
  DD $L$SEH_begin_sha512_block_data_order wrt ..imagebase
  DD $L$SEH_end_sha512_block_data_order wrt ..imagebase
  DD $L$SEH_info_sha512_block_data_order wrt ..imagebase
+ DD $L$SEH_begin_sha512_block_data_order_xop wrt ..imagebase
+ DD $L$SEH_end_sha512_block_data_order_xop wrt ..imagebase
+ DD $L$SEH_info_sha512_block_data_order_xop wrt ..imagebase
+ DD $L$SEH_begin_sha512_block_data_order_avx wrt ..imagebase
+ DD $L$SEH_end_sha512_block_data_order_avx wrt ..imagebase
+ DD $L$SEH_info_sha512_block_data_order_avx wrt ..imagebase
 section .xdata rdata align=8
 ALIGN 8
 $L$SEH_info_sha512_block_data_order:
 DB 9,0,0,0
  DD se_handler wrt ..imagebase
  DD $L$prologue wrt ..imagebase,$L$epilogue wrt ..imagebase
+$L$SEH_info_sha512_block_data_order_xop:
+DB 9,0,0,0
+ DD se_handler wrt ..imagebase
+ DD $L$prologue_xop wrt ..imagebase,$L$epilogue_xop wrt ..imagebase
+$L$SEH_info_sha512_block_data_order_avx:
+DB 9,0,0,0
+ DD se_handler wrt ..imagebase
+ DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase
diff --git a/packager/third_party/curl/curl.gyp b/packager/third_party/curl/curl.gyp
index eec6da48d8..c82a98514b 100644
--- a/packager/third_party/curl/curl.gyp
+++ b/packager/third_party/curl/curl.gyp
@@ -108,14 +108,14 @@
         'source/lib/content_encoding.c',
         'source/lib/cookie.c',
         'source/lib/curl_addrinfo.c',
+        'source/lib/curl_des.c',
+        'source/lib/curl_endian.c',
         'source/lib/curl_fnmatch.c',
         'source/lib/curl_gethostname.c',
         'source/lib/curl_gssapi.c',
         'source/lib/curl_memrchr.c',
         'source/lib/curl_multibyte.c',
-        'source/lib/curl_ntlm.c',
         'source/lib/curl_ntlm_core.c',
-        'source/lib/curl_ntlm_msgs.c',
         'source/lib/curl_ntlm_wb.c',
         'source/lib/curl_rtmp.c',
         'source/lib/curl_sasl.c',
@@ -137,21 +137,23 @@
         'source/lib/hmac.c',
         'source/lib/hostasyn.c',
         'source/lib/hostcheck.c',
-        'source/lib/hostip.c',
         'source/lib/hostip4.c',
         'source/lib/hostip6.c',
+        'source/lib/hostip.c',
         'source/lib/hostsyn.c',
+        'source/lib/http2.c',
         'source/lib/http.c',
         'source/lib/http_chunks.c',
         'source/lib/http_digest.c',
         'source/lib/http_negotiate.c',
-        'source/lib/http_negotiate_sspi.c',
+        'source/lib/http_ntlm.c',
         'source/lib/http_proxy.c',
         'source/lib/idn_win32.c',
         'source/lib/if2ip.c',
         'source/lib/imap.c',
         'source/lib/inet_ntop.c',
         'source/lib/inet_pton.c',
+        'source/lib/krb5.c',
         'source/lib/ldap.c',
         'source/lib/llist.c',
         'source/lib/md4.c',
@@ -162,6 +164,8 @@
         'source/lib/netrc.c',
         'source/lib/non-ascii.c',
         'source/lib/nonblock.c',
+        'source/lib/nwlib.c',
+        'source/lib/nwos.c',
         'source/lib/openldap.c',
         'source/lib/parsedate.c',
         'source/lib/pingpong.c',
@@ -175,6 +179,7 @@
         'source/lib/sendf.c',
         'source/lib/share.c',
         'source/lib/slist.c',
+        'source/lib/smb.c',
         'source/lib/smtp.c',
         'source/lib/socks.c',
         'source/lib/socks_gssapi.c',
@@ -187,17 +192,31 @@
         'source/lib/strerror.c',
         'source/lib/strtok.c',
         'source/lib/strtoofft.c',
+        'source/lib/system_win32.c',
         'source/lib/telnet.c',
         'source/lib/tftp.c',
         'source/lib/timeval.c',
         'source/lib/transfer.c',
         'source/lib/url.c',
+        'source/lib/vauth/cleartext.c',
+        'source/lib/vauth/cram.c',
+        'source/lib/vauth/digest.c',
+        'source/lib/vauth/digest_sspi.c',
+        'source/lib/vauth/krb5_gssapi.c',
+        'source/lib/vauth/krb5_sspi.c',
+        'source/lib/vauth/ntlm.c',
+        'source/lib/vauth/ntlm_sspi.c',
+        'source/lib/vauth/oauth2.c',
+        'source/lib/vauth/spnego_gssapi.c',
+        'source/lib/vauth/spnego_sspi.c',
+        'source/lib/vauth/vauth.c',
         'source/lib/version.c',
         'source/lib/vtls/axtls.c',
         'source/lib/vtls/cyassl.c',
         'source/lib/vtls/darwinssl.c',
         'source/lib/vtls/gskit.c',
         'source/lib/vtls/gtls.c',
+        'source/lib/vtls/mbedtls.c',
         'source/lib/vtls/nss.c',
         'source/lib/vtls/openssl.c',
         'source/lib/vtls/polarssl.c',
@@ -206,6 +225,7 @@
         'source/lib/vtls/vtls.c',
         'source/lib/warnless.c',
         'source/lib/wildcard.c',
+        'source/lib/x509asn1.c',
       ],
     },
     {