diff --git a/DEPS b/DEPS index a11b86667c..5500080e0a 100644 --- a/DEPS +++ b/DEPS @@ -36,7 +36,7 @@ deps = { # Make sure the version matches the one in # src/packager/third_party/boringssl, which contains perl generated files. "src/packager/third_party/boringssl/src": - Var("github") + "/google/boringssl@3cab5572b1fcf5a8f6018529dc30dc8d21b2a4bd", + Var("github") + "/google/boringssl@fc9c67599d9bdeb2e0467085133b81a8e28f77a4", "src/packager/third_party/curl/source": Var("github") + "/curl/curl@79e63a53bb9598af863b0afe49ad662795faeef4", #7.50.0 diff --git a/packager/third_party/boringssl/BUILD.generated.gni b/packager/third_party/boringssl/BUILD.generated.gni index cdbe0c95ab..f59afac5c3 100644 --- a/packager/third_party/boringssl/BUILD.generated.gni +++ b/packager/third_party/boringssl/BUILD.generated.gni @@ -6,11 +6,8 @@ crypto_sources = [ "err_data.c", - "src/crypto/aes/aes.c", - "src/crypto/aes/mode_wrappers.c", "src/crypto/asn1/a_bitstr.c", "src/crypto/asn1/a_bool.c", - "src/crypto/asn1/a_bytes.c", "src/crypto/asn1/a_d2i_fp.c", "src/crypto/asn1/a_dup.c", "src/crypto/asn1/a_enum.c", @@ -27,135 +24,116 @@ crypto_sources = [ "src/crypto/asn1/a_utctm.c", "src/crypto/asn1/a_utf8.c", "src/crypto/asn1/asn1_lib.c", + "src/crypto/asn1/asn1_locl.h", "src/crypto/asn1/asn1_par.c", "src/crypto/asn1/asn_pack.c", "src/crypto/asn1/f_enum.c", "src/crypto/asn1/f_int.c", "src/crypto/asn1/f_string.c", - "src/crypto/asn1/t_bitst.c", "src/crypto/asn1/tasn_dec.c", "src/crypto/asn1/tasn_enc.c", "src/crypto/asn1/tasn_fre.c", "src/crypto/asn1/tasn_new.c", "src/crypto/asn1/tasn_typ.c", "src/crypto/asn1/tasn_utl.c", - "src/crypto/asn1/x_bignum.c", - "src/crypto/asn1/x_long.c", + "src/crypto/asn1/time_support.c", "src/crypto/base64/base64.c", "src/crypto/bio/bio.c", "src/crypto/bio/bio_mem.c", - "src/crypto/bio/buffer.c", "src/crypto/bio/connect.c", "src/crypto/bio/fd.c", "src/crypto/bio/file.c", "src/crypto/bio/hexdump.c", + "src/crypto/bio/internal.h", "src/crypto/bio/pair.c", "src/crypto/bio/printf.c", "src/crypto/bio/socket.c", "src/crypto/bio/socket_helper.c", - "src/crypto/bn/add.c", - "src/crypto/bn/asm/x86_64-gcc.c", - "src/crypto/bn/bn.c", - "src/crypto/bn/bn_asn1.c", - "src/crypto/bn/cmp.c", - "src/crypto/bn/convert.c", - "src/crypto/bn/ctx.c", - "src/crypto/bn/div.c", - "src/crypto/bn/exponentiation.c", - "src/crypto/bn/gcd.c", - "src/crypto/bn/generic.c", - "src/crypto/bn/kronecker.c", - "src/crypto/bn/montgomery.c", - "src/crypto/bn/mul.c", - "src/crypto/bn/prime.c", - "src/crypto/bn/random.c", - "src/crypto/bn/rsaz_exp.c", - "src/crypto/bn/shift.c", - "src/crypto/bn/sqrt.c", + "src/crypto/bn_extra/bn_asn1.c", + "src/crypto/bn_extra/convert.c", "src/crypto/buf/buf.c", "src/crypto/bytestring/asn1_compat.c", "src/crypto/bytestring/ber.c", "src/crypto/bytestring/cbb.c", "src/crypto/bytestring/cbs.c", + "src/crypto/bytestring/internal.h", "src/crypto/chacha/chacha.c", - "src/crypto/cipher/aead.c", - "src/crypto/cipher/cipher.c", - "src/crypto/cipher/derive_key.c", - "src/crypto/cipher/e_aes.c", - "src/crypto/cipher/e_chacha20poly1305.c", - "src/crypto/cipher/e_des.c", - "src/crypto/cipher/e_null.c", - "src/crypto/cipher/e_rc2.c", - "src/crypto/cipher/e_rc4.c", - "src/crypto/cipher/e_ssl3.c", - "src/crypto/cipher/e_tls.c", - "src/crypto/cipher/tls_cbc.c", + "src/crypto/cipher_extra/cipher_extra.c", + "src/crypto/cipher_extra/derive_key.c", + "src/crypto/cipher_extra/e_aesctrhmac.c", + "src/crypto/cipher_extra/e_aesgcmsiv.c", + "src/crypto/cipher_extra/e_chacha20poly1305.c", + 
"src/crypto/cipher_extra/e_null.c", + "src/crypto/cipher_extra/e_rc2.c", + "src/crypto/cipher_extra/e_rc4.c", + "src/crypto/cipher_extra/e_ssl3.c", + "src/crypto/cipher_extra/e_tls.c", + "src/crypto/cipher_extra/internal.h", + "src/crypto/cipher_extra/tls_cbc.c", "src/crypto/cmac/cmac.c", "src/crypto/conf/conf.c", + "src/crypto/conf/conf_def.h", + "src/crypto/conf/internal.h", "src/crypto/cpu-aarch64-linux.c", "src/crypto/cpu-arm-linux.c", "src/crypto/cpu-arm.c", "src/crypto/cpu-intel.c", + "src/crypto/cpu-ppc64le.c", "src/crypto/crypto.c", - "src/crypto/curve25519/curve25519.c", "src/crypto/curve25519/spake25519.c", "src/crypto/curve25519/x25519-x86_64.c", - "src/crypto/des/des.c", "src/crypto/dh/check.c", "src/crypto/dh/dh.c", "src/crypto/dh/dh_asn1.c", "src/crypto/dh/params.c", - "src/crypto/digest/digest.c", - "src/crypto/digest/digests.c", + "src/crypto/digest_extra/digest_extra.c", "src/crypto/dsa/dsa.c", "src/crypto/dsa/dsa_asn1.c", - "src/crypto/ec/ec.c", - "src/crypto/ec/ec_asn1.c", - "src/crypto/ec/ec_key.c", - "src/crypto/ec/ec_montgomery.c", - "src/crypto/ec/oct.c", - "src/crypto/ec/p224-64.c", - "src/crypto/ec/p256-64.c", - "src/crypto/ec/p256-x86_64.c", - "src/crypto/ec/simple.c", - "src/crypto/ec/util-64.c", - "src/crypto/ec/wnaf.c", + "src/crypto/ec_extra/ec_asn1.c", "src/crypto/ecdh/ecdh.c", - "src/crypto/ecdsa/ecdsa.c", - "src/crypto/ecdsa/ecdsa_asn1.c", + "src/crypto/ecdsa_extra/ecdsa_asn1.c", "src/crypto/engine/engine.c", "src/crypto/err/err.c", + "src/crypto/err/internal.h", "src/crypto/evp/digestsign.c", "src/crypto/evp/evp.c", "src/crypto/evp/evp_asn1.c", "src/crypto/evp/evp_ctx.c", + "src/crypto/evp/internal.h", "src/crypto/evp/p_dsa_asn1.c", "src/crypto/evp/p_ec.c", "src/crypto/evp/p_ec_asn1.c", + "src/crypto/evp/p_ed25519.c", + "src/crypto/evp/p_ed25519_asn1.c", "src/crypto/evp/p_rsa.c", "src/crypto/evp/p_rsa_asn1.c", "src/crypto/evp/pbkdf.c", "src/crypto/evp/print.c", + "src/crypto/evp/scrypt.c", "src/crypto/evp/sign.c", "src/crypto/ex_data.c", + "src/crypto/fipsmodule/aes/internal.h", + "src/crypto/fipsmodule/bcm.c", + "src/crypto/fipsmodule/bn/internal.h", + "src/crypto/fipsmodule/bn/rsaz_exp.h", + "src/crypto/fipsmodule/cipher/internal.h", + "src/crypto/fipsmodule/delocate.h", + "src/crypto/fipsmodule/des/internal.h", + "src/crypto/fipsmodule/digest/internal.h", + "src/crypto/fipsmodule/digest/md32_common.h", + "src/crypto/fipsmodule/ec/internal.h", + "src/crypto/fipsmodule/ec/p256-x86_64-table.h", + "src/crypto/fipsmodule/ec/p256-x86_64.h", + "src/crypto/fipsmodule/is_fips.c", + "src/crypto/fipsmodule/modes/internal.h", + "src/crypto/fipsmodule/rand/internal.h", + "src/crypto/fipsmodule/rsa/internal.h", "src/crypto/hkdf/hkdf.c", - "src/crypto/hmac/hmac.c", + "src/crypto/internal.h", "src/crypto/lhash/lhash.c", - "src/crypto/md4/md4.c", - "src/crypto/md5/md5.c", "src/crypto/mem.c", - "src/crypto/modes/cbc.c", - "src/crypto/modes/cfb.c", - "src/crypto/modes/ctr.c", - "src/crypto/modes/gcm.c", - "src/crypto/modes/ofb.c", - "src/crypto/newhope/error_correction.c", - "src/crypto/newhope/newhope.c", - "src/crypto/newhope/ntt.c", - "src/crypto/newhope/poly.c", - "src/crypto/newhope/precomp.c", - "src/crypto/newhope/reduce.c", "src/crypto/obj/obj.c", + "src/crypto/obj/obj_dat.h", "src/crypto/obj/obj_xref.c", "src/crypto/pem/pem_all.c", "src/crypto/pem/pem_info.c", @@ -165,34 +143,33 @@ crypto_sources = [ "src/crypto/pem/pem_pkey.c", "src/crypto/pem/pem_x509.c", "src/crypto/pem/pem_xaux.c", - "src/crypto/pkcs8/p5_pbe.c", + "src/crypto/pkcs7/internal.h", + 
"src/crypto/pkcs7/pkcs7.c", + "src/crypto/pkcs7/pkcs7_x509.c", + "src/crypto/pkcs8/internal.h", "src/crypto/pkcs8/p5_pbev2.c", - "src/crypto/pkcs8/p8_pkey.c", "src/crypto/pkcs8/pkcs8.c", + "src/crypto/pkcs8/pkcs8_x509.c", + "src/crypto/poly1305/internal.h", "src/crypto/poly1305/poly1305.c", "src/crypto/poly1305/poly1305_arm.c", "src/crypto/poly1305/poly1305_vec.c", - "src/crypto/rand/deterministic.c", - "src/crypto/rand/rand.c", - "src/crypto/rand/urandom.c", - "src/crypto/rand/windows.c", + "src/crypto/pool/internal.h", + "src/crypto/pool/pool.c", + "src/crypto/rand_extra/deterministic.c", + "src/crypto/rand_extra/forkunsafe.c", + "src/crypto/rand_extra/fuchsia.c", + "src/crypto/rand_extra/rand_extra.c", + "src/crypto/rand_extra/windows.c", "src/crypto/rc4/rc4.c", "src/crypto/refcount_c11.c", "src/crypto/refcount_lock.c", - "src/crypto/rsa/blinding.c", - "src/crypto/rsa/padding.c", - "src/crypto/rsa/rsa.c", - "src/crypto/rsa/rsa_asn1.c", - "src/crypto/rsa/rsa_impl.c", - "src/crypto/sha/sha1.c", - "src/crypto/sha/sha256.c", - "src/crypto/sha/sha512.c", + "src/crypto/rsa_extra/rsa_asn1.c", "src/crypto/stack/stack.c", "src/crypto/thread.c", "src/crypto/thread_none.c", "src/crypto/thread_pthread.c", "src/crypto/thread_win.c", - "src/crypto/time_support.c", "src/crypto/x509/a_digest.c", "src/crypto/x509/a_sign.c", "src/crypto/x509/a_strex.c", @@ -201,13 +178,15 @@ crypto_sources = [ "src/crypto/x509/asn1_gen.c", "src/crypto/x509/by_dir.c", "src/crypto/x509/by_file.c", + "src/crypto/x509/charmap.h", "src/crypto/x509/i2d_pr.c", - "src/crypto/x509/pkcs7.c", + "src/crypto/x509/internal.h", "src/crypto/x509/rsa_pss.c", "src/crypto/x509/t_crl.c", "src/crypto/x509/t_req.c", "src/crypto/x509/t_x509.c", "src/crypto/x509/t_x509a.c", + "src/crypto/x509/vpm_int.h", "src/crypto/x509/x509.c", "src/crypto/x509/x509_att.c", "src/crypto/x509/x509_cmp.c", @@ -228,7 +207,6 @@ crypto_sources = [ "src/crypto/x509/x509name.c", "src/crypto/x509/x509rset.c", "src/crypto/x509/x509spki.c", - "src/crypto/x509/x509type.c", "src/crypto/x509/x_algor.c", "src/crypto/x509/x_all.c", "src/crypto/x509/x_attrib.c", @@ -244,8 +222,10 @@ crypto_sources = [ "src/crypto/x509/x_val.c", "src/crypto/x509/x_x509.c", "src/crypto/x509/x_x509a.c", + "src/crypto/x509v3/ext_dat.h", "src/crypto/x509v3/pcy_cache.c", "src/crypto/x509v3/pcy_data.c", + "src/crypto/x509v3/pcy_int.h", "src/crypto/x509v3/pcy_lib.c", "src/crypto/x509v3/pcy_map.c", "src/crypto/x509v3/pcy_node.c", @@ -276,184 +256,298 @@ crypto_sources = [ "src/crypto/x509v3/v3_skey.c", "src/crypto/x509v3/v3_sxnet.c", "src/crypto/x509v3/v3_utl.c", + "src/include/openssl/aead.h", + "src/include/openssl/aes.h", + "src/include/openssl/arm_arch.h", + "src/include/openssl/asn1.h", + "src/include/openssl/asn1_mac.h", + "src/include/openssl/asn1t.h", + "src/include/openssl/base.h", + "src/include/openssl/base64.h", + "src/include/openssl/bio.h", + "src/include/openssl/blowfish.h", + "src/include/openssl/bn.h", + "src/include/openssl/buf.h", + "src/include/openssl/buffer.h", + "src/include/openssl/bytestring.h", + "src/include/openssl/cast.h", + "src/include/openssl/chacha.h", + "src/include/openssl/cipher.h", + "src/include/openssl/cmac.h", + "src/include/openssl/conf.h", + "src/include/openssl/cpu.h", + "src/include/openssl/crypto.h", + "src/include/openssl/curve25519.h", + "src/include/openssl/des.h", + "src/include/openssl/dh.h", + "src/include/openssl/digest.h", + "src/include/openssl/dsa.h", + "src/include/openssl/ec.h", + "src/include/openssl/ec_key.h", + "src/include/openssl/ecdh.h", + 
"src/include/openssl/ecdsa.h", + "src/include/openssl/engine.h", + "src/include/openssl/err.h", + "src/include/openssl/evp.h", + "src/include/openssl/ex_data.h", + "src/include/openssl/hkdf.h", + "src/include/openssl/hmac.h", + "src/include/openssl/is_boringssl.h", + "src/include/openssl/lhash.h", + "src/include/openssl/lhash_macros.h", + "src/include/openssl/md4.h", + "src/include/openssl/md5.h", + "src/include/openssl/mem.h", + "src/include/openssl/nid.h", + "src/include/openssl/obj.h", + "src/include/openssl/obj_mac.h", + "src/include/openssl/objects.h", + "src/include/openssl/opensslconf.h", + "src/include/openssl/opensslv.h", + "src/include/openssl/ossl_typ.h", + "src/include/openssl/pem.h", + "src/include/openssl/pkcs12.h", + "src/include/openssl/pkcs7.h", + "src/include/openssl/pkcs8.h", + "src/include/openssl/poly1305.h", + "src/include/openssl/pool.h", + "src/include/openssl/rand.h", + "src/include/openssl/rc4.h", + "src/include/openssl/ripemd.h", + "src/include/openssl/rsa.h", + "src/include/openssl/safestack.h", + "src/include/openssl/sha.h", + "src/include/openssl/span.h", + "src/include/openssl/srtp.h", + "src/include/openssl/stack.h", + "src/include/openssl/thread.h", + "src/include/openssl/type_check.h", + "src/include/openssl/x509.h", + "src/include/openssl/x509_vfy.h", + "src/include/openssl/x509v3.h", + "src/third_party/fiat/curve25519.c", + "src/third_party/fiat/internal.h", ] ssl_sources = [ - "src/ssl/custom_extensions.c", - "src/ssl/d1_both.c", - "src/ssl/d1_lib.c", - "src/ssl/d1_meth.c", - "src/ssl/d1_pkt.c", - "src/ssl/d1_srtp.c", - "src/ssl/dtls_record.c", - "src/ssl/handshake_client.c", - "src/ssl/handshake_server.c", - "src/ssl/pqueue/pqueue.c", - "src/ssl/s3_both.c", - "src/ssl/s3_enc.c", - "src/ssl/s3_lib.c", - "src/ssl/s3_meth.c", - "src/ssl/s3_pkt.c", - "src/ssl/ssl_aead_ctx.c", - "src/ssl/ssl_asn1.c", - "src/ssl/ssl_buffer.c", - "src/ssl/ssl_cert.c", - "src/ssl/ssl_cipher.c", - "src/ssl/ssl_ecdh.c", - "src/ssl/ssl_file.c", - "src/ssl/ssl_lib.c", - "src/ssl/ssl_rsa.c", - "src/ssl/ssl_session.c", - "src/ssl/ssl_stat.c", - "src/ssl/t1_enc.c", - "src/ssl/t1_lib.c", - "src/ssl/tls_record.c", + "src/include/openssl/dtls1.h", + "src/include/openssl/ssl.h", + "src/include/openssl/ssl3.h", + "src/include/openssl/tls1.h", + "src/ssl/bio_ssl.cc", + "src/ssl/custom_extensions.cc", + "src/ssl/d1_both.cc", + "src/ssl/d1_lib.cc", + "src/ssl/d1_pkt.cc", + "src/ssl/d1_srtp.cc", + "src/ssl/dtls_method.cc", + "src/ssl/dtls_record.cc", + "src/ssl/handshake.cc", + "src/ssl/handshake_client.cc", + "src/ssl/handshake_server.cc", + "src/ssl/internal.h", + "src/ssl/s3_both.cc", + "src/ssl/s3_lib.cc", + "src/ssl/s3_pkt.cc", + "src/ssl/ssl_aead_ctx.cc", + "src/ssl/ssl_asn1.cc", + "src/ssl/ssl_buffer.cc", + "src/ssl/ssl_cert.cc", + "src/ssl/ssl_cipher.cc", + "src/ssl/ssl_file.cc", + "src/ssl/ssl_key_share.cc", + "src/ssl/ssl_lib.cc", + "src/ssl/ssl_privkey.cc", + "src/ssl/ssl_session.cc", + "src/ssl/ssl_stat.cc", + "src/ssl/ssl_transcript.cc", + "src/ssl/ssl_versions.cc", + "src/ssl/ssl_x509.cc", + "src/ssl/t1_enc.cc", + "src/ssl/t1_lib.cc", + "src/ssl/tls13_both.cc", + "src/ssl/tls13_client.cc", + "src/ssl/tls13_enc.cc", + "src/ssl/tls13_server.cc", + "src/ssl/tls_method.cc", + "src/ssl/tls_record.cc", +] + +crypto_sources_ios_aarch64 = [ + "ios-aarch64/crypto/chacha/chacha-armv8.S", + "ios-aarch64/crypto/fipsmodule/aesv8-armx64.S", + "ios-aarch64/crypto/fipsmodule/armv8-mont.S", + "ios-aarch64/crypto/fipsmodule/ghashv8-armx64.S", + "ios-aarch64/crypto/fipsmodule/sha1-armv8.S", + 
"ios-aarch64/crypto/fipsmodule/sha256-armv8.S", + "ios-aarch64/crypto/fipsmodule/sha512-armv8.S", +] + +crypto_sources_ios_arm = [ + "ios-arm/crypto/chacha/chacha-armv4.S", + "ios-arm/crypto/fipsmodule/aes-armv4.S", + "ios-arm/crypto/fipsmodule/aesv8-armx32.S", + "ios-arm/crypto/fipsmodule/armv4-mont.S", + "ios-arm/crypto/fipsmodule/bsaes-armv7.S", + "ios-arm/crypto/fipsmodule/ghash-armv4.S", + "ios-arm/crypto/fipsmodule/ghashv8-armx32.S", + "ios-arm/crypto/fipsmodule/sha1-armv4-large.S", + "ios-arm/crypto/fipsmodule/sha256-armv4.S", + "ios-arm/crypto/fipsmodule/sha512-armv4.S", ] crypto_sources_linux_aarch64 = [ - "linux-aarch64/crypto/aes/aesv8-armx64.S", - "linux-aarch64/crypto/bn/armv8-mont.S", "linux-aarch64/crypto/chacha/chacha-armv8.S", - "linux-aarch64/crypto/modes/ghashv8-armx64.S", - "linux-aarch64/crypto/sha/sha1-armv8.S", - "linux-aarch64/crypto/sha/sha256-armv8.S", - "linux-aarch64/crypto/sha/sha512-armv8.S", + "linux-aarch64/crypto/fipsmodule/aesv8-armx64.S", + "linux-aarch64/crypto/fipsmodule/armv8-mont.S", + "linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S", + "linux-aarch64/crypto/fipsmodule/sha1-armv8.S", + "linux-aarch64/crypto/fipsmodule/sha256-armv8.S", + "linux-aarch64/crypto/fipsmodule/sha512-armv8.S", ] crypto_sources_linux_arm = [ - "linux-arm/crypto/aes/aes-armv4.S", - "linux-arm/crypto/aes/aesv8-armx32.S", - "linux-arm/crypto/aes/bsaes-armv7.S", - "linux-arm/crypto/bn/armv4-mont.S", "linux-arm/crypto/chacha/chacha-armv4.S", - "linux-arm/crypto/modes/ghash-armv4.S", - "linux-arm/crypto/modes/ghashv8-armx32.S", - "linux-arm/crypto/sha/sha1-armv4-large.S", - "linux-arm/crypto/sha/sha256-armv4.S", - "linux-arm/crypto/sha/sha512-armv4.S", + "linux-arm/crypto/fipsmodule/aes-armv4.S", + "linux-arm/crypto/fipsmodule/aesv8-armx32.S", + "linux-arm/crypto/fipsmodule/armv4-mont.S", + "linux-arm/crypto/fipsmodule/bsaes-armv7.S", + "linux-arm/crypto/fipsmodule/ghash-armv4.S", + "linux-arm/crypto/fipsmodule/ghashv8-armx32.S", + "linux-arm/crypto/fipsmodule/sha1-armv4-large.S", + "linux-arm/crypto/fipsmodule/sha256-armv4.S", + "linux-arm/crypto/fipsmodule/sha512-armv4.S", "src/crypto/curve25519/asm/x25519-asm-arm.S", "src/crypto/poly1305/poly1305_arm_asm.S", ] +crypto_sources_linux_ppc64le = [ + "linux-ppc64le/crypto/fipsmodule/aesp8-ppc.S", + "linux-ppc64le/crypto/fipsmodule/ghashp8-ppc.S", +] + crypto_sources_linux_x86 = [ - "linux-x86/crypto/aes/aes-586.S", - "linux-x86/crypto/aes/aesni-x86.S", - "linux-x86/crypto/aes/vpaes-x86.S", - "linux-x86/crypto/bn/bn-586.S", - "linux-x86/crypto/bn/co-586.S", - "linux-x86/crypto/bn/x86-mont.S", "linux-x86/crypto/chacha/chacha-x86.S", - "linux-x86/crypto/md5/md5-586.S", - "linux-x86/crypto/modes/ghash-x86.S", - "linux-x86/crypto/rc4/rc4-586.S", - "linux-x86/crypto/sha/sha1-586.S", - "linux-x86/crypto/sha/sha256-586.S", - "linux-x86/crypto/sha/sha512-586.S", + "linux-x86/crypto/fipsmodule/aes-586.S", + "linux-x86/crypto/fipsmodule/aesni-x86.S", + "linux-x86/crypto/fipsmodule/bn-586.S", + "linux-x86/crypto/fipsmodule/co-586.S", + "linux-x86/crypto/fipsmodule/ghash-x86.S", + "linux-x86/crypto/fipsmodule/md5-586.S", + "linux-x86/crypto/fipsmodule/sha1-586.S", + "linux-x86/crypto/fipsmodule/sha256-586.S", + "linux-x86/crypto/fipsmodule/sha512-586.S", + "linux-x86/crypto/fipsmodule/vpaes-x86.S", + "linux-x86/crypto/fipsmodule/x86-mont.S", ] crypto_sources_linux_x86_64 = [ - "linux-x86_64/crypto/aes/aes-x86_64.S", - "linux-x86_64/crypto/aes/aesni-x86_64.S", - "linux-x86_64/crypto/aes/bsaes-x86_64.S", - "linux-x86_64/crypto/aes/vpaes-x86_64.S", 
- "linux-x86_64/crypto/bn/rsaz-avx2.S", - "linux-x86_64/crypto/bn/rsaz-x86_64.S", - "linux-x86_64/crypto/bn/x86_64-mont.S", - "linux-x86_64/crypto/bn/x86_64-mont5.S", "linux-x86_64/crypto/chacha/chacha-x86_64.S", - "linux-x86_64/crypto/ec/p256-x86_64-asm.S", - "linux-x86_64/crypto/md5/md5-x86_64.S", - "linux-x86_64/crypto/modes/aesni-gcm-x86_64.S", - "linux-x86_64/crypto/modes/ghash-x86_64.S", - "linux-x86_64/crypto/rand/rdrand-x86_64.S", - "linux-x86_64/crypto/rc4/rc4-x86_64.S", - "linux-x86_64/crypto/sha/sha1-x86_64.S", - "linux-x86_64/crypto/sha/sha256-x86_64.S", - "linux-x86_64/crypto/sha/sha512-x86_64.S", + "linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S", + "linux-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S", + "linux-x86_64/crypto/fipsmodule/aes-x86_64.S", + "linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S", + "linux-x86_64/crypto/fipsmodule/aesni-x86_64.S", + "linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S", + "linux-x86_64/crypto/fipsmodule/ghash-x86_64.S", + "linux-x86_64/crypto/fipsmodule/md5-x86_64.S", + "linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S", + "linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S", + "linux-x86_64/crypto/fipsmodule/rsaz-avx2.S", + "linux-x86_64/crypto/fipsmodule/sha1-x86_64.S", + "linux-x86_64/crypto/fipsmodule/sha256-x86_64.S", + "linux-x86_64/crypto/fipsmodule/sha512-x86_64.S", + "linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S", + "linux-x86_64/crypto/fipsmodule/x86_64-mont.S", + "linux-x86_64/crypto/fipsmodule/x86_64-mont5.S", "src/crypto/curve25519/asm/x25519-asm-x86_64.S", ] crypto_sources_mac_x86 = [ - "mac-x86/crypto/aes/aes-586.S", - "mac-x86/crypto/aes/aesni-x86.S", - "mac-x86/crypto/aes/vpaes-x86.S", - "mac-x86/crypto/bn/bn-586.S", - "mac-x86/crypto/bn/co-586.S", - "mac-x86/crypto/bn/x86-mont.S", "mac-x86/crypto/chacha/chacha-x86.S", - "mac-x86/crypto/md5/md5-586.S", - "mac-x86/crypto/modes/ghash-x86.S", - "mac-x86/crypto/rc4/rc4-586.S", - "mac-x86/crypto/sha/sha1-586.S", - "mac-x86/crypto/sha/sha256-586.S", - "mac-x86/crypto/sha/sha512-586.S", + "mac-x86/crypto/fipsmodule/aes-586.S", + "mac-x86/crypto/fipsmodule/aesni-x86.S", + "mac-x86/crypto/fipsmodule/bn-586.S", + "mac-x86/crypto/fipsmodule/co-586.S", + "mac-x86/crypto/fipsmodule/ghash-x86.S", + "mac-x86/crypto/fipsmodule/md5-586.S", + "mac-x86/crypto/fipsmodule/sha1-586.S", + "mac-x86/crypto/fipsmodule/sha256-586.S", + "mac-x86/crypto/fipsmodule/sha512-586.S", + "mac-x86/crypto/fipsmodule/vpaes-x86.S", + "mac-x86/crypto/fipsmodule/x86-mont.S", ] crypto_sources_mac_x86_64 = [ - "mac-x86_64/crypto/aes/aes-x86_64.S", - "mac-x86_64/crypto/aes/aesni-x86_64.S", - "mac-x86_64/crypto/aes/bsaes-x86_64.S", - "mac-x86_64/crypto/aes/vpaes-x86_64.S", - "mac-x86_64/crypto/bn/rsaz-avx2.S", - "mac-x86_64/crypto/bn/rsaz-x86_64.S", - "mac-x86_64/crypto/bn/x86_64-mont.S", - "mac-x86_64/crypto/bn/x86_64-mont5.S", "mac-x86_64/crypto/chacha/chacha-x86_64.S", - "mac-x86_64/crypto/ec/p256-x86_64-asm.S", - "mac-x86_64/crypto/md5/md5-x86_64.S", - "mac-x86_64/crypto/modes/aesni-gcm-x86_64.S", - "mac-x86_64/crypto/modes/ghash-x86_64.S", - "mac-x86_64/crypto/rand/rdrand-x86_64.S", - "mac-x86_64/crypto/rc4/rc4-x86_64.S", - "mac-x86_64/crypto/sha/sha1-x86_64.S", - "mac-x86_64/crypto/sha/sha256-x86_64.S", - "mac-x86_64/crypto/sha/sha512-x86_64.S", + "mac-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S", + "mac-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S", + "mac-x86_64/crypto/fipsmodule/aes-x86_64.S", + "mac-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S", + 
"mac-x86_64/crypto/fipsmodule/aesni-x86_64.S", + "mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S", + "mac-x86_64/crypto/fipsmodule/ghash-x86_64.S", + "mac-x86_64/crypto/fipsmodule/md5-x86_64.S", + "mac-x86_64/crypto/fipsmodule/p256-x86_64-asm.S", + "mac-x86_64/crypto/fipsmodule/rdrand-x86_64.S", + "mac-x86_64/crypto/fipsmodule/rsaz-avx2.S", + "mac-x86_64/crypto/fipsmodule/sha1-x86_64.S", + "mac-x86_64/crypto/fipsmodule/sha256-x86_64.S", + "mac-x86_64/crypto/fipsmodule/sha512-x86_64.S", + "mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S", + "mac-x86_64/crypto/fipsmodule/x86_64-mont.S", + "mac-x86_64/crypto/fipsmodule/x86_64-mont5.S", "src/crypto/curve25519/asm/x25519-asm-x86_64.S", ] crypto_sources_win_x86 = [ - "win-x86/crypto/aes/aes-586.asm", - "win-x86/crypto/aes/aesni-x86.asm", - "win-x86/crypto/aes/vpaes-x86.asm", - "win-x86/crypto/bn/bn-586.asm", - "win-x86/crypto/bn/co-586.asm", - "win-x86/crypto/bn/x86-mont.asm", "win-x86/crypto/chacha/chacha-x86.asm", - "win-x86/crypto/md5/md5-586.asm", - "win-x86/crypto/modes/ghash-x86.asm", - "win-x86/crypto/rc4/rc4-586.asm", - "win-x86/crypto/sha/sha1-586.asm", - "win-x86/crypto/sha/sha256-586.asm", - "win-x86/crypto/sha/sha512-586.asm", + "win-x86/crypto/fipsmodule/aes-586.asm", + "win-x86/crypto/fipsmodule/aesni-x86.asm", + "win-x86/crypto/fipsmodule/bn-586.asm", + "win-x86/crypto/fipsmodule/co-586.asm", + "win-x86/crypto/fipsmodule/ghash-x86.asm", + "win-x86/crypto/fipsmodule/md5-586.asm", + "win-x86/crypto/fipsmodule/sha1-586.asm", + "win-x86/crypto/fipsmodule/sha256-586.asm", + "win-x86/crypto/fipsmodule/sha512-586.asm", + "win-x86/crypto/fipsmodule/vpaes-x86.asm", + "win-x86/crypto/fipsmodule/x86-mont.asm", ] crypto_sources_win_x86_64 = [ - "win-x86_64/crypto/aes/aes-x86_64.asm", - "win-x86_64/crypto/aes/aesni-x86_64.asm", - "win-x86_64/crypto/aes/bsaes-x86_64.asm", - "win-x86_64/crypto/aes/vpaes-x86_64.asm", - "win-x86_64/crypto/bn/rsaz-avx2.asm", - "win-x86_64/crypto/bn/rsaz-x86_64.asm", - "win-x86_64/crypto/bn/x86_64-mont.asm", - "win-x86_64/crypto/bn/x86_64-mont5.asm", "win-x86_64/crypto/chacha/chacha-x86_64.asm", - "win-x86_64/crypto/ec/p256-x86_64-asm.asm", - "win-x86_64/crypto/md5/md5-x86_64.asm", - "win-x86_64/crypto/modes/aesni-gcm-x86_64.asm", - "win-x86_64/crypto/modes/ghash-x86_64.asm", - "win-x86_64/crypto/rand/rdrand-x86_64.asm", - "win-x86_64/crypto/rc4/rc4-x86_64.asm", - "win-x86_64/crypto/sha/sha1-x86_64.asm", - "win-x86_64/crypto/sha/sha256-x86_64.asm", - "win-x86_64/crypto/sha/sha512-x86_64.asm", + "win-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.asm", + "win-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.asm", + "win-x86_64/crypto/fipsmodule/aes-x86_64.asm", + "win-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.asm", + "win-x86_64/crypto/fipsmodule/aesni-x86_64.asm", + "win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm", + "win-x86_64/crypto/fipsmodule/ghash-x86_64.asm", + "win-x86_64/crypto/fipsmodule/md5-x86_64.asm", + "win-x86_64/crypto/fipsmodule/p256-x86_64-asm.asm", + "win-x86_64/crypto/fipsmodule/rdrand-x86_64.asm", + "win-x86_64/crypto/fipsmodule/rsaz-avx2.asm", + "win-x86_64/crypto/fipsmodule/sha1-x86_64.asm", + "win-x86_64/crypto/fipsmodule/sha256-x86_64.asm", + "win-x86_64/crypto/fipsmodule/sha512-x86_64.asm", + "win-x86_64/crypto/fipsmodule/vpaes-x86_64.asm", + "win-x86_64/crypto/fipsmodule/x86_64-mont.asm", + "win-x86_64/crypto/fipsmodule/x86_64-mont5.asm", ] fuzzers = [ + "bn_div", + "bn_mod_exp", "cert", "client", + "dtls_client", + "dtls_server", "pkcs8", "privkey", "read_pem", "server", + "session", "spki", + 
"ssl_ctx_api", ] diff --git a/packager/third_party/boringssl/BUILD.generated_tests.gni b/packager/third_party/boringssl/BUILD.generated_tests.gni index 8b7ea3cc59..44d653d281 100644 --- a/packager/third_party/boringssl/BUILD.generated_tests.gni +++ b/packager/third_party/boringssl/BUILD.generated_tests.gni @@ -4,594 +4,73 @@ # This file is created by generate_build_files.py. Do not edit manually. -_test_support_sources = [ +test_support_sources = [ "src/crypto/test/file_test.cc", "src/crypto/test/file_test.h", + "src/crypto/test/gtest_main.h", "src/crypto/test/malloc.cc", - "src/crypto/test/scoped_types.h", "src/crypto/test/test_util.cc", "src/crypto/test/test_util.h", "src/ssl/test/async_bio.h", + "src/ssl/test/fuzzer.h", + "src/ssl/test/fuzzer_tags.h", "src/ssl/test/packeted_bio.h", - "src/ssl/test/scoped_types.h", "src/ssl/test/test_config.h", ] -template("create_tests") { - executable("boringssl_aes_test") { - sources = [ - "src/crypto/aes/aes_test.cc", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } +crypto_test_sources = [ + "crypto_test_data.cc", + "src/crypto/asn1/asn1_test.cc", + "src/crypto/base64/base64_test.cc", + "src/crypto/bio/bio_test.cc", + "src/crypto/buf/buf_test.cc", + "src/crypto/bytestring/bytestring_test.cc", + "src/crypto/chacha/chacha_test.cc", + "src/crypto/cipher_extra/aead_test.cc", + "src/crypto/cipher_extra/cipher_test.cc", + "src/crypto/cmac/cmac_test.cc", + "src/crypto/compiler_test.cc", + "src/crypto/constant_time_test.cc", + "src/crypto/curve25519/ed25519_test.cc", + "src/crypto/curve25519/spake25519_test.cc", + "src/crypto/curve25519/x25519_test.cc", + "src/crypto/dh/dh_test.cc", + "src/crypto/digest_extra/digest_test.cc", + "src/crypto/dsa/dsa_test.cc", + "src/crypto/ecdh/ecdh_test.cc", + "src/crypto/err/err_test.cc", + "src/crypto/evp/evp_extra_test.cc", + "src/crypto/evp/evp_test.cc", + "src/crypto/evp/pbkdf_test.cc", + "src/crypto/evp/scrypt_test.cc", + "src/crypto/fipsmodule/aes/aes_test.cc", + "src/crypto/fipsmodule/bn/bn_test.cc", + "src/crypto/fipsmodule/ec/ec_test.cc", + "src/crypto/fipsmodule/ec/p256-x86_64_test.cc", + "src/crypto/fipsmodule/ecdsa/ecdsa_test.cc", + "src/crypto/fipsmodule/modes/gcm_test.cc", + "src/crypto/fipsmodule/rand/ctrdrbg_test.cc", + "src/crypto/hkdf/hkdf_test.cc", + "src/crypto/hmac_extra/hmac_test.cc", + "src/crypto/lhash/lhash_test.cc", + "src/crypto/obj/obj_test.cc", + "src/crypto/pkcs7/pkcs7_test.cc", + "src/crypto/pkcs8/pkcs12_test.cc", + "src/crypto/pkcs8/pkcs8_test.cc", + "src/crypto/poly1305/poly1305_test.cc", + "src/crypto/pool/pool_test.cc", + "src/crypto/refcount_test.cc", + "src/crypto/rsa_extra/rsa_test.cc", + "src/crypto/test/file_test_gtest.cc", + "src/crypto/test/gtest_main.cc", + "src/crypto/thread_test.cc", + "src/crypto/x509/x509_test.cc", + "src/crypto/x509v3/tab_test.cc", + "src/crypto/x509v3/v3name_test.cc", +] - executable("boringssl_asn1_test") { - sources = [ - "src/crypto/asn1/asn1_test.cc", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_base64_test") { - sources = [ - "src/crypto/base64/base64_test.cc", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_bio_test") { 
- sources = [ - "src/crypto/bio/bio_test.cc", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_bn_test") { - sources = [ - "src/crypto/bn/bn_test.cc", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_bytestring_test") { - sources = [ - "src/crypto/bytestring/bytestring_test.cc", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_chacha_test") { - sources = [ - "src/crypto/chacha/chacha_test.cc", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_aead_test") { - sources = [ - "src/crypto/cipher/aead_test.cc", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_cipher_test") { - sources = [ - "src/crypto/cipher/cipher_test.cc", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_cmac_test") { - sources = [ - "src/crypto/cmac/cmac_test.cc", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_constant_time_test") { - sources = [ - "src/crypto/constant_time_test.c", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_ed25519_test") { - sources = [ - "src/crypto/curve25519/ed25519_test.cc", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_spake25519_test") { - sources = [ - "src/crypto/curve25519/spake25519_test.cc", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_x25519_test") { - sources = [ - "src/crypto/curve25519/x25519_test.cc", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_dh_test") { - sources = [ - "src/crypto/dh/dh_test.cc", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_digest_test") { - sources = [ - "src/crypto/digest/digest_test.cc", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_dsa_test") { - sources = [ - 
"src/crypto/dsa/dsa_test.c", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_ec_test") { - sources = [ - "src/crypto/ec/ec_test.cc", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_example_mul") { - sources = [ - "src/crypto/ec/example_mul.c", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_ecdsa_test") { - sources = [ - "src/crypto/ecdsa/ecdsa_test.cc", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_err_test") { - sources = [ - "src/crypto/err/err_test.cc", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_evp_extra_test") { - sources = [ - "src/crypto/evp/evp_extra_test.cc", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_evp_test") { - sources = [ - "src/crypto/evp/evp_test.cc", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_pbkdf_test") { - sources = [ - "src/crypto/evp/pbkdf_test.cc", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_hkdf_test") { - sources = [ - "src/crypto/hkdf/hkdf_test.c", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_hmac_test") { - sources = [ - "src/crypto/hmac/hmac_test.cc", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_lhash_test") { - sources = [ - "src/crypto/lhash/lhash_test.c", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_gcm_test") { - sources = [ - "src/crypto/modes/gcm_test.c", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_newhope_statistical_test") { - sources = [ - "src/crypto/newhope/newhope_statistical_test.cc", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_newhope_test") { - sources = [ - "src/crypto/newhope/newhope_test.cc", - ] - sources += 
_test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_newhope_vectors_test") { - sources = [ - "src/crypto/newhope/newhope_vectors_test.cc", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_obj_test") { - sources = [ - "src/crypto/obj/obj_test.cc", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_pkcs12_test") { - sources = [ - "src/crypto/pkcs8/pkcs12_test.cc", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_pkcs8_test") { - sources = [ - "src/crypto/pkcs8/pkcs8_test.cc", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_poly1305_test") { - sources = [ - "src/crypto/poly1305/poly1305_test.cc", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_refcount_test") { - sources = [ - "src/crypto/refcount_test.c", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_rsa_test") { - sources = [ - "src/crypto/rsa/rsa_test.cc", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_thread_test") { - sources = [ - "src/crypto/thread_test.c", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_pkcs7_test") { - sources = [ - "src/crypto/x509/pkcs7_test.c", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_x509_test") { - sources = [ - "src/crypto/x509/x509_test.cc", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_tab_test") { - sources = [ - "src/crypto/x509v3/tab_test.c", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_v3name_test") { - sources = [ - "src/crypto/x509v3/v3name_test.c", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_pqueue_test") { - sources = [ - "src/ssl/pqueue/pqueue_test.c", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { 
- configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - executable("boringssl_ssl_test") { - sources = [ - "src/ssl/ssl_test.cc", - ] - sources += _test_support_sources - if (defined(invoker.configs_exclude)) { - configs -= invoker.configs_exclude - } - configs += invoker.configs - deps = invoker.deps - } - - group(target_name) { - deps = [ - ":boringssl_aead_test", - ":boringssl_aes_test", - ":boringssl_asn1_test", - ":boringssl_base64_test", - ":boringssl_bio_test", - ":boringssl_bn_test", - ":boringssl_bytestring_test", - ":boringssl_chacha_test", - ":boringssl_cipher_test", - ":boringssl_cmac_test", - ":boringssl_constant_time_test", - ":boringssl_dh_test", - ":boringssl_digest_test", - ":boringssl_dsa_test", - ":boringssl_ec_test", - ":boringssl_ecdsa_test", - ":boringssl_ed25519_test", - ":boringssl_err_test", - ":boringssl_evp_extra_test", - ":boringssl_evp_test", - ":boringssl_example_mul", - ":boringssl_gcm_test", - ":boringssl_hkdf_test", - ":boringssl_hmac_test", - ":boringssl_lhash_test", - ":boringssl_newhope_statistical_test", - ":boringssl_newhope_test", - ":boringssl_newhope_vectors_test", - ":boringssl_obj_test", - ":boringssl_pbkdf_test", - ":boringssl_pkcs12_test", - ":boringssl_pkcs7_test", - ":boringssl_pkcs8_test", - ":boringssl_poly1305_test", - ":boringssl_pqueue_test", - ":boringssl_refcount_test", - ":boringssl_rsa_test", - ":boringssl_spake25519_test", - ":boringssl_ssl_test", - ":boringssl_tab_test", - ":boringssl_thread_test", - ":boringssl_v3name_test", - ":boringssl_x25519_test", - ":boringssl_x509_test", - ] - } -} +ssl_test_sources = [ + "src/crypto/test/gtest_main.cc", + "src/ssl/span_test.cc", + "src/ssl/ssl_test.cc", +] diff --git a/packager/third_party/boringssl/boringssl.gypi b/packager/third_party/boringssl/boringssl.gypi index 65a483e90e..17f7a6c5ab 100644 --- a/packager/third_party/boringssl/boringssl.gypi +++ b/packager/third_party/boringssl/boringssl.gypi @@ -7,43 +7,52 @@ { 'variables': { 'boringssl_ssl_sources': [ - 'src/ssl/custom_extensions.c', - 'src/ssl/d1_both.c', - 'src/ssl/d1_lib.c', - 'src/ssl/d1_meth.c', - 'src/ssl/d1_pkt.c', - 'src/ssl/d1_srtp.c', - 'src/ssl/dtls_record.c', - 'src/ssl/handshake_client.c', - 'src/ssl/handshake_server.c', - 'src/ssl/pqueue/pqueue.c', - 'src/ssl/s3_both.c', - 'src/ssl/s3_enc.c', - 'src/ssl/s3_lib.c', - 'src/ssl/s3_meth.c', - 'src/ssl/s3_pkt.c', - 'src/ssl/ssl_aead_ctx.c', - 'src/ssl/ssl_asn1.c', - 'src/ssl/ssl_buffer.c', - 'src/ssl/ssl_cert.c', - 'src/ssl/ssl_cipher.c', - 'src/ssl/ssl_ecdh.c', - 'src/ssl/ssl_file.c', - 'src/ssl/ssl_lib.c', - 'src/ssl/ssl_rsa.c', - 'src/ssl/ssl_session.c', - 'src/ssl/ssl_stat.c', - 'src/ssl/t1_enc.c', - 'src/ssl/t1_lib.c', - 'src/ssl/tls_record.c', + 'src/include/openssl/dtls1.h', + 'src/include/openssl/ssl.h', + 'src/include/openssl/ssl3.h', + 'src/include/openssl/tls1.h', + 'src/ssl/bio_ssl.cc', + 'src/ssl/custom_extensions.cc', + 'src/ssl/d1_both.cc', + 'src/ssl/d1_lib.cc', + 'src/ssl/d1_pkt.cc', + 'src/ssl/d1_srtp.cc', + 'src/ssl/dtls_method.cc', + 'src/ssl/dtls_record.cc', + 'src/ssl/handshake.cc', + 'src/ssl/handshake_client.cc', + 'src/ssl/handshake_server.cc', + 'src/ssl/internal.h', + 'src/ssl/s3_both.cc', + 'src/ssl/s3_lib.cc', + 'src/ssl/s3_pkt.cc', + 'src/ssl/ssl_aead_ctx.cc', + 'src/ssl/ssl_asn1.cc', + 'src/ssl/ssl_buffer.cc', + 'src/ssl/ssl_cert.cc', + 'src/ssl/ssl_cipher.cc', + 'src/ssl/ssl_file.cc', + 'src/ssl/ssl_key_share.cc', + 'src/ssl/ssl_lib.cc', + 'src/ssl/ssl_privkey.cc', + 'src/ssl/ssl_session.cc', + 
'src/ssl/ssl_stat.cc', + 'src/ssl/ssl_transcript.cc', + 'src/ssl/ssl_versions.cc', + 'src/ssl/ssl_x509.cc', + 'src/ssl/t1_enc.cc', + 'src/ssl/t1_lib.cc', + 'src/ssl/tls13_both.cc', + 'src/ssl/tls13_client.cc', + 'src/ssl/tls13_enc.cc', + 'src/ssl/tls13_server.cc', + 'src/ssl/tls_method.cc', + 'src/ssl/tls_record.cc', ], 'boringssl_crypto_sources': [ 'err_data.c', - 'src/crypto/aes/aes.c', - 'src/crypto/aes/mode_wrappers.c', 'src/crypto/asn1/a_bitstr.c', 'src/crypto/asn1/a_bool.c', - 'src/crypto/asn1/a_bytes.c', 'src/crypto/asn1/a_d2i_fp.c', 'src/crypto/asn1/a_dup.c', 'src/crypto/asn1/a_enum.c', @@ -60,135 +69,116 @@ 'src/crypto/asn1/a_utctm.c', 'src/crypto/asn1/a_utf8.c', 'src/crypto/asn1/asn1_lib.c', + 'src/crypto/asn1/asn1_locl.h', 'src/crypto/asn1/asn1_par.c', 'src/crypto/asn1/asn_pack.c', 'src/crypto/asn1/f_enum.c', 'src/crypto/asn1/f_int.c', 'src/crypto/asn1/f_string.c', - 'src/crypto/asn1/t_bitst.c', 'src/crypto/asn1/tasn_dec.c', 'src/crypto/asn1/tasn_enc.c', 'src/crypto/asn1/tasn_fre.c', 'src/crypto/asn1/tasn_new.c', 'src/crypto/asn1/tasn_typ.c', 'src/crypto/asn1/tasn_utl.c', - 'src/crypto/asn1/x_bignum.c', - 'src/crypto/asn1/x_long.c', + 'src/crypto/asn1/time_support.c', 'src/crypto/base64/base64.c', 'src/crypto/bio/bio.c', 'src/crypto/bio/bio_mem.c', - 'src/crypto/bio/buffer.c', 'src/crypto/bio/connect.c', 'src/crypto/bio/fd.c', 'src/crypto/bio/file.c', 'src/crypto/bio/hexdump.c', + 'src/crypto/bio/internal.h', 'src/crypto/bio/pair.c', 'src/crypto/bio/printf.c', 'src/crypto/bio/socket.c', 'src/crypto/bio/socket_helper.c', - 'src/crypto/bn/add.c', - 'src/crypto/bn/asm/x86_64-gcc.c', - 'src/crypto/bn/bn.c', - 'src/crypto/bn/bn_asn1.c', - 'src/crypto/bn/cmp.c', - 'src/crypto/bn/convert.c', - 'src/crypto/bn/ctx.c', - 'src/crypto/bn/div.c', - 'src/crypto/bn/exponentiation.c', - 'src/crypto/bn/gcd.c', - 'src/crypto/bn/generic.c', - 'src/crypto/bn/kronecker.c', - 'src/crypto/bn/montgomery.c', - 'src/crypto/bn/mul.c', - 'src/crypto/bn/prime.c', - 'src/crypto/bn/random.c', - 'src/crypto/bn/rsaz_exp.c', - 'src/crypto/bn/shift.c', - 'src/crypto/bn/sqrt.c', + 'src/crypto/bn_extra/bn_asn1.c', + 'src/crypto/bn_extra/convert.c', 'src/crypto/buf/buf.c', 'src/crypto/bytestring/asn1_compat.c', 'src/crypto/bytestring/ber.c', 'src/crypto/bytestring/cbb.c', 'src/crypto/bytestring/cbs.c', + 'src/crypto/bytestring/internal.h', 'src/crypto/chacha/chacha.c', - 'src/crypto/cipher/aead.c', - 'src/crypto/cipher/cipher.c', - 'src/crypto/cipher/derive_key.c', - 'src/crypto/cipher/e_aes.c', - 'src/crypto/cipher/e_chacha20poly1305.c', - 'src/crypto/cipher/e_des.c', - 'src/crypto/cipher/e_null.c', - 'src/crypto/cipher/e_rc2.c', - 'src/crypto/cipher/e_rc4.c', - 'src/crypto/cipher/e_ssl3.c', - 'src/crypto/cipher/e_tls.c', - 'src/crypto/cipher/tls_cbc.c', + 'src/crypto/cipher_extra/cipher_extra.c', + 'src/crypto/cipher_extra/derive_key.c', + 'src/crypto/cipher_extra/e_aesctrhmac.c', + 'src/crypto/cipher_extra/e_aesgcmsiv.c', + 'src/crypto/cipher_extra/e_chacha20poly1305.c', + 'src/crypto/cipher_extra/e_null.c', + 'src/crypto/cipher_extra/e_rc2.c', + 'src/crypto/cipher_extra/e_rc4.c', + 'src/crypto/cipher_extra/e_ssl3.c', + 'src/crypto/cipher_extra/e_tls.c', + 'src/crypto/cipher_extra/internal.h', + 'src/crypto/cipher_extra/tls_cbc.c', 'src/crypto/cmac/cmac.c', 'src/crypto/conf/conf.c', + 'src/crypto/conf/conf_def.h', + 'src/crypto/conf/internal.h', 'src/crypto/cpu-aarch64-linux.c', 'src/crypto/cpu-arm-linux.c', 'src/crypto/cpu-arm.c', 'src/crypto/cpu-intel.c', + 'src/crypto/cpu-ppc64le.c', 'src/crypto/crypto.c', - 
'src/crypto/curve25519/curve25519.c', 'src/crypto/curve25519/spake25519.c', 'src/crypto/curve25519/x25519-x86_64.c', - 'src/crypto/des/des.c', 'src/crypto/dh/check.c', 'src/crypto/dh/dh.c', 'src/crypto/dh/dh_asn1.c', 'src/crypto/dh/params.c', - 'src/crypto/digest/digest.c', - 'src/crypto/digest/digests.c', + 'src/crypto/digest_extra/digest_extra.c', 'src/crypto/dsa/dsa.c', 'src/crypto/dsa/dsa_asn1.c', - 'src/crypto/ec/ec.c', - 'src/crypto/ec/ec_asn1.c', - 'src/crypto/ec/ec_key.c', - 'src/crypto/ec/ec_montgomery.c', - 'src/crypto/ec/oct.c', - 'src/crypto/ec/p224-64.c', - 'src/crypto/ec/p256-64.c', - 'src/crypto/ec/p256-x86_64.c', - 'src/crypto/ec/simple.c', - 'src/crypto/ec/util-64.c', - 'src/crypto/ec/wnaf.c', + 'src/crypto/ec_extra/ec_asn1.c', 'src/crypto/ecdh/ecdh.c', - 'src/crypto/ecdsa/ecdsa.c', - 'src/crypto/ecdsa/ecdsa_asn1.c', + 'src/crypto/ecdsa_extra/ecdsa_asn1.c', 'src/crypto/engine/engine.c', 'src/crypto/err/err.c', + 'src/crypto/err/internal.h', 'src/crypto/evp/digestsign.c', 'src/crypto/evp/evp.c', 'src/crypto/evp/evp_asn1.c', 'src/crypto/evp/evp_ctx.c', + 'src/crypto/evp/internal.h', 'src/crypto/evp/p_dsa_asn1.c', 'src/crypto/evp/p_ec.c', 'src/crypto/evp/p_ec_asn1.c', + 'src/crypto/evp/p_ed25519.c', + 'src/crypto/evp/p_ed25519_asn1.c', 'src/crypto/evp/p_rsa.c', 'src/crypto/evp/p_rsa_asn1.c', 'src/crypto/evp/pbkdf.c', 'src/crypto/evp/print.c', + 'src/crypto/evp/scrypt.c', 'src/crypto/evp/sign.c', 'src/crypto/ex_data.c', + 'src/crypto/fipsmodule/aes/internal.h', + 'src/crypto/fipsmodule/bcm.c', + 'src/crypto/fipsmodule/bn/internal.h', + 'src/crypto/fipsmodule/bn/rsaz_exp.h', + 'src/crypto/fipsmodule/cipher/internal.h', + 'src/crypto/fipsmodule/delocate.h', + 'src/crypto/fipsmodule/des/internal.h', + 'src/crypto/fipsmodule/digest/internal.h', + 'src/crypto/fipsmodule/digest/md32_common.h', + 'src/crypto/fipsmodule/ec/internal.h', + 'src/crypto/fipsmodule/ec/p256-x86_64-table.h', + 'src/crypto/fipsmodule/ec/p256-x86_64.h', + 'src/crypto/fipsmodule/is_fips.c', + 'src/crypto/fipsmodule/modes/internal.h', + 'src/crypto/fipsmodule/rand/internal.h', + 'src/crypto/fipsmodule/rsa/internal.h', 'src/crypto/hkdf/hkdf.c', - 'src/crypto/hmac/hmac.c', + 'src/crypto/internal.h', 'src/crypto/lhash/lhash.c', - 'src/crypto/md4/md4.c', - 'src/crypto/md5/md5.c', 'src/crypto/mem.c', - 'src/crypto/modes/cbc.c', - 'src/crypto/modes/cfb.c', - 'src/crypto/modes/ctr.c', - 'src/crypto/modes/gcm.c', - 'src/crypto/modes/ofb.c', - 'src/crypto/newhope/error_correction.c', - 'src/crypto/newhope/newhope.c', - 'src/crypto/newhope/ntt.c', - 'src/crypto/newhope/poly.c', - 'src/crypto/newhope/precomp.c', - 'src/crypto/newhope/reduce.c', 'src/crypto/obj/obj.c', + 'src/crypto/obj/obj_dat.h', 'src/crypto/obj/obj_xref.c', 'src/crypto/pem/pem_all.c', 'src/crypto/pem/pem_info.c', @@ -198,34 +188,33 @@ 'src/crypto/pem/pem_pkey.c', 'src/crypto/pem/pem_x509.c', 'src/crypto/pem/pem_xaux.c', - 'src/crypto/pkcs8/p5_pbe.c', + 'src/crypto/pkcs7/internal.h', + 'src/crypto/pkcs7/pkcs7.c', + 'src/crypto/pkcs7/pkcs7_x509.c', + 'src/crypto/pkcs8/internal.h', 'src/crypto/pkcs8/p5_pbev2.c', - 'src/crypto/pkcs8/p8_pkey.c', 'src/crypto/pkcs8/pkcs8.c', + 'src/crypto/pkcs8/pkcs8_x509.c', + 'src/crypto/poly1305/internal.h', 'src/crypto/poly1305/poly1305.c', 'src/crypto/poly1305/poly1305_arm.c', 'src/crypto/poly1305/poly1305_vec.c', - 'src/crypto/rand/deterministic.c', - 'src/crypto/rand/rand.c', - 'src/crypto/rand/urandom.c', - 'src/crypto/rand/windows.c', + 'src/crypto/pool/internal.h', + 'src/crypto/pool/pool.c', + 
'src/crypto/rand_extra/deterministic.c', + 'src/crypto/rand_extra/forkunsafe.c', + 'src/crypto/rand_extra/fuchsia.c', + 'src/crypto/rand_extra/rand_extra.c', + 'src/crypto/rand_extra/windows.c', 'src/crypto/rc4/rc4.c', 'src/crypto/refcount_c11.c', 'src/crypto/refcount_lock.c', - 'src/crypto/rsa/blinding.c', - 'src/crypto/rsa/padding.c', - 'src/crypto/rsa/rsa.c', - 'src/crypto/rsa/rsa_asn1.c', - 'src/crypto/rsa/rsa_impl.c', - 'src/crypto/sha/sha1.c', - 'src/crypto/sha/sha256.c', - 'src/crypto/sha/sha512.c', + 'src/crypto/rsa_extra/rsa_asn1.c', 'src/crypto/stack/stack.c', 'src/crypto/thread.c', 'src/crypto/thread_none.c', 'src/crypto/thread_pthread.c', 'src/crypto/thread_win.c', - 'src/crypto/time_support.c', 'src/crypto/x509/a_digest.c', 'src/crypto/x509/a_sign.c', 'src/crypto/x509/a_strex.c', @@ -234,13 +223,15 @@ 'src/crypto/x509/asn1_gen.c', 'src/crypto/x509/by_dir.c', 'src/crypto/x509/by_file.c', + 'src/crypto/x509/charmap.h', 'src/crypto/x509/i2d_pr.c', - 'src/crypto/x509/pkcs7.c', + 'src/crypto/x509/internal.h', 'src/crypto/x509/rsa_pss.c', 'src/crypto/x509/t_crl.c', 'src/crypto/x509/t_req.c', 'src/crypto/x509/t_x509.c', 'src/crypto/x509/t_x509a.c', + 'src/crypto/x509/vpm_int.h', 'src/crypto/x509/x509.c', 'src/crypto/x509/x509_att.c', 'src/crypto/x509/x509_cmp.c', @@ -261,7 +252,6 @@ 'src/crypto/x509/x509name.c', 'src/crypto/x509/x509rset.c', 'src/crypto/x509/x509spki.c', - 'src/crypto/x509/x509type.c', 'src/crypto/x509/x_algor.c', 'src/crypto/x509/x_all.c', 'src/crypto/x509/x_attrib.c', @@ -277,8 +267,10 @@ 'src/crypto/x509/x_val.c', 'src/crypto/x509/x_x509.c', 'src/crypto/x509/x_x509a.c', + 'src/crypto/x509v3/ext_dat.h', 'src/crypto/x509v3/pcy_cache.c', 'src/crypto/x509v3/pcy_data.c', + 'src/crypto/x509v3/pcy_int.h', 'src/crypto/x509v3/pcy_lib.c', 'src/crypto/x509v3/pcy_map.c', 'src/crypto/x509v3/pcy_node.c', @@ -309,136 +301,229 @@ 'src/crypto/x509v3/v3_skey.c', 'src/crypto/x509v3/v3_sxnet.c', 'src/crypto/x509v3/v3_utl.c', + 'src/include/openssl/aead.h', + 'src/include/openssl/aes.h', + 'src/include/openssl/arm_arch.h', + 'src/include/openssl/asn1.h', + 'src/include/openssl/asn1_mac.h', + 'src/include/openssl/asn1t.h', + 'src/include/openssl/base.h', + 'src/include/openssl/base64.h', + 'src/include/openssl/bio.h', + 'src/include/openssl/blowfish.h', + 'src/include/openssl/bn.h', + 'src/include/openssl/buf.h', + 'src/include/openssl/buffer.h', + 'src/include/openssl/bytestring.h', + 'src/include/openssl/cast.h', + 'src/include/openssl/chacha.h', + 'src/include/openssl/cipher.h', + 'src/include/openssl/cmac.h', + 'src/include/openssl/conf.h', + 'src/include/openssl/cpu.h', + 'src/include/openssl/crypto.h', + 'src/include/openssl/curve25519.h', + 'src/include/openssl/des.h', + 'src/include/openssl/dh.h', + 'src/include/openssl/digest.h', + 'src/include/openssl/dsa.h', + 'src/include/openssl/ec.h', + 'src/include/openssl/ec_key.h', + 'src/include/openssl/ecdh.h', + 'src/include/openssl/ecdsa.h', + 'src/include/openssl/engine.h', + 'src/include/openssl/err.h', + 'src/include/openssl/evp.h', + 'src/include/openssl/ex_data.h', + 'src/include/openssl/hkdf.h', + 'src/include/openssl/hmac.h', + 'src/include/openssl/is_boringssl.h', + 'src/include/openssl/lhash.h', + 'src/include/openssl/lhash_macros.h', + 'src/include/openssl/md4.h', + 'src/include/openssl/md5.h', + 'src/include/openssl/mem.h', + 'src/include/openssl/nid.h', + 'src/include/openssl/obj.h', + 'src/include/openssl/obj_mac.h', + 'src/include/openssl/objects.h', + 'src/include/openssl/opensslconf.h', + 
'src/include/openssl/opensslv.h', + 'src/include/openssl/ossl_typ.h', + 'src/include/openssl/pem.h', + 'src/include/openssl/pkcs12.h', + 'src/include/openssl/pkcs7.h', + 'src/include/openssl/pkcs8.h', + 'src/include/openssl/poly1305.h', + 'src/include/openssl/pool.h', + 'src/include/openssl/rand.h', + 'src/include/openssl/rc4.h', + 'src/include/openssl/ripemd.h', + 'src/include/openssl/rsa.h', + 'src/include/openssl/safestack.h', + 'src/include/openssl/sha.h', + 'src/include/openssl/span.h', + 'src/include/openssl/srtp.h', + 'src/include/openssl/stack.h', + 'src/include/openssl/thread.h', + 'src/include/openssl/type_check.h', + 'src/include/openssl/x509.h', + 'src/include/openssl/x509_vfy.h', + 'src/include/openssl/x509v3.h', + 'src/third_party/fiat/curve25519.c', + 'src/third_party/fiat/internal.h', + ], + 'boringssl_ios_aarch64_sources': [ + 'ios-aarch64/crypto/chacha/chacha-armv8.S', + 'ios-aarch64/crypto/fipsmodule/aesv8-armx64.S', + 'ios-aarch64/crypto/fipsmodule/armv8-mont.S', + 'ios-aarch64/crypto/fipsmodule/ghashv8-armx64.S', + 'ios-aarch64/crypto/fipsmodule/sha1-armv8.S', + 'ios-aarch64/crypto/fipsmodule/sha256-armv8.S', + 'ios-aarch64/crypto/fipsmodule/sha512-armv8.S', + ], + 'boringssl_ios_arm_sources': [ + 'ios-arm/crypto/chacha/chacha-armv4.S', + 'ios-arm/crypto/fipsmodule/aes-armv4.S', + 'ios-arm/crypto/fipsmodule/aesv8-armx32.S', + 'ios-arm/crypto/fipsmodule/armv4-mont.S', + 'ios-arm/crypto/fipsmodule/bsaes-armv7.S', + 'ios-arm/crypto/fipsmodule/ghash-armv4.S', + 'ios-arm/crypto/fipsmodule/ghashv8-armx32.S', + 'ios-arm/crypto/fipsmodule/sha1-armv4-large.S', + 'ios-arm/crypto/fipsmodule/sha256-armv4.S', + 'ios-arm/crypto/fipsmodule/sha512-armv4.S', ], 'boringssl_linux_aarch64_sources': [ - 'linux-aarch64/crypto/aes/aesv8-armx64.S', - 'linux-aarch64/crypto/bn/armv8-mont.S', 'linux-aarch64/crypto/chacha/chacha-armv8.S', - 'linux-aarch64/crypto/modes/ghashv8-armx64.S', - 'linux-aarch64/crypto/sha/sha1-armv8.S', - 'linux-aarch64/crypto/sha/sha256-armv8.S', - 'linux-aarch64/crypto/sha/sha512-armv8.S', + 'linux-aarch64/crypto/fipsmodule/aesv8-armx64.S', + 'linux-aarch64/crypto/fipsmodule/armv8-mont.S', + 'linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S', + 'linux-aarch64/crypto/fipsmodule/sha1-armv8.S', + 'linux-aarch64/crypto/fipsmodule/sha256-armv8.S', + 'linux-aarch64/crypto/fipsmodule/sha512-armv8.S', ], 'boringssl_linux_arm_sources': [ - 'linux-arm/crypto/aes/aes-armv4.S', - 'linux-arm/crypto/aes/aesv8-armx32.S', - 'linux-arm/crypto/aes/bsaes-armv7.S', - 'linux-arm/crypto/bn/armv4-mont.S', 'linux-arm/crypto/chacha/chacha-armv4.S', - 'linux-arm/crypto/modes/ghash-armv4.S', - 'linux-arm/crypto/modes/ghashv8-armx32.S', - 'linux-arm/crypto/sha/sha1-armv4-large.S', - 'linux-arm/crypto/sha/sha256-armv4.S', - 'linux-arm/crypto/sha/sha512-armv4.S', + 'linux-arm/crypto/fipsmodule/aes-armv4.S', + 'linux-arm/crypto/fipsmodule/aesv8-armx32.S', + 'linux-arm/crypto/fipsmodule/armv4-mont.S', + 'linux-arm/crypto/fipsmodule/bsaes-armv7.S', + 'linux-arm/crypto/fipsmodule/ghash-armv4.S', + 'linux-arm/crypto/fipsmodule/ghashv8-armx32.S', + 'linux-arm/crypto/fipsmodule/sha1-armv4-large.S', + 'linux-arm/crypto/fipsmodule/sha256-armv4.S', + 'linux-arm/crypto/fipsmodule/sha512-armv4.S', 'src/crypto/curve25519/asm/x25519-asm-arm.S', 'src/crypto/poly1305/poly1305_arm_asm.S', ], + 'boringssl_linux_ppc64le_sources': [ + 'linux-ppc64le/crypto/fipsmodule/aesp8-ppc.S', + 'linux-ppc64le/crypto/fipsmodule/ghashp8-ppc.S', + ], 'boringssl_linux_x86_sources': [ - 'linux-x86/crypto/aes/aes-586.S', - 
'linux-x86/crypto/aes/aesni-x86.S', - 'linux-x86/crypto/aes/vpaes-x86.S', - 'linux-x86/crypto/bn/bn-586.S', - 'linux-x86/crypto/bn/co-586.S', - 'linux-x86/crypto/bn/x86-mont.S', 'linux-x86/crypto/chacha/chacha-x86.S', - 'linux-x86/crypto/md5/md5-586.S', - 'linux-x86/crypto/modes/ghash-x86.S', - 'linux-x86/crypto/rc4/rc4-586.S', - 'linux-x86/crypto/sha/sha1-586.S', - 'linux-x86/crypto/sha/sha256-586.S', - 'linux-x86/crypto/sha/sha512-586.S', + 'linux-x86/crypto/fipsmodule/aes-586.S', + 'linux-x86/crypto/fipsmodule/aesni-x86.S', + 'linux-x86/crypto/fipsmodule/bn-586.S', + 'linux-x86/crypto/fipsmodule/co-586.S', + 'linux-x86/crypto/fipsmodule/ghash-x86.S', + 'linux-x86/crypto/fipsmodule/md5-586.S', + 'linux-x86/crypto/fipsmodule/sha1-586.S', + 'linux-x86/crypto/fipsmodule/sha256-586.S', + 'linux-x86/crypto/fipsmodule/sha512-586.S', + 'linux-x86/crypto/fipsmodule/vpaes-x86.S', + 'linux-x86/crypto/fipsmodule/x86-mont.S', ], 'boringssl_linux_x86_64_sources': [ - 'linux-x86_64/crypto/aes/aes-x86_64.S', - 'linux-x86_64/crypto/aes/aesni-x86_64.S', - 'linux-x86_64/crypto/aes/bsaes-x86_64.S', - 'linux-x86_64/crypto/aes/vpaes-x86_64.S', - 'linux-x86_64/crypto/bn/rsaz-avx2.S', - 'linux-x86_64/crypto/bn/rsaz-x86_64.S', - 'linux-x86_64/crypto/bn/x86_64-mont.S', - 'linux-x86_64/crypto/bn/x86_64-mont5.S', 'linux-x86_64/crypto/chacha/chacha-x86_64.S', - 'linux-x86_64/crypto/ec/p256-x86_64-asm.S', - 'linux-x86_64/crypto/md5/md5-x86_64.S', - 'linux-x86_64/crypto/modes/aesni-gcm-x86_64.S', - 'linux-x86_64/crypto/modes/ghash-x86_64.S', - 'linux-x86_64/crypto/rand/rdrand-x86_64.S', - 'linux-x86_64/crypto/rc4/rc4-x86_64.S', - 'linux-x86_64/crypto/sha/sha1-x86_64.S', - 'linux-x86_64/crypto/sha/sha256-x86_64.S', - 'linux-x86_64/crypto/sha/sha512-x86_64.S', + 'linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S', + 'linux-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S', + 'linux-x86_64/crypto/fipsmodule/aes-x86_64.S', + 'linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S', + 'linux-x86_64/crypto/fipsmodule/aesni-x86_64.S', + 'linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S', + 'linux-x86_64/crypto/fipsmodule/ghash-x86_64.S', + 'linux-x86_64/crypto/fipsmodule/md5-x86_64.S', + 'linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S', + 'linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S', + 'linux-x86_64/crypto/fipsmodule/rsaz-avx2.S', + 'linux-x86_64/crypto/fipsmodule/sha1-x86_64.S', + 'linux-x86_64/crypto/fipsmodule/sha256-x86_64.S', + 'linux-x86_64/crypto/fipsmodule/sha512-x86_64.S', + 'linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S', + 'linux-x86_64/crypto/fipsmodule/x86_64-mont.S', + 'linux-x86_64/crypto/fipsmodule/x86_64-mont5.S', 'src/crypto/curve25519/asm/x25519-asm-x86_64.S', ], 'boringssl_mac_x86_sources': [ - 'mac-x86/crypto/aes/aes-586.S', - 'mac-x86/crypto/aes/aesni-x86.S', - 'mac-x86/crypto/aes/vpaes-x86.S', - 'mac-x86/crypto/bn/bn-586.S', - 'mac-x86/crypto/bn/co-586.S', - 'mac-x86/crypto/bn/x86-mont.S', 'mac-x86/crypto/chacha/chacha-x86.S', - 'mac-x86/crypto/md5/md5-586.S', - 'mac-x86/crypto/modes/ghash-x86.S', - 'mac-x86/crypto/rc4/rc4-586.S', - 'mac-x86/crypto/sha/sha1-586.S', - 'mac-x86/crypto/sha/sha256-586.S', - 'mac-x86/crypto/sha/sha512-586.S', + 'mac-x86/crypto/fipsmodule/aes-586.S', + 'mac-x86/crypto/fipsmodule/aesni-x86.S', + 'mac-x86/crypto/fipsmodule/bn-586.S', + 'mac-x86/crypto/fipsmodule/co-586.S', + 'mac-x86/crypto/fipsmodule/ghash-x86.S', + 'mac-x86/crypto/fipsmodule/md5-586.S', + 'mac-x86/crypto/fipsmodule/sha1-586.S', + 'mac-x86/crypto/fipsmodule/sha256-586.S', + 
'mac-x86/crypto/fipsmodule/sha512-586.S', + 'mac-x86/crypto/fipsmodule/vpaes-x86.S', + 'mac-x86/crypto/fipsmodule/x86-mont.S', ], 'boringssl_mac_x86_64_sources': [ - 'mac-x86_64/crypto/aes/aes-x86_64.S', - 'mac-x86_64/crypto/aes/aesni-x86_64.S', - 'mac-x86_64/crypto/aes/bsaes-x86_64.S', - 'mac-x86_64/crypto/aes/vpaes-x86_64.S', - 'mac-x86_64/crypto/bn/rsaz-avx2.S', - 'mac-x86_64/crypto/bn/rsaz-x86_64.S', - 'mac-x86_64/crypto/bn/x86_64-mont.S', - 'mac-x86_64/crypto/bn/x86_64-mont5.S', 'mac-x86_64/crypto/chacha/chacha-x86_64.S', - 'mac-x86_64/crypto/ec/p256-x86_64-asm.S', - 'mac-x86_64/crypto/md5/md5-x86_64.S', - 'mac-x86_64/crypto/modes/aesni-gcm-x86_64.S', - 'mac-x86_64/crypto/modes/ghash-x86_64.S', - 'mac-x86_64/crypto/rand/rdrand-x86_64.S', - 'mac-x86_64/crypto/rc4/rc4-x86_64.S', - 'mac-x86_64/crypto/sha/sha1-x86_64.S', - 'mac-x86_64/crypto/sha/sha256-x86_64.S', - 'mac-x86_64/crypto/sha/sha512-x86_64.S', + 'mac-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S', + 'mac-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S', + 'mac-x86_64/crypto/fipsmodule/aes-x86_64.S', + 'mac-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S', + 'mac-x86_64/crypto/fipsmodule/aesni-x86_64.S', + 'mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S', + 'mac-x86_64/crypto/fipsmodule/ghash-x86_64.S', + 'mac-x86_64/crypto/fipsmodule/md5-x86_64.S', + 'mac-x86_64/crypto/fipsmodule/p256-x86_64-asm.S', + 'mac-x86_64/crypto/fipsmodule/rdrand-x86_64.S', + 'mac-x86_64/crypto/fipsmodule/rsaz-avx2.S', + 'mac-x86_64/crypto/fipsmodule/sha1-x86_64.S', + 'mac-x86_64/crypto/fipsmodule/sha256-x86_64.S', + 'mac-x86_64/crypto/fipsmodule/sha512-x86_64.S', + 'mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S', + 'mac-x86_64/crypto/fipsmodule/x86_64-mont.S', + 'mac-x86_64/crypto/fipsmodule/x86_64-mont5.S', 'src/crypto/curve25519/asm/x25519-asm-x86_64.S', ], 'boringssl_win_x86_sources': [ - 'win-x86/crypto/aes/aes-586.asm', - 'win-x86/crypto/aes/aesni-x86.asm', - 'win-x86/crypto/aes/vpaes-x86.asm', - 'win-x86/crypto/bn/bn-586.asm', - 'win-x86/crypto/bn/co-586.asm', - 'win-x86/crypto/bn/x86-mont.asm', 'win-x86/crypto/chacha/chacha-x86.asm', - 'win-x86/crypto/md5/md5-586.asm', - 'win-x86/crypto/modes/ghash-x86.asm', - 'win-x86/crypto/rc4/rc4-586.asm', - 'win-x86/crypto/sha/sha1-586.asm', - 'win-x86/crypto/sha/sha256-586.asm', - 'win-x86/crypto/sha/sha512-586.asm', + 'win-x86/crypto/fipsmodule/aes-586.asm', + 'win-x86/crypto/fipsmodule/aesni-x86.asm', + 'win-x86/crypto/fipsmodule/bn-586.asm', + 'win-x86/crypto/fipsmodule/co-586.asm', + 'win-x86/crypto/fipsmodule/ghash-x86.asm', + 'win-x86/crypto/fipsmodule/md5-586.asm', + 'win-x86/crypto/fipsmodule/sha1-586.asm', + 'win-x86/crypto/fipsmodule/sha256-586.asm', + 'win-x86/crypto/fipsmodule/sha512-586.asm', + 'win-x86/crypto/fipsmodule/vpaes-x86.asm', + 'win-x86/crypto/fipsmodule/x86-mont.asm', ], 'boringssl_win_x86_64_sources': [ - 'win-x86_64/crypto/aes/aes-x86_64.asm', - 'win-x86_64/crypto/aes/aesni-x86_64.asm', - 'win-x86_64/crypto/aes/bsaes-x86_64.asm', - 'win-x86_64/crypto/aes/vpaes-x86_64.asm', - 'win-x86_64/crypto/bn/rsaz-avx2.asm', - 'win-x86_64/crypto/bn/rsaz-x86_64.asm', - 'win-x86_64/crypto/bn/x86_64-mont.asm', - 'win-x86_64/crypto/bn/x86_64-mont5.asm', 'win-x86_64/crypto/chacha/chacha-x86_64.asm', - 'win-x86_64/crypto/ec/p256-x86_64-asm.asm', - 'win-x86_64/crypto/md5/md5-x86_64.asm', - 'win-x86_64/crypto/modes/aesni-gcm-x86_64.asm', - 'win-x86_64/crypto/modes/ghash-x86_64.asm', - 'win-x86_64/crypto/rand/rdrand-x86_64.asm', - 'win-x86_64/crypto/rc4/rc4-x86_64.asm', - 
'win-x86_64/crypto/sha/sha1-x86_64.asm', - 'win-x86_64/crypto/sha/sha256-x86_64.asm', - 'win-x86_64/crypto/sha/sha512-x86_64.asm', + 'win-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.asm', + 'win-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.asm', + 'win-x86_64/crypto/fipsmodule/aes-x86_64.asm', + 'win-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.asm', + 'win-x86_64/crypto/fipsmodule/aesni-x86_64.asm', + 'win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm', + 'win-x86_64/crypto/fipsmodule/ghash-x86_64.asm', + 'win-x86_64/crypto/fipsmodule/md5-x86_64.asm', + 'win-x86_64/crypto/fipsmodule/p256-x86_64-asm.asm', + 'win-x86_64/crypto/fipsmodule/rdrand-x86_64.asm', + 'win-x86_64/crypto/fipsmodule/rsaz-avx2.asm', + 'win-x86_64/crypto/fipsmodule/sha1-x86_64.asm', + 'win-x86_64/crypto/fipsmodule/sha256-x86_64.asm', + 'win-x86_64/crypto/fipsmodule/sha512-x86_64.asm', + 'win-x86_64/crypto/fipsmodule/vpaes-x86_64.asm', + 'win-x86_64/crypto/fipsmodule/x86_64-mont.asm', + 'win-x86_64/crypto/fipsmodule/x86_64-mont5.asm', ], } } diff --git a/packager/third_party/boringssl/boringssl_tests.gypi b/packager/third_party/boringssl/boringssl_tests.gypi deleted file mode 100644 index 8542e5c20a..0000000000 --- a/packager/third_party/boringssl/boringssl_tests.gypi +++ /dev/null @@ -1,686 +0,0 @@ -# Copyright (c) 2016 The Chromium Authors. All rights reserved. -# Use of this source code is governed by a BSD-style license that can be -# found in the LICENSE file. - -# This file is created by generate_build_files.py. Do not edit manually. - -{ - 'targets': [ - { - 'target_name': 'boringssl_aes_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/aes/aes_test.cc', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_asn1_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/asn1/asn1_test.cc', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_base64_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/base64/base64_test.cc', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_bio_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/bio/bio_test.cc', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_bn_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/bn/bn_test.cc', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. 
- # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_bytestring_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/bytestring/bytestring_test.cc', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_chacha_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/chacha/chacha_test.cc', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_aead_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/cipher/aead_test.cc', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_cipher_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/cipher/cipher_test.cc', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_cmac_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/cmac/cmac_test.cc', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_constant_time_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/constant_time_test.c', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_ed25519_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/curve25519/ed25519_test.cc', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_spake25519_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/curve25519/spake25519_test.cc', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_x25519_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/curve25519/x25519_test.cc', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_dh_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/dh/dh_test.cc', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. 
- # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_digest_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/digest/digest_test.cc', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_dsa_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/dsa/dsa_test.c', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_ec_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/ec/ec_test.cc', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_example_mul', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/ec/example_mul.c', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_ecdsa_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/ecdsa/ecdsa_test.cc', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_err_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/err/err_test.cc', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_evp_extra_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/evp/evp_extra_test.cc', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_evp_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/evp/evp_test.cc', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_pbkdf_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/evp/pbkdf_test.cc', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_hkdf_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/hkdf/hkdf_test.c', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. 
- # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_hmac_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/hmac/hmac_test.cc', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_lhash_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/lhash/lhash_test.c', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_gcm_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/modes/gcm_test.c', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_newhope_statistical_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/newhope/newhope_statistical_test.cc', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_newhope_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/newhope/newhope_test.cc', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_newhope_vectors_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/newhope/newhope_vectors_test.cc', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_obj_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/obj/obj_test.cc', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_pkcs12_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/pkcs8/pkcs12_test.cc', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_pkcs8_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/pkcs8/pkcs8_test.cc', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_poly1305_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/poly1305/poly1305_test.cc', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. 
- # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_refcount_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/refcount_test.c', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_rsa_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/rsa/rsa_test.cc', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_thread_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/thread_test.c', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_pkcs7_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/x509/pkcs7_test.c', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_x509_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/x509/x509_test.cc', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_tab_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/x509v3/tab_test.c', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_v3name_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/crypto/x509v3/v3name_test.c', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_pqueue_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/ssl/pqueue/pqueue_test.c', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. - # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - { - 'target_name': 'boringssl_ssl_test', - 'type': 'executable', - 'dependencies': [ - 'boringssl.gyp:boringssl', - ], - 'sources': [ - 'src/ssl/ssl_test.cc', - '<@(boringssl_test_support_sources)', - ], - # TODO(davidben): Fix size_t truncations in BoringSSL. 
- # https://crbug.com/429039 - 'msvs_disabled_warnings': [ 4267, ], - }, - ], - 'variables': { - 'boringssl_test_support_sources': [ - 'src/crypto/test/file_test.cc', - 'src/crypto/test/file_test.h', - 'src/crypto/test/malloc.cc', - 'src/crypto/test/scoped_types.h', - 'src/crypto/test/test_util.cc', - 'src/crypto/test/test_util.h', - 'src/ssl/test/async_bio.h', - 'src/ssl/test/packeted_bio.h', - 'src/ssl/test/scoped_types.h', - 'src/ssl/test/test_config.h', - ], - 'boringssl_test_targets': [ - 'boringssl_aead_test', - 'boringssl_aes_test', - 'boringssl_asn1_test', - 'boringssl_base64_test', - 'boringssl_bio_test', - 'boringssl_bn_test', - 'boringssl_bytestring_test', - 'boringssl_chacha_test', - 'boringssl_cipher_test', - 'boringssl_cmac_test', - 'boringssl_constant_time_test', - 'boringssl_dh_test', - 'boringssl_digest_test', - 'boringssl_dsa_test', - 'boringssl_ec_test', - 'boringssl_ecdsa_test', - 'boringssl_ed25519_test', - 'boringssl_err_test', - 'boringssl_evp_extra_test', - 'boringssl_evp_test', - 'boringssl_example_mul', - 'boringssl_gcm_test', - 'boringssl_hkdf_test', - 'boringssl_hmac_test', - 'boringssl_lhash_test', - 'boringssl_newhope_statistical_test', - 'boringssl_newhope_test', - 'boringssl_newhope_vectors_test', - 'boringssl_obj_test', - 'boringssl_pbkdf_test', - 'boringssl_pkcs12_test', - 'boringssl_pkcs7_test', - 'boringssl_pkcs8_test', - 'boringssl_poly1305_test', - 'boringssl_pqueue_test', - 'boringssl_refcount_test', - 'boringssl_rsa_test', - 'boringssl_spake25519_test', - 'boringssl_ssl_test', - 'boringssl_tab_test', - 'boringssl_thread_test', - 'boringssl_v3name_test', - 'boringssl_x25519_test', - 'boringssl_x509_test', - ], - } -} diff --git a/packager/third_party/boringssl/err_data.c b/packager/third_party/boringssl/err_data.c index 32976ad424..931a44b643 100644 --- a/packager/third_party/boringssl/err_data.c +++ b/packager/third_party/boringssl/err_data.c @@ -53,6 +53,7 @@ OPENSSL_COMPILE_ASSERT(ERR_LIB_HKDF == 31, library_values_changed_31); OPENSSL_COMPILE_ASSERT(ERR_LIB_USER == 32, library_values_changed_32); OPENSSL_COMPILE_ASSERT(ERR_NUM_LIBS == 33, library_values_changed_num); +// clang-format off const uint32_t kOpenSSLReasonValues[] = { 0xc320838, 0xc328852, @@ -62,158 +63,166 @@ const uint32_t kOpenSSLReasonValues[] = { 0xc348899, 0xc3508a5, 0xc3588c2, - 0xc3608d4, - 0xc3688e2, - 0xc3708f2, - 0xc3788ff, - 0xc38090f, - 0xc38891a, - 0xc390930, - 0xc39893f, - 0xc3a0953, + 0xc3608e2, + 0xc3688f0, + 0xc370900, + 0xc37890d, + 0xc38091d, + 0xc388928, + 0xc39093e, + 0xc39894d, + 0xc3a0961, 0xc3a8845, 0xc3b00ea, + 0xc3b88d4, 0x10320845, - 0x1032939a, - 0x103313a6, - 0x103393bf, - 0x103413d2, - 0x10348e7a, - 0x10350c19, - 0x103593e5, - 0x103613fa, - 0x1036940d, - 0x1037142c, - 0x10379445, - 0x1038145a, - 0x10389478, - 0x10391487, - 0x103994a3, - 0x103a14be, - 0x103a94cd, - 0x103b14e9, - 0x103b9504, - 0x103c151b, + 0x10329535, + 0x10331541, + 0x1033955a, + 0x1034156d, + 0x10348efc, + 0x10350c5e, + 0x10359580, + 0x10361595, + 0x103695a8, + 0x103715c7, + 0x103795e0, + 0x103815f5, + 0x10389613, + 0x10391622, + 0x1039963e, + 0x103a1659, + 0x103a9668, + 0x103b1684, + 0x103b969f, + 0x103c16b6, 0x103c80ea, - 0x103d152c, - 0x103d9540, - 0x103e155f, - 0x103e956e, - 0x103f1585, - 0x103f9598, - 0x10400bea, - 0x104095ab, - 0x104115c9, - 0x104195dc, - 0x104215f6, - 0x10429606, - 0x1043161a, - 0x10439630, - 0x10441648, - 0x1044965d, - 0x10451671, - 0x10459683, + 0x103d16c7, + 0x103d96db, + 0x103e16fa, + 0x103e9709, + 0x103f1720, + 0x103f9733, + 0x10400c22, + 0x10409746, + 
0x10411764, + 0x10419777, + 0x10421791, + 0x104297a1, + 0x104317b5, + 0x104397cb, + 0x104417e3, + 0x104497f8, + 0x1045180c, + 0x1045981e, 0x104605fb, - 0x1046893f, - 0x10471698, - 0x104796af, - 0x104816c4, - 0x104896d2, - 0x14320bcd, - 0x14328bdb, - 0x14330bea, - 0x14338bfc, + 0x1046894d, + 0x10471833, + 0x1047984a, + 0x1048185f, + 0x1048986d, + 0x10490e5e, + 0x14320c05, + 0x14328c13, + 0x14330c22, + 0x14338c34, 0x143400ac, 0x143480ea, 0x18320083, - 0x18328ed0, + 0x18328f52, 0x183300ac, - 0x18338ee6, - 0x18340efa, + 0x18338f68, + 0x18340f7c, 0x183480ea, - 0x18350f0f, - 0x18358f27, - 0x18360f3c, - 0x18368f50, - 0x18370f74, - 0x18378f8a, - 0x18380f9e, - 0x18388fae, - 0x18390a57, - 0x18398fbe, - 0x183a0fd3, - 0x183a8fe7, - 0x183b0c25, - 0x183b8ff4, - 0x183c1006, - 0x183c9011, - 0x183d1021, - 0x183d9032, - 0x183e1043, - 0x183e9055, - 0x183f107e, - 0x183f9097, - 0x184010af, + 0x18350f91, + 0x18358fa9, + 0x18360fbe, + 0x18368fd2, + 0x18370ff6, + 0x1837900c, + 0x18381020, + 0x18389030, + 0x18390a73, + 0x18399040, + 0x183a1068, + 0x183a908e, + 0x183b0c6a, + 0x183b90c3, + 0x183c10d5, + 0x183c90e0, + 0x183d10f0, + 0x183d9101, + 0x183e1112, + 0x183e9124, + 0x183f114d, + 0x183f9166, + 0x1840117e, 0x184086d3, - 0x203210d6, - 0x243210e2, - 0x24328985, - 0x243310f4, - 0x24339101, - 0x2434110e, - 0x24349120, - 0x2435112f, - 0x2435914c, - 0x24361159, - 0x24369167, - 0x24371175, - 0x24379183, - 0x2438118c, - 0x24389199, - 0x243911ac, - 0x28320c0d, - 0x28328c25, - 0x28330bea, - 0x28338c38, - 0x28340c19, + 0x184110b1, + 0x1841907c, + 0x1842109b, + 0x18429055, + 0x203211b8, + 0x203291a5, + 0x243211c4, + 0x24328993, + 0x243311d6, + 0x243391e3, + 0x243411f0, + 0x24349202, + 0x24351211, + 0x2435922e, + 0x2436123b, + 0x24369249, + 0x24371257, + 0x24379265, + 0x2438126e, + 0x2438927b, + 0x2439128e, + 0x28320c52, + 0x28328c6a, + 0x28330c22, + 0x28338c7d, + 0x28340c5e, 0x283480ac, 0x283500ea, - 0x2c32274a, - 0x2c32a758, - 0x2c33276a, - 0x2c33a77c, - 0x2c342790, - 0x2c34a7a2, - 0x2c3527bd, - 0x2c35a7cf, - 0x2c3627e2, + 0x2c322c6c, + 0x2c3292a5, + 0x2c332c7a, + 0x2c33ac8c, + 0x2c342ca0, + 0x2c34acb2, + 0x2c352ccd, + 0x2c35acdf, + 0x2c362cf2, 0x2c36832d, - 0x2c3727ef, - 0x2c37a801, - 0x2c382814, - 0x2c38a82b, - 0x2c392839, - 0x2c39a849, - 0x2c3a285b, - 0x2c3aa86f, - 0x2c3b2880, - 0x2c3ba89f, - 0x2c3c28b3, - 0x2c3ca8c9, - 0x2c3d28e2, - 0x2c3da8ff, - 0x2c3e2910, - 0x2c3ea91e, - 0x2c3f2936, - 0x2c3fa94e, - 0x2c40295b, - 0x2c4090d6, - 0x2c41296c, - 0x2c41a97f, - 0x2c4210af, - 0x2c42a990, + 0x2c372cff, + 0x2c37ad11, + 0x2c382d36, + 0x2c38ad4d, + 0x2c392d5b, + 0x2c39ad6b, + 0x2c3a2d7d, + 0x2c3aad91, + 0x2c3b2da2, + 0x2c3badc1, + 0x2c3c12b7, + 0x2c3c92cd, + 0x2c3d2dd5, + 0x2c3d92e6, + 0x2c3e2df2, + 0x2c3eae00, + 0x2c3f2e18, + 0x2c3fae30, + 0x2c402e3d, + 0x2c4091b8, + 0x2c412e4e, + 0x2c41ae61, + 0x2c42117e, + 0x2c42ae72, 0x2c430720, - 0x2c43a891, + 0x2c43adb3, + 0x2c442d24, 0x30320000, 0x30328015, 0x3033001f, @@ -306,224 +315,262 @@ const uint32_t kOpenSSLReasonValues[] = { 0x305e8700, 0x305f0716, 0x305f8720, - 0x34320b47, - 0x34328b5b, - 0x34330b78, - 0x34338b8b, - 0x34340b9a, - 0x34348bb7, + 0x34320b63, + 0x34328b77, + 0x34330b94, + 0x34338ba7, + 0x34340bb6, + 0x34348bef, + 0x34350bd3, 0x3c320083, - 0x3c328c62, - 0x3c330c7b, - 0x3c338c96, - 0x3c340cb3, - 0x3c348cdd, - 0x3c350cf8, - 0x3c358d0d, - 0x3c360d26, - 0x3c368d3e, - 0x3c370d4f, - 0x3c378d5d, - 0x3c380d6a, - 0x3c388d7e, - 0x3c390c25, - 0x3c398d92, - 0x3c3a0da6, - 0x3c3a88ff, - 0x3c3b0db6, - 0x3c3b8dd1, - 0x3c3c0de3, - 0x3c3c8df9, - 0x3c3d0e03, - 0x3c3d8e17, - 
0x3c3e0e25, - 0x3c3e8e4a, - 0x3c3f0c4e, - 0x3c3f8e33, + 0x3c328ca7, + 0x3c330cc0, + 0x3c338cdb, + 0x3c340cf8, + 0x3c348d22, + 0x3c350d3d, + 0x3c358d63, + 0x3c360d7c, + 0x3c368d94, + 0x3c370da5, + 0x3c378db3, + 0x3c380dc0, + 0x3c388dd4, + 0x3c390c6a, + 0x3c398df7, + 0x3c3a0e0b, + 0x3c3a890d, + 0x3c3b0e1b, + 0x3c3b8e36, + 0x3c3c0e48, + 0x3c3c8e7b, + 0x3c3d0e85, + 0x3c3d8e99, + 0x3c3e0ea7, + 0x3c3e8ecc, + 0x3c3f0c93, + 0x3c3f8eb5, 0x3c4000ac, 0x3c4080ea, - 0x3c410cce, - 0x403216e9, - 0x403296ff, - 0x4033172d, - 0x40339737, - 0x4034174e, - 0x4034976c, - 0x4035177c, - 0x4035978e, - 0x4036179b, - 0x403697a7, - 0x403717bc, - 0x403797ce, - 0x403817d9, - 0x403897eb, - 0x40390e7a, - 0x403997fb, - 0x403a180e, - 0x403a982f, - 0x403b1840, - 0x403b9850, + 0x3c410d13, + 0x3c418d52, + 0x3c420e5e, + 0x3c428de8, + 0x403218c6, + 0x403298dc, + 0x4033190a, + 0x40339914, + 0x4034192b, + 0x40349949, + 0x40351959, + 0x4035996b, + 0x40361978, + 0x40369984, + 0x40371999, + 0x403799ab, + 0x403819b6, + 0x403899c8, + 0x40390efc, + 0x403999d8, + 0x403a19eb, + 0x403a9a0c, + 0x403b1a1d, + 0x403b9a2d, 0x403c0064, 0x403c8083, - 0x403d185c, - 0x403d9872, - 0x403e1881, - 0x403e9894, - 0x403f18ae, - 0x403f98bc, - 0x404018d1, - 0x404098e5, - 0x40411902, - 0x4041991d, - 0x40421936, - 0x40429949, - 0x4043195d, - 0x40439975, - 0x4044198c, + 0x403d1ab1, + 0x403d9ac7, + 0x403e1ad6, + 0x403e9b0e, + 0x403f1b28, + 0x403f9b36, + 0x40401b4b, + 0x40409b5f, + 0x40411b7c, + 0x40419b97, + 0x40421bb0, + 0x40429bc3, + 0x40431bd7, + 0x40439bef, + 0x40441c06, 0x404480ac, - 0x404519a1, - 0x404599b3, - 0x404619d7, - 0x404699f7, - 0x40471a05, - 0x40479a19, - 0x40481a2e, - 0x40489a47, - 0x40491a5e, - 0x40499a78, - 0x404a1a8f, - 0x404a9aad, - 0x404b1ac5, - 0x404b9adc, - 0x404c1af2, - 0x404c9b04, - 0x404d1b25, - 0x404d9b47, - 0x404e1b5b, - 0x404e9b68, - 0x404f1b7f, - 0x404f9b8f, - 0x40501b9f, - 0x40509bb3, - 0x40511bce, - 0x40519bde, - 0x40521bf5, - 0x40529c07, - 0x40531c1f, - 0x40539c32, - 0x40541c47, - 0x40549c6a, - 0x40551c78, - 0x40559c95, - 0x40561ca2, - 0x40569cbb, - 0x40571cd3, - 0x40579ce6, - 0x40581cfb, - 0x40589d0d, - 0x40591d1d, - 0x40599d36, - 0x405a1d4a, - 0x405a9d5a, - 0x405b1d72, - 0x405b9d83, - 0x405c1d96, - 0x405c9da7, - 0x405d1db4, - 0x405d9dcb, - 0x405e1deb, - 0x405e8a95, - 0x405f1e0c, - 0x405f9e19, - 0x40601e27, - 0x40609e49, - 0x40611e71, - 0x40619e86, - 0x40621e9d, - 0x40629eae, - 0x40631ebf, - 0x40639ed4, - 0x40641eeb, - 0x40649efc, - 0x40651f17, - 0x40659f2e, - 0x40661f46, - 0x40669f70, - 0x40671f9b, - 0x40679fbc, - 0x40681fcf, - 0x40689ff0, - 0x40692022, - 0x4069a050, - 0x406a2071, - 0x406aa091, - 0x406b2219, - 0x406ba23c, - 0x406c2252, - 0x406ca47e, - 0x406d24ad, - 0x406da4d5, - 0x406e24ee, - 0x406ea506, - 0x406f2525, - 0x406fa53a, - 0x4070254d, - 0x4070a56a, + 0x40451c1b, + 0x40459c2d, + 0x40461c51, + 0x40469c71, + 0x40471c7f, + 0x40479ca6, + 0x40481ce3, + 0x40489d16, + 0x40491d2d, + 0x40499d47, + 0x404a1d5e, + 0x404a9d7c, + 0x404b1d94, + 0x404b9dab, + 0x404c1dc1, + 0x404c9dd3, + 0x404d1df4, + 0x404d9e16, + 0x404e1e2a, + 0x404e9e37, + 0x404f1e64, + 0x404f9e8d, + 0x40501ec8, + 0x40509edc, + 0x40511ef7, + 0x40521f07, + 0x40529f2b, + 0x40531f43, + 0x40539f56, + 0x40541f6b, + 0x40549f8e, + 0x40551f9c, + 0x40559fb9, + 0x40561fc6, + 0x40569fdf, + 0x40571ff7, + 0x4057a00a, + 0x4058201f, + 0x4058a046, + 0x40592075, + 0x4059a0a2, + 0x405a20b6, + 0x405aa0c6, + 0x405b20de, + 0x405ba0ef, + 0x405c2102, + 0x405ca141, + 0x405d214e, + 0x405da165, + 0x405e21a3, + 0x405e8ab1, + 0x405f21c4, + 0x405fa1d1, + 0x406021df, + 0x4060a201, + 
0x40612245, + 0x4061a27d, + 0x40622294, + 0x4062a2a5, + 0x406322b6, + 0x4063a2cb, + 0x406422e2, + 0x4064a30e, + 0x40652329, + 0x4065a340, + 0x40662358, + 0x4066a382, + 0x406723ad, + 0x4067a3ce, + 0x406823f5, + 0x4068a416, + 0x40692448, + 0x4069a476, + 0x406a2497, + 0x406aa4b7, + 0x406b263f, + 0x406ba662, + 0x406c2678, + 0x406ca8f3, + 0x406d2922, + 0x406da94a, + 0x406e2978, + 0x406ea9c5, + 0x406f29e4, + 0x406faa1c, + 0x40702a2f, + 0x4070aa4c, 0x40710800, - 0x4071a57c, - 0x4072258f, - 0x4072a5a8, - 0x407325c0, - 0x4073935c, - 0x407425d4, - 0x4074a5ee, - 0x407525ff, - 0x4075a613, - 0x40762621, - 0x40769199, - 0x40772646, - 0x4077a668, - 0x40782683, - 0x4078a698, - 0x407926af, - 0x4079a6c5, - 0x407a26d1, - 0x407aa6e4, - 0x407b26f9, - 0x407ba70b, - 0x407c2720, - 0x407ca729, - 0x407d200b, - 0x41f42144, - 0x41f921d6, - 0x41fe20c9, - 0x41fea2a5, - 0x41ff2396, - 0x4203215d, - 0x4208217f, - 0x4208a1bb, - 0x420920ad, - 0x4209a1f5, - 0x420a2104, - 0x420aa0e4, - 0x420b2124, - 0x420ba19d, - 0x420c23b2, - 0x420ca272, - 0x420d228c, - 0x420da2c3, - 0x421222dd, - 0x42172379, - 0x4217a31f, - 0x421c2341, - 0x421f22fc, - 0x422123c9, - 0x4226235c, - 0x422b2462, - 0x422ba42b, - 0x422c244a, - 0x422ca405, - 0x422d23e4, + 0x4071aa5e, + 0x40722a71, + 0x4072aa8a, + 0x40732aa2, + 0x407394a4, + 0x40742ab6, + 0x4074aad0, + 0x40752ae1, + 0x4075aaf5, + 0x40762b03, + 0x4076927b, + 0x40772b28, + 0x4077ab4a, + 0x40782b65, + 0x4078ab9e, + 0x40792bb5, + 0x4079abcb, + 0x407a2bd7, + 0x407aabea, + 0x407b2bff, + 0x407bac11, + 0x407c2c42, + 0x407cac4b, + 0x407d2431, + 0x407d9e9d, + 0x407e2b7a, + 0x407ea056, + 0x407f1c93, + 0x407f9a53, + 0x40801e74, + 0x40809cbb, + 0x40811f19, + 0x40819e4e, + 0x40822963, + 0x40829a39, + 0x40832031, + 0x4083a2f3, + 0x40841ccf, + 0x4084a08e, + 0x40852113, + 0x4085a229, + 0x40862185, + 0x40869eb7, + 0x408729a9, + 0x4087a25a, + 0x40881a9a, + 0x4088a3e1, + 0x40891ae9, + 0x40899a76, + 0x408a2698, + 0x408a9884, + 0x408b2c26, + 0x408ba9f9, + 0x408c2123, + 0x408c98a0, + 0x408d1cfc, + 0x41f4256a, + 0x41f925fc, + 0x41fe24ef, + 0x41fea6e4, + 0x41ff27d5, + 0x42032583, + 0x420825a5, + 0x4208a5e1, + 0x420924d3, + 0x4209a61b, + 0x420a252a, + 0x420aa50a, + 0x420b254a, + 0x420ba5c3, + 0x420c27f1, + 0x420ca6b1, + 0x420d26cb, + 0x420da702, + 0x4212271c, + 0x421727b8, + 0x4217a75e, + 0x421c2780, + 0x421f273b, + 0x42212808, + 0x4226279b, + 0x422b28d7, + 0x422ba885, + 0x422c28bf, + 0x422ca844, + 0x422d2823, + 0x422da8a4, + 0x422e286a, + 0x422ea990, 0x4432072b, 0x4432873a, 0x44330746, @@ -541,132 +588,146 @@ const uint32_t kOpenSSLReasonValues[] = { 0x44390800, 0x4439880e, 0x443a0821, - 0x4c3211c3, - 0x4c3291d3, - 0x4c3311e6, - 0x4c339206, + 0x483212a5, + 0x483292b7, + 0x483312cd, + 0x483392e6, + 0x4c32130b, + 0x4c32931b, + 0x4c33132e, + 0x4c33934e, 0x4c3400ac, 0x4c3480ea, - 0x4c351212, - 0x4c359220, - 0x4c36123c, - 0x4c36924f, - 0x4c37125e, - 0x4c37926c, - 0x4c381281, - 0x4c38928d, - 0x4c3912ad, - 0x4c3992d7, - 0x4c3a12f0, - 0x4c3a9309, + 0x4c35135a, + 0x4c359368, + 0x4c361384, + 0x4c369397, + 0x4c3713a6, + 0x4c3793b4, + 0x4c3813c9, + 0x4c3893d5, + 0x4c3913f5, + 0x4c39941f, + 0x4c3a1438, + 0x4c3a9451, 0x4c3b05fb, - 0x4c3b9322, - 0x4c3c1334, - 0x4c3c9343, - 0x4c3d135c, - 0x4c3d936b, - 0x4c3e1378, - 0x503229a2, - 0x5032a9b1, - 0x503329bc, - 0x5033a9cc, - 0x503429e5, - 0x5034a9ff, - 0x50352a0d, - 0x5035aa23, - 0x50362a35, - 0x5036aa4b, - 0x50372a64, - 0x5037aa77, - 0x50382a8f, - 0x5038aaa0, - 0x50392ab5, - 0x5039aac9, - 0x503a2ae9, - 0x503aaaff, - 0x503b2b17, - 0x503bab29, - 0x503c2b45, - 0x503cab5c, - 0x503d2b75, - 
0x503dab8b, - 0x503e2b98, - 0x503eabae, - 0x503f2bc0, + 0x4c3b946a, + 0x4c3c147c, + 0x4c3c948b, + 0x4c3d14a4, + 0x4c3d8c45, + 0x4c3e14fd, + 0x4c3e94b3, + 0x4c3f151f, + 0x4c3f927b, + 0x4c4014c9, + 0x4c4092f7, + 0x4c4114ed, + 0x50322e84, + 0x5032ae93, + 0x50332e9e, + 0x5033aeae, + 0x50342ec7, + 0x5034aee1, + 0x50352eef, + 0x5035af05, + 0x50362f17, + 0x5036af2d, + 0x50372f46, + 0x5037af59, + 0x50382f71, + 0x5038af82, + 0x50392f97, + 0x5039afab, + 0x503a2fcb, + 0x503aafe1, + 0x503b2ff9, + 0x503bb00b, + 0x503c3027, + 0x503cb03e, + 0x503d3057, + 0x503db06d, + 0x503e307a, + 0x503eb090, + 0x503f30a2, 0x503f8382, - 0x50402bd3, - 0x5040abe3, - 0x50412bfd, - 0x5041ac0c, - 0x50422c26, - 0x5042ac43, - 0x50432c53, - 0x5043ac63, - 0x50442c72, + 0x504030b5, + 0x5040b0c5, + 0x504130df, + 0x5041b0ee, + 0x50423108, + 0x5042b125, + 0x50433135, + 0x5043b145, + 0x50443154, 0x5044843f, - 0x50452c86, - 0x5045aca4, - 0x50462cb7, - 0x5046accd, - 0x50472cdf, - 0x5047acf4, - 0x50482d1a, - 0x5048ad28, - 0x50492d3b, - 0x5049ad50, - 0x504a2d66, - 0x504aad76, - 0x504b2d96, - 0x504bada9, - 0x504c2dcc, - 0x504cadfa, - 0x504d2e0c, - 0x504dae29, - 0x504e2e44, - 0x504eae60, - 0x504f2e72, - 0x504fae89, - 0x50502e98, + 0x50453168, + 0x5045b186, + 0x50463199, + 0x5046b1af, + 0x504731c1, + 0x5047b1d6, + 0x504831fc, + 0x5048b20a, + 0x5049321d, + 0x5049b232, + 0x504a3248, + 0x504ab258, + 0x504b3278, + 0x504bb28b, + 0x504c32ae, + 0x504cb2dc, + 0x504d32ee, + 0x504db30b, + 0x504e3326, + 0x504eb342, + 0x504f3354, + 0x504fb36b, + 0x5050337a, 0x505086ef, - 0x50512eab, - 0x58320eb8, - 0x68320e7a, - 0x68328c25, - 0x68330c38, - 0x68338e88, - 0x68340e98, + 0x5051338d, + 0x58320f3a, + 0x68320efc, + 0x68328c6a, + 0x68330c7d, + 0x68338f0a, + 0x68340f1a, 0x683480ea, - 0x6c320e56, - 0x6c328bfc, - 0x6c330e61, - 0x74320a0b, - 0x78320970, - 0x78328985, - 0x78330991, + 0x6c320ed8, + 0x6c328c34, + 0x6c330ee3, + 0x74320a19, + 0x743280ac, + 0x74330c45, + 0x7832097e, + 0x78328993, + 0x7833099f, 0x78338083, - 0x783409a0, - 0x783489b5, - 0x783509d4, - 0x783589f6, - 0x78360a0b, - 0x78368a21, - 0x78370a31, - 0x78378a44, - 0x78380a57, - 0x78388a69, - 0x78390a76, - 0x78398a95, - 0x783a0aaa, - 0x783a8ab8, - 0x783b0ac2, - 0x783b8ad6, - 0x783c0aed, - 0x783c8b02, - 0x783d0b19, - 0x783d8b2e, - 0x783e0a84, - 0x7c3210c5, + 0x783409ae, + 0x783489c3, + 0x783509e2, + 0x78358a04, + 0x78360a19, + 0x78368a2f, + 0x78370a3f, + 0x78378a60, + 0x78380a73, + 0x78388a85, + 0x78390a92, + 0x78398ab1, + 0x783a0ac6, + 0x783a8ad4, + 0x783b0ade, + 0x783b8af2, + 0x783c0b09, + 0x783c8b1e, + 0x783d0b35, + 0x783d8b4a, + 0x783e0aa0, + 0x783e8a52, + 0x7c321194, }; +// clang-format on const size_t kOpenSSLReasonValuesLen = sizeof(kOpenSSLReasonValues) / sizeof(kOpenSSLReasonValues[0]); @@ -789,6 +850,7 @@ const char kOpenSSLReasonStringData[] = "DIV_BY_ZERO\0" "EXPAND_ON_STATIC_BIGNUM_DATA\0" "INPUT_NOT_REDUCED\0" + "INVALID_INPUT\0" "INVALID_RANGE\0" "NEGATIVE_NUMBER\0" "NOT_A_SQUARE\0" @@ -808,6 +870,7 @@ const char kOpenSSLReasonStringData[] = "INPUT_NOT_INITIALIZED\0" "INVALID_AD_SIZE\0" "INVALID_KEY_LENGTH\0" + "INVALID_NONCE\0" "INVALID_NONCE_SIZE\0" "INVALID_OPERATION\0" "IV_TOO_LARGE\0" @@ -827,11 +890,13 @@ const char kOpenSSLReasonStringData[] = "MISSING_EQUAL_SIGN\0" "NO_CLOSE_BRACE\0" "UNABLE_TO_CREATE_NEW_SECTION\0" + "VARIABLE_EXPANSION_TOO_LONG\0" "VARIABLE_HAS_NO_VALUE\0" "BAD_GENERATOR\0" "INVALID_PUBKEY\0" "MODULUS_TOO_LARGE\0" "NO_PRIVATE_VALUE\0" + "UNKNOWN_HASH\0" "BAD_Q_VALUE\0" "BAD_VERSION\0" "MISSING_PARAMETERS\0" @@ -844,6 +909,7 @@ const char 
kOpenSSLReasonStringData[] = "GROUP_MISMATCH\0" "I2D_ECPKPARAMETERS_FAILURE\0" "INCOMPATIBLE_OBJECTS\0" + "INVALID_COFACTOR\0" "INVALID_COMPRESSED_POINT\0" "INVALID_COMPRESSION_BIT\0" "INVALID_ENCODING\0" @@ -851,11 +917,13 @@ const char kOpenSSLReasonStringData[] = "INVALID_FORM\0" "INVALID_GROUP_ORDER\0" "INVALID_PRIVATE_KEY\0" + "INVALID_SCALAR\0" "MISSING_PRIVATE_KEY\0" "NON_NAMED_CURVE\0" "PKPARAMETERS2GROUP_FAILURE\0" "POINT_AT_INFINITY\0" "POINT_IS_NOT_ON_CURVE\0" + "PUBLIC_KEY_VALIDATION_FAILED\0" "SLOT_FULL\0" "UNDEFINED_GENERATOR\0" "UNKNOWN_GROUP\0" @@ -880,8 +948,12 @@ const char kOpenSSLReasonStringData[] = "INVALID_KEYBITS\0" "INVALID_MGF1_MD\0" "INVALID_PADDING_MODE\0" + "INVALID_PARAMETERS\0" "INVALID_PSS_SALTLEN\0" + "INVALID_SIGNATURE\0" "KEYS_NOT_SET\0" + "MEMORY_LIMIT_EXCEEDED\0" + "NOT_A_PRIVATE_KEY\0" "NO_DEFAULT_DIGEST\0" "NO_KEY_SET\0" "NO_MDC2_SUPPORT\0" @@ -893,6 +965,7 @@ const char kOpenSSLReasonStringData[] = "UNKNOWN_PUBLIC_KEY_TYPE\0" "UNSUPPORTED_ALGORITHM\0" "OUTPUT_TOO_LARGE\0" + "INVALID_OID_STRING\0" "UNKNOWN_NID\0" "BAD_BASE64_DECODE\0" "BAD_END_LINE\0" @@ -908,6 +981,11 @@ const char kOpenSSLReasonStringData[] = "SHORT_HEADER\0" "UNSUPPORTED_CIPHER\0" "UNSUPPORTED_ENCRYPTION\0" + "BAD_PKCS7_VERSION\0" + "NOT_PKCS7_SIGNED_DATA\0" + "NO_CERTIFICATES_INCLUDED\0" + "NO_CRLS_INCLUDED\0" + "BAD_ITERATION_COUNT\0" "BAD_PKCS12_DATA\0" "BAD_PKCS12_VERSION\0" "CIPHER_HAS_NO_OBJECT_IDENTIFIER\0" @@ -928,8 +1006,11 @@ const char kOpenSSLReasonStringData[] = "UNKNOWN_CIPHER\0" "UNKNOWN_CIPHER_ALGORITHM\0" "UNKNOWN_DIGEST\0" - "UNKNOWN_HASH\0" + "UNSUPPORTED_KEYLENGTH\0" + "UNSUPPORTED_KEY_DERIVATION_FUNCTION\0" + "UNSUPPORTED_PRF\0" "UNSUPPORTED_PRIVATE_KEY_ALGORITHM\0" + "UNSUPPORTED_SALT_TYPE\0" "BAD_E_VALUE\0" "BAD_FIXED_HEADER_DECRYPT\0" "BAD_PAD_BYTE_COUNT\0" @@ -969,6 +1050,8 @@ const char kOpenSSLReasonStringData[] = "UNKNOWN_PADDING_TYPE\0" "VALUE_MISSING\0" "WRONG_SIGNATURE_LENGTH\0" + "ALPN_MISMATCH_ON_EARLY_DATA\0" + "APPLICATION_DATA_INSTEAD_OF_HANDSHAKE\0" "APP_DATA_IN_HANDSHAKE\0" "ATTEMPT_TO_REUSE_SESSION_IN_DIFFERENT_CONTEXT\0" "BAD_ALERT\0" @@ -988,9 +1071,14 @@ const char kOpenSSLReasonStringData[] = "BAD_SSL_FILETYPE\0" "BAD_WRITE_RETRY\0" "BIO_NOT_SET\0" + "BLOCK_CIPHER_PAD_IS_WRONG\0" + "BUFFERED_MESSAGES_ON_CIPHER_CHANGE\0" + "CANNOT_HAVE_BOTH_PRIVKEY_AND_METHOD\0" + "CANNOT_PARSE_LEAF_CERT\0" "CA_DN_LENGTH_MISMATCH\0" "CA_DN_TOO_LONG\0" "CCS_RECEIVED_EARLY\0" + "CERTIFICATE_AND_PRIVATE_KEY_MISMATCH\0" "CERTIFICATE_VERIFY_FAILED\0" "CERT_CB_ERROR\0" "CERT_LENGTH_MISMATCH\0" @@ -1008,8 +1096,12 @@ const char kOpenSSLReasonStringData[] = "DH_PUBLIC_VALUE_LENGTH_IS_WRONG\0" "DH_P_TOO_LONG\0" "DIGEST_CHECK_FAILED\0" + "DOWNGRADE_DETECTED\0" "DTLS_MESSAGE_TOO_BIG\0" + "DUPLICATE_EXTENSION\0" + "DUPLICATE_KEY_SHARE\0" "ECC_CERT_NOT_FOR_SIGNING\0" + "EMPTY_HELLO_RETRY_REQUEST\0" "EMS_STATE_INCONSISTENT\0" "ENCRYPTED_LENGTH_TOO_LONG\0" "ERROR_ADDING_EXTENSION\0" @@ -1023,13 +1115,17 @@ const char kOpenSSLReasonStringData[] = "HTTPS_PROXY_REQUEST\0" "HTTP_REQUEST\0" "INAPPROPRIATE_FALLBACK\0" + "INVALID_ALPN_PROTOCOL\0" "INVALID_COMMAND\0" + "INVALID_COMPRESSION_LIST\0" "INVALID_MESSAGE\0" + "INVALID_OUTER_RECORD_TYPE\0" + "INVALID_SCT_LIST\0" "INVALID_SSL_SESSION\0" "INVALID_TICKET_KEYS_LENGTH\0" "LENGTH_MISMATCH\0" - "LIBRARY_HAS_NO_CIPHERS\0" "MISSING_EXTENSION\0" + "MISSING_KEY_SHARE\0" "MISSING_RSA_CERTIFICATE\0" "MISSING_TMP_DH_KEY\0" "MISSING_TMP_ECDH_KEY\0" @@ -1042,29 +1138,38 @@ const char kOpenSSLReasonStringData[] = "NO_CERTIFICATE_SET\0" 
"NO_CIPHERS_AVAILABLE\0" "NO_CIPHERS_PASSED\0" + "NO_CIPHERS_SPECIFIED\0" "NO_CIPHER_MATCH\0" + "NO_COMMON_SIGNATURE_ALGORITHMS\0" "NO_COMPRESSION_SPECIFIED\0" + "NO_GROUPS_SPECIFIED\0" "NO_METHOD_SPECIFIED\0" "NO_P256_SUPPORT\0" "NO_PRIVATE_KEY_ASSIGNED\0" "NO_RENEGOTIATION\0" "NO_REQUIRED_DIGEST\0" "NO_SHARED_CIPHER\0" + "NO_SHARED_GROUP\0" + "NO_SUPPORTED_VERSIONS_ENABLED\0" "NULL_SSL_CTX\0" "NULL_SSL_METHOD_PASSED\0" "OLD_SESSION_CIPHER_NOT_RETURNED\0" + "OLD_SESSION_PRF_HASH_MISMATCH\0" "OLD_SESSION_VERSION_NOT_RETURNED\0" "PARSE_TLSEXT\0" "PATH_TOO_LONG\0" "PEER_DID_NOT_RETURN_A_CERTIFICATE\0" "PEER_ERROR_UNSUPPORTED_CERTIFICATE_TYPE\0" + "PRE_SHARED_KEY_MUST_BE_LAST\0" "PROTOCOL_IS_SHUTDOWN\0" + "PSK_IDENTITY_BINDER_COUNT_MISMATCH\0" "PSK_IDENTITY_NOT_FOUND\0" "PSK_NO_CLIENT_CB\0" "PSK_NO_SERVER_CB\0" "READ_TIMEOUT_EXPIRED\0" "RECORD_LENGTH_MISMATCH\0" "RECORD_TOO_LARGE\0" + "RENEGOTIATION_EMS_MISMATCH\0" "RENEGOTIATION_ENCODING_ERR\0" "RENEGOTIATION_MISMATCH\0" "REQUIRED_CIPHER_MISSING\0" @@ -1072,6 +1177,7 @@ const char kOpenSSLReasonStringData[] = "RESUMED_NON_EMS_SESSION_WITH_EMS_EXTENSION\0" "SCSV_RECEIVED_WHEN_RENEGOTIATING\0" "SERVERHELLO_TLSEXT\0" + "SERVER_CERT_CHANGED\0" "SESSION_ID_CONTEXT_UNINITIALIZED\0" "SESSION_MAY_NOT_BE_CREATED\0" "SHUTDOWN_WHILE_IN_INIT\0" @@ -1094,6 +1200,7 @@ const char kOpenSSLReasonStringData[] = "SSL_CTX_HAS_NO_DEFAULT_SSL_VERSION\0" "SSL_HANDSHAKE_FAILURE\0" "SSL_SESSION_ID_CONTEXT_TOO_LONG\0" + "TICKET_ENCRYPTION_FAILED\0" "TLSV1_ALERT_ACCESS_DENIED\0" "TLSV1_ALERT_DECODE_ERROR\0" "TLSV1_ALERT_DECRYPTION_FAILED\0" @@ -1109,15 +1216,21 @@ const char kOpenSSLReasonStringData[] = "TLSV1_ALERT_USER_CANCELLED\0" "TLSV1_BAD_CERTIFICATE_HASH_VALUE\0" "TLSV1_BAD_CERTIFICATE_STATUS_RESPONSE\0" + "TLSV1_CERTIFICATE_REQUIRED\0" "TLSV1_CERTIFICATE_UNOBTAINABLE\0" + "TLSV1_UNKNOWN_PSK_IDENTITY\0" "TLSV1_UNRECOGNIZED_NAME\0" "TLSV1_UNSUPPORTED_EXTENSION\0" "TLS_PEER_DID_NOT_RESPOND_WITH_CERTIFICATE_LIST\0" "TLS_RSA_ENCRYPTED_VALUE_LENGTH_IS_WRONG\0" "TOO_MANY_EMPTY_FRAGMENTS\0" + "TOO_MANY_KEY_UPDATES\0" "TOO_MANY_WARNING_ALERTS\0" + "TOO_MUCH_READ_EARLY_DATA\0" + "TOO_MUCH_SKIPPED_EARLY_DATA\0" "UNABLE_TO_FIND_ECDH_PARAMETERS\0" "UNEXPECTED_EXTENSION\0" + "UNEXPECTED_EXTENSION_ON_EARLY_DATA\0" "UNEXPECTED_MESSAGE\0" "UNEXPECTED_OPERATOR_IN_GROUP\0" "UNEXPECTED_RECORD\0" @@ -1133,6 +1246,7 @@ const char kOpenSSLReasonStringData[] = "UNSUPPORTED_COMPRESSION_ALGORITHM\0" "UNSUPPORTED_ELLIPTIC_CURVE\0" "UNSUPPORTED_PROTOCOL\0" + "UNSUPPORTED_PROTOCOL_FOR_CUSTOM_KEY\0" "WRONG_CERTIFICATE_TYPE\0" "WRONG_CIPHER_RETURNED\0" "WRONG_CURVE\0" @@ -1140,10 +1254,10 @@ const char kOpenSSLReasonStringData[] = "WRONG_SIGNATURE_TYPE\0" "WRONG_SSL_VERSION\0" "WRONG_VERSION_NUMBER\0" + "WRONG_VERSION_ON_EARLY_DATA\0" "X509_LIB\0" "X509_VERIFICATION_SETUP_PROBLEMS\0" "AKID_MISMATCH\0" - "BAD_PKCS7_VERSION\0" "BAD_X509_FILETYPE\0" "BASE64_DECODE_ERROR\0" "CANT_CHECK_DH_KEY\0" @@ -1153,6 +1267,7 @@ const char kOpenSSLReasonStringData[] = "IDP_MISMATCH\0" "INVALID_DIRECTORY\0" "INVALID_FIELD_NAME\0" + "INVALID_PARAMETER\0" "INVALID_PSS_PARAMETERS\0" "INVALID_TRUST\0" "ISSUER_MISMATCH\0" @@ -1162,10 +1277,7 @@ const char kOpenSSLReasonStringData[] = "LOADING_DEFAULTS\0" "NAME_TOO_LONG\0" "NEWER_CRL_NOT_NEWER\0" - "NOT_PKCS7_SIGNED_DATA\0" - "NO_CERTIFICATES_INCLUDED\0" "NO_CERT_SET_FOR_US_TO_VERIFY\0" - "NO_CRLS_INCLUDED\0" "NO_CRL_NUMBER\0" "PUBLIC_KEY_DECODE_ERROR\0" "PUBLIC_KEY_ENCODE_ERROR\0" @@ -1235,4 +1347,3 @@ const char kOpenSSLReasonStringData[] = 
"UNSUPPORTED_OPTION\0" "USER_TOO_LONG\0" ""; - diff --git a/packager/third_party/boringssl/linux-aarch64/crypto/aes/aesv8-armx64.S b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/aesv8-armx64.S similarity index 93% rename from packager/third_party/boringssl/linux-aarch64/crypto/aes/aesv8-armx64.S rename to packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/aesv8-armx64.S index 3e8cb16e01..51e2464487 100644 --- a/packager/third_party/boringssl/linux-aarch64/crypto/aes/aesv8-armx64.S +++ b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/aesv8-armx64.S @@ -3,7 +3,7 @@ #if __ARM_MAX_ARCH__>=7 .text -#if !defined(__clang__) +#if !defined(__clang__) || defined(BORINGSSL_CLANG_SUPPORTS_DOT_ARCH) .arch armv8-a+crypto #endif .align 5 @@ -12,11 +12,11 @@ .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat .long 0x1b,0x1b,0x1b,0x1b -.globl aes_v8_set_encrypt_key -.hidden aes_v8_set_encrypt_key -.type aes_v8_set_encrypt_key,%function +.globl aes_hw_set_encrypt_key +.hidden aes_hw_set_encrypt_key +.type aes_hw_set_encrypt_key,%function .align 5 -aes_v8_set_encrypt_key: +aes_hw_set_encrypt_key: .Lenc_key: stp x29,x30,[sp,#-16]! add x29,sp,#0 @@ -178,13 +178,13 @@ aes_v8_set_encrypt_key: mov x0,x3 // return value ldr x29,[sp],#16 ret -.size aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key +.size aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key -.globl aes_v8_set_decrypt_key -.hidden aes_v8_set_decrypt_key -.type aes_v8_set_decrypt_key,%function +.globl aes_hw_set_decrypt_key +.hidden aes_hw_set_decrypt_key +.type aes_hw_set_decrypt_key,%function .align 5 -aes_v8_set_decrypt_key: +aes_hw_set_decrypt_key: stp x29,x30,[sp,#-16]! add x29,sp,#0 bl .Lenc_key @@ -219,12 +219,12 @@ aes_v8_set_decrypt_key: .Ldec_key_abort: ldp x29,x30,[sp],#16 ret -.size aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key -.globl aes_v8_encrypt -.hidden aes_v8_encrypt -.type aes_v8_encrypt,%function +.size aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key +.globl aes_hw_encrypt +.hidden aes_hw_encrypt +.type aes_hw_encrypt,%function .align 5 -aes_v8_encrypt: +aes_hw_encrypt: ldr w3,[x2,#240] ld1 {v0.4s},[x2],#16 ld1 {v2.16b},[x0] @@ -249,12 +249,12 @@ aes_v8_encrypt: st1 {v2.16b},[x1] ret -.size aes_v8_encrypt,.-aes_v8_encrypt -.globl aes_v8_decrypt -.hidden aes_v8_decrypt -.type aes_v8_decrypt,%function +.size aes_hw_encrypt,.-aes_hw_encrypt +.globl aes_hw_decrypt +.hidden aes_hw_decrypt +.type aes_hw_decrypt,%function .align 5 -aes_v8_decrypt: +aes_hw_decrypt: ldr w3,[x2,#240] ld1 {v0.4s},[x2],#16 ld1 {v2.16b},[x0] @@ -279,12 +279,12 @@ aes_v8_decrypt: st1 {v2.16b},[x1] ret -.size aes_v8_decrypt,.-aes_v8_decrypt -.globl aes_v8_cbc_encrypt -.hidden aes_v8_cbc_encrypt -.type aes_v8_cbc_encrypt,%function +.size aes_hw_decrypt,.-aes_hw_decrypt +.globl aes_hw_cbc_encrypt +.hidden aes_hw_cbc_encrypt +.type aes_hw_cbc_encrypt,%function .align 5 -aes_v8_cbc_encrypt: +aes_hw_cbc_encrypt: stp x29,x30,[sp,#-16]! add x29,sp,#0 subs x2,x2,#16 @@ -570,12 +570,12 @@ aes_v8_cbc_encrypt: .Lcbc_abort: ldr x29,[sp],#16 ret -.size aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt -.globl aes_v8_ctr32_encrypt_blocks -.hidden aes_v8_ctr32_encrypt_blocks -.type aes_v8_ctr32_encrypt_blocks,%function +.size aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt +.globl aes_hw_ctr32_encrypt_blocks +.hidden aes_hw_ctr32_encrypt_blocks +.type aes_hw_ctr32_encrypt_blocks,%function .align 5 -aes_v8_ctr32_encrypt_blocks: +aes_hw_ctr32_encrypt_blocks: stp x29,x30,[sp,#-16]! 
add x29,sp,#0 ldr w5,[x3,#240] @@ -752,6 +752,6 @@ aes_v8_ctr32_encrypt_blocks: .Lctr32_done: ldr x29,[sp],#16 ret -.size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks +.size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks #endif #endif diff --git a/packager/third_party/boringssl/linux-aarch64/crypto/bn/armv8-mont.S b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/armv8-mont.S similarity index 100% rename from packager/third_party/boringssl/linux-aarch64/crypto/bn/armv8-mont.S rename to packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/armv8-mont.S diff --git a/packager/third_party/boringssl/linux-aarch64/crypto/modes/ghashv8-armx64.S b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S similarity index 98% rename from packager/third_party/boringssl/linux-aarch64/crypto/modes/ghashv8-armx64.S rename to packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S index f39f3ba870..89d780ff69 100644 --- a/packager/third_party/boringssl/linux-aarch64/crypto/modes/ghashv8-armx64.S +++ b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S @@ -2,7 +2,7 @@ #include .text -#if !defined(__clang__) +#if !defined(__clang__) || defined(BORINGSSL_CLANG_SUPPORTS_DOT_ARCH) .arch armv8-a+crypto #endif .globl gcm_init_v8 diff --git a/packager/third_party/boringssl/linux-aarch64/crypto/sha/sha1-armv8.S b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha1-armv8.S similarity index 99% rename from packager/third_party/boringssl/linux-aarch64/crypto/sha/sha1-armv8.S rename to packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha1-armv8.S index cfb4aa021f..ff361f454a 100644 --- a/packager/third_party/boringssl/linux-aarch64/crypto/sha/sha1-armv8.S +++ b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha1-armv8.S @@ -9,7 +9,11 @@ .type sha1_block_data_order,%function .align 6 sha1_block_data_order: +#ifdef __ILP32__ + ldrsw x16,.LOPENSSL_armcap_P +#else ldr x16,.LOPENSSL_armcap_P +#endif adr x17,.LOPENSSL_armcap_P add x16,x16,x17 ldr w16,[x16] @@ -1208,7 +1212,11 @@ sha1_block_armv8: .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79 .LOPENSSL_armcap_P: +#ifdef __ILP32__ +.long OPENSSL_armcap_P-. +#else .quad OPENSSL_armcap_P-. +#endif .byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 diff --git a/packager/third_party/boringssl/linux-aarch64/crypto/sha/sha256-armv8.S b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha256-armv8.S similarity index 94% rename from packager/third_party/boringssl/linux-aarch64/crypto/sha/sha256-armv8.S rename to packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha256-armv8.S index bfc552cbf1..19db33937e 100644 --- a/packager/third_party/boringssl/linux-aarch64/crypto/sha/sha256-armv8.S +++ b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha256-armv8.S @@ -1,5 +1,46 @@ #if defined(__aarch64__) -#include +// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the OpenSSL license (the "License"). You may not use +// this file except in compliance with the License. 
You can obtain a copy +// in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html + +// ==================================================================== +// Written by Andy Polyakov for the OpenSSL +// project. The module is, however, dual licensed under OpenSSL and +// CRYPTOGAMS licenses depending on where you obtain it. For further +// details see http://www.openssl.org/~appro/cryptogams/. +// +// Permission to use under GPLv2 terms is granted. +// ==================================================================== +// +// SHA256/512 for ARMv8. +// +// Performance in cycles per processed byte and improvement coefficient +// over code generated with "default" compiler: +// +// SHA256-hw SHA256(*) SHA512 +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +// Denver 2.01 10.5 (+26%) 6.70 (+8%) +// X-Gene 20.0 (+100%) 12.8 (+300%(***)) +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +// +// (*) Software SHA256 results are of lesser relevance, presented +// mostly for informational purposes. +// (**) The result is a trade-off: it's possible to improve it by +// 10% (or by 1 cycle per round), but at the cost of 20% loss +// on Cortex-A53 (or by 4 cycles per round). +// (***) Super-impressive coefficients over gcc-generated code are +// indication of some compiler "pathology", most notably code +// generated with -mgeneral-regs-only is significanty faster +// and the gap is only 40-90%. + +#ifndef __KERNEL__ +# include +#endif .text @@ -9,12 +50,18 @@ .type sha256_block_data_order,%function .align 6 sha256_block_data_order: +#ifndef __KERNEL__ +# ifdef __ILP32__ + ldrsw x16,.LOPENSSL_armcap_P +# else ldr x16,.LOPENSSL_armcap_P +# endif adr x17,.LOPENSSL_armcap_P add x16,x16,x17 ldr w16,[x16] tst w16,#ARMV8_SHA256 b.ne .Lv8_entry +#endif stp x29,x30,[sp,#-128]! add x29,sp,#0 @@ -998,12 +1045,19 @@ sha256_block_data_order: .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0 //terminator .size .LK256,.-.LK256 +#ifndef __KERNEL__ .align 3 .LOPENSSL_armcap_P: +# ifdef __ILP32__ +.long OPENSSL_armcap_P-. +# else .quad OPENSSL_armcap_P-. +# endif +#endif .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 +#ifndef __KERNEL__ .type sha256_block_armv8,%function .align 6 sha256_block_armv8: @@ -1142,5 +1196,8 @@ sha256_block_armv8: ldr x29,[sp],#16 ret .size sha256_block_armv8,.-sha256_block_armv8 +#endif +#ifndef __KERNEL__ .comm OPENSSL_armcap_P,4,4 #endif +#endif diff --git a/packager/third_party/boringssl/linux-aarch64/crypto/sha/sha512-armv8.S b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha512-armv8.S similarity index 93% rename from packager/third_party/boringssl/linux-aarch64/crypto/sha/sha512-armv8.S rename to packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha512-armv8.S index 4645722923..bb052b7551 100644 --- a/packager/third_party/boringssl/linux-aarch64/crypto/sha/sha512-armv8.S +++ b/packager/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha512-armv8.S @@ -1,5 +1,46 @@ #if defined(__aarch64__) -#include +// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the OpenSSL license (the "License"). You may not use +// this file except in compliance with the License. 
You can obtain a copy +// in the file LICENSE in the source distribution or at +// https://www.openssl.org/source/license.html + +// ==================================================================== +// Written by Andy Polyakov for the OpenSSL +// project. The module is, however, dual licensed under OpenSSL and +// CRYPTOGAMS licenses depending on where you obtain it. For further +// details see http://www.openssl.org/~appro/cryptogams/. +// +// Permission to use under GPLv2 terms is granted. +// ==================================================================== +// +// SHA256/512 for ARMv8. +// +// Performance in cycles per processed byte and improvement coefficient +// over code generated with "default" compiler: +// +// SHA256-hw SHA256(*) SHA512 +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +// Denver 2.01 10.5 (+26%) 6.70 (+8%) +// X-Gene 20.0 (+100%) 12.8 (+300%(***)) +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +// +// (*) Software SHA256 results are of lesser relevance, presented +// mostly for informational purposes. +// (**) The result is a trade-off: it's possible to improve it by +// 10% (or by 1 cycle per round), but at the cost of 20% loss +// on Cortex-A53 (or by 4 cycles per round). +// (***) Super-impressive coefficients over gcc-generated code are +// indication of some compiler "pathology", most notably code +// generated with -mgeneral-regs-only is significanty faster +// and the gap is only 40-90%. + +#ifndef __KERNEL__ +# include +#endif .text @@ -1016,11 +1057,19 @@ sha512_block_data_order: .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .quad 0 // terminator .size .LK512,.-.LK512 +#ifndef __KERNEL__ .align 3 .LOPENSSL_armcap_P: +# ifdef __ILP32__ +.long OPENSSL_armcap_P-. +# else .quad OPENSSL_armcap_P-. +# endif +#endif .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 +#ifndef __KERNEL__ .comm OPENSSL_armcap_P,4,4 #endif +#endif diff --git a/packager/third_party/boringssl/linux-arm/crypto/bn/armv4-mont.S b/packager/third_party/boringssl/linux-arm/crypto/bn/armv4-mont.S deleted file mode 100644 index e59599f832..0000000000 --- a/packager/third_party/boringssl/linux-arm/crypto/bn/armv4-mont.S +++ /dev/null @@ -1,589 +0,0 @@ -#if defined(__arm__) -#include - -.text -.code 32 - -#if __ARM_MAX_ARCH__>=7 -.align 5 -.LOPENSSL_armcap: -.word OPENSSL_armcap_P-.Lbn_mul_mont -#endif - -.globl bn_mul_mont -.hidden bn_mul_mont -.type bn_mul_mont,%function - -.align 5 -bn_mul_mont: -.Lbn_mul_mont: - ldr ip,[sp,#4] @ load num - stmdb sp!,{r0,r2} @ sp points at argument block -#if __ARM_MAX_ARCH__>=7 - tst ip,#7 - bne .Lialu - adr r0,bn_mul_mont - ldr r2,.LOPENSSL_armcap - ldr r0,[r0,r2] -#ifdef __APPLE__ - ldr r0,[r0] -#endif - tst r0,#ARMV7_NEON @ NEON available? 
- ldmia sp, {r0,r2} - beq .Lialu - add sp,sp,#8 - b bn_mul8x_mont_neon -.align 4 -.Lialu: -#endif - cmp ip,#2 - mov r0,ip @ load num - movlt r0,#0 - addlt sp,sp,#2*4 - blt .Labrt - - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ save 10 registers - - mov r0,r0,lsl#2 @ rescale r0 for byte count - sub sp,sp,r0 @ alloca(4*num) - sub sp,sp,#4 @ +extra dword - sub r0,r0,#4 @ "num=num-1" - add r4,r2,r0 @ &bp[num-1] - - add r0,sp,r0 @ r0 to point at &tp[num-1] - ldr r8,[r0,#14*4] @ &n0 - ldr r2,[r2] @ bp[0] - ldr r5,[r1],#4 @ ap[0],ap++ - ldr r6,[r3],#4 @ np[0],np++ - ldr r8,[r8] @ *n0 - str r4,[r0,#15*4] @ save &bp[num] - - umull r10,r11,r5,r2 @ ap[0]*bp[0] - str r8,[r0,#14*4] @ save n0 value - mul r8,r10,r8 @ "tp[0]"*n0 - mov r12,#0 - umlal r10,r12,r6,r8 @ np[0]*n0+"t[0]" - mov r4,sp - -.L1st: - ldr r5,[r1],#4 @ ap[j],ap++ - mov r10,r11 - ldr r6,[r3],#4 @ np[j],np++ - mov r11,#0 - umlal r10,r11,r5,r2 @ ap[j]*bp[0] - mov r14,#0 - umlal r12,r14,r6,r8 @ np[j]*n0 - adds r12,r12,r10 - str r12,[r4],#4 @ tp[j-1]=,tp++ - adc r12,r14,#0 - cmp r4,r0 - bne .L1st - - adds r12,r12,r11 - ldr r4,[r0,#13*4] @ restore bp - mov r14,#0 - ldr r8,[r0,#14*4] @ restore n0 - adc r14,r14,#0 - str r12,[r0] @ tp[num-1]= - str r14,[r0,#4] @ tp[num]= - -.Louter: - sub r7,r0,sp @ "original" r0-1 value - sub r1,r1,r7 @ "rewind" ap to &ap[1] - ldr r2,[r4,#4]! @ *(++bp) - sub r3,r3,r7 @ "rewind" np to &np[1] - ldr r5,[r1,#-4] @ ap[0] - ldr r10,[sp] @ tp[0] - ldr r6,[r3,#-4] @ np[0] - ldr r7,[sp,#4] @ tp[1] - - mov r11,#0 - umlal r10,r11,r5,r2 @ ap[0]*bp[i]+tp[0] - str r4,[r0,#13*4] @ save bp - mul r8,r10,r8 - mov r12,#0 - umlal r10,r12,r6,r8 @ np[0]*n0+"tp[0]" - mov r4,sp - -.Linner: - ldr r5,[r1],#4 @ ap[j],ap++ - adds r10,r11,r7 @ +=tp[j] - ldr r6,[r3],#4 @ np[j],np++ - mov r11,#0 - umlal r10,r11,r5,r2 @ ap[j]*bp[i] - mov r14,#0 - umlal r12,r14,r6,r8 @ np[j]*n0 - adc r11,r11,#0 - ldr r7,[r4,#8] @ tp[j+1] - adds r12,r12,r10 - str r12,[r4],#4 @ tp[j-1]=,tp++ - adc r12,r14,#0 - cmp r4,r0 - bne .Linner - - adds r12,r12,r11 - mov r14,#0 - ldr r4,[r0,#13*4] @ restore bp - adc r14,r14,#0 - ldr r8,[r0,#14*4] @ restore n0 - adds r12,r12,r7 - ldr r7,[r0,#15*4] @ restore &bp[num] - adc r14,r14,#0 - str r12,[r0] @ tp[num-1]= - str r14,[r0,#4] @ tp[num]= - - cmp r4,r7 - bne .Louter - - ldr r2,[r0,#12*4] @ pull rp - add r0,r0,#4 @ r0 to point at &tp[num] - sub r5,r0,sp @ "original" num value - mov r4,sp @ "rewind" r4 - mov r1,r4 @ "borrow" r1 - sub r3,r3,r5 @ "rewind" r3 to &np[0] - - subs r7,r7,r7 @ "clear" carry flag -.Lsub: ldr r7,[r4],#4 - ldr r6,[r3],#4 - sbcs r7,r7,r6 @ tp[j]-np[j] - str r7,[r2],#4 @ rp[j]= - teq r4,r0 @ preserve carry - bne .Lsub - sbcs r14,r14,#0 @ upmost carry - mov r4,sp @ "rewind" r4 - sub r2,r2,r5 @ "rewind" r2 - - and r1,r4,r14 - bic r3,r2,r14 - orr r1,r1,r3 @ ap=borrow?tp:rp - -.Lcopy: ldr r7,[r1],#4 @ copy or in-place refresh - str sp,[r4],#4 @ zap tp - str r7,[r2],#4 - cmp r4,r0 - bne .Lcopy - - add sp,r0,#4 @ skip over tp[num+1] - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ restore registers - add sp,sp,#2*4 @ skip over {r0,r2} - mov r0,#1 -.Labrt: -#if __ARM_ARCH__>=5 - bx lr @ .word 0xe12fff1e -#else - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet -.word 0xe12fff1e @ interoperable with Thumb ISA:-) -#endif -.size bn_mul_mont,.-bn_mul_mont -#if __ARM_MAX_ARCH__>=7 -.arch armv7-a -.fpu neon - -.type bn_mul8x_mont_neon,%function -.align 5 -bn_mul8x_mont_neon: - mov ip,sp - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} - vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so - 
ldmia ip,{r4,r5} @ load rest of parameter block - - sub r7,sp,#16 - vld1.32 {d28[0]}, [r2,:32]! - sub r7,r7,r5,lsl#4 - vld1.32 {d0,d1,d2,d3}, [r1]! @ can't specify :32 :-( - and r7,r7,#-64 - vld1.32 {d30[0]}, [r4,:32] - mov sp,r7 @ alloca - veor d8,d8,d8 - subs r8,r5,#8 - vzip.16 d28,d8 - - vmull.u32 q6,d28,d0[0] - vmull.u32 q7,d28,d0[1] - vmull.u32 q8,d28,d1[0] - vshl.i64 d10,d13,#16 - vmull.u32 q9,d28,d1[1] - - vadd.u64 d10,d10,d12 - veor d8,d8,d8 - vmul.u32 d29,d10,d30 - - vmull.u32 q10,d28,d2[0] - vld1.32 {d4,d5,d6,d7}, [r3]! - vmull.u32 q11,d28,d2[1] - vmull.u32 q12,d28,d3[0] - vzip.16 d29,d8 - vmull.u32 q13,d28,d3[1] - - bne .LNEON_1st - - @ special case for num=8, everything is in register bank... - - vmlal.u32 q6,d29,d4[0] - sub r9,r5,#1 - vmlal.u32 q7,d29,d4[1] - vmlal.u32 q8,d29,d5[0] - vmlal.u32 q9,d29,d5[1] - - vmlal.u32 q10,d29,d6[0] - vmov q5,q6 - vmlal.u32 q11,d29,d6[1] - vmov q6,q7 - vmlal.u32 q12,d29,d7[0] - vmov q7,q8 - vmlal.u32 q13,d29,d7[1] - vmov q8,q9 - vmov q9,q10 - vshr.u64 d10,d10,#16 - vmov q10,q11 - vmov q11,q12 - vadd.u64 d10,d10,d11 - vmov q12,q13 - veor q13,q13 - vshr.u64 d10,d10,#16 - - b .LNEON_outer8 - -.align 4 -.LNEON_outer8: - vld1.32 {d28[0]}, [r2,:32]! - veor d8,d8,d8 - vzip.16 d28,d8 - vadd.u64 d12,d12,d10 - - vmlal.u32 q6,d28,d0[0] - vmlal.u32 q7,d28,d0[1] - vmlal.u32 q8,d28,d1[0] - vshl.i64 d10,d13,#16 - vmlal.u32 q9,d28,d1[1] - - vadd.u64 d10,d10,d12 - veor d8,d8,d8 - subs r9,r9,#1 - vmul.u32 d29,d10,d30 - - vmlal.u32 q10,d28,d2[0] - vmlal.u32 q11,d28,d2[1] - vmlal.u32 q12,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q13,d28,d3[1] - - vmlal.u32 q6,d29,d4[0] - vmlal.u32 q7,d29,d4[1] - vmlal.u32 q8,d29,d5[0] - vmlal.u32 q9,d29,d5[1] - - vmlal.u32 q10,d29,d6[0] - vmov q5,q6 - vmlal.u32 q11,d29,d6[1] - vmov q6,q7 - vmlal.u32 q12,d29,d7[0] - vmov q7,q8 - vmlal.u32 q13,d29,d7[1] - vmov q8,q9 - vmov q9,q10 - vshr.u64 d10,d10,#16 - vmov q10,q11 - vmov q11,q12 - vadd.u64 d10,d10,d11 - vmov q12,q13 - veor q13,q13 - vshr.u64 d10,d10,#16 - - bne .LNEON_outer8 - - vadd.u64 d12,d12,d10 - mov r7,sp - vshr.u64 d10,d12,#16 - mov r8,r5 - vadd.u64 d13,d13,d10 - add r6,sp,#16 - vshr.u64 d10,d13,#16 - vzip.16 d12,d13 - - b .LNEON_tail2 - -.align 4 -.LNEON_1st: - vmlal.u32 q6,d29,d4[0] - vld1.32 {d0,d1,d2,d3}, [r1]! - vmlal.u32 q7,d29,d4[1] - subs r8,r8,#8 - vmlal.u32 q8,d29,d5[0] - vmlal.u32 q9,d29,d5[1] - - vmlal.u32 q10,d29,d6[0] - vld1.32 {d4,d5}, [r3]! - vmlal.u32 q11,d29,d6[1] - vst1.64 {q6,q7}, [r7,:256]! - vmlal.u32 q12,d29,d7[0] - vmlal.u32 q13,d29,d7[1] - vst1.64 {q8,q9}, [r7,:256]! - - vmull.u32 q6,d28,d0[0] - vld1.32 {d6,d7}, [r3]! - vmull.u32 q7,d28,d0[1] - vst1.64 {q10,q11}, [r7,:256]! - vmull.u32 q8,d28,d1[0] - vmull.u32 q9,d28,d1[1] - vst1.64 {q12,q13}, [r7,:256]! - - vmull.u32 q10,d28,d2[0] - vmull.u32 q11,d28,d2[1] - vmull.u32 q12,d28,d3[0] - vmull.u32 q13,d28,d3[1] - - bne .LNEON_1st - - vmlal.u32 q6,d29,d4[0] - add r6,sp,#16 - vmlal.u32 q7,d29,d4[1] - sub r1,r1,r5,lsl#2 @ rewind r1 - vmlal.u32 q8,d29,d5[0] - vld1.64 {q5}, [sp,:128] - vmlal.u32 q9,d29,d5[1] - sub r9,r5,#1 - - vmlal.u32 q10,d29,d6[0] - vst1.64 {q6,q7}, [r7,:256]! - vmlal.u32 q11,d29,d6[1] - vshr.u64 d10,d10,#16 - vld1.64 {q6}, [r6, :128]! - vmlal.u32 q12,d29,d7[0] - vst1.64 {q8,q9}, [r7,:256]! - vmlal.u32 q13,d29,d7[1] - - vst1.64 {q10,q11}, [r7,:256]! - vadd.u64 d10,d10,d11 - veor q4,q4,q4 - vst1.64 {q12,q13}, [r7,:256]! - vld1.64 {q7,q8}, [r6, :256]! - vst1.64 {q4}, [r7,:128] - vshr.u64 d10,d10,#16 - - b .LNEON_outer - -.align 4 -.LNEON_outer: - vld1.32 {d28[0]}, [r2,:32]! 
- sub r3,r3,r5,lsl#2 @ rewind r3 - vld1.32 {d0,d1,d2,d3}, [r1]! - veor d8,d8,d8 - mov r7,sp - vzip.16 d28,d8 - sub r8,r5,#8 - vadd.u64 d12,d12,d10 - - vmlal.u32 q6,d28,d0[0] - vld1.64 {q9,q10},[r6,:256]! - vmlal.u32 q7,d28,d0[1] - vmlal.u32 q8,d28,d1[0] - vld1.64 {q11,q12},[r6,:256]! - vmlal.u32 q9,d28,d1[1] - - vshl.i64 d10,d13,#16 - veor d8,d8,d8 - vadd.u64 d10,d10,d12 - vld1.64 {q13},[r6,:128]! - vmul.u32 d29,d10,d30 - - vmlal.u32 q10,d28,d2[0] - vld1.32 {d4,d5,d6,d7}, [r3]! - vmlal.u32 q11,d28,d2[1] - vmlal.u32 q12,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q13,d28,d3[1] - -.LNEON_inner: - vmlal.u32 q6,d29,d4[0] - vld1.32 {d0,d1,d2,d3}, [r1]! - vmlal.u32 q7,d29,d4[1] - subs r8,r8,#8 - vmlal.u32 q8,d29,d5[0] - vmlal.u32 q9,d29,d5[1] - vst1.64 {q6,q7}, [r7,:256]! - - vmlal.u32 q10,d29,d6[0] - vld1.64 {q6}, [r6, :128]! - vmlal.u32 q11,d29,d6[1] - vst1.64 {q8,q9}, [r7,:256]! - vmlal.u32 q12,d29,d7[0] - vld1.64 {q7,q8}, [r6, :256]! - vmlal.u32 q13,d29,d7[1] - vst1.64 {q10,q11}, [r7,:256]! - - vmlal.u32 q6,d28,d0[0] - vld1.64 {q9,q10}, [r6, :256]! - vmlal.u32 q7,d28,d0[1] - vst1.64 {q12,q13}, [r7,:256]! - vmlal.u32 q8,d28,d1[0] - vld1.64 {q11,q12}, [r6, :256]! - vmlal.u32 q9,d28,d1[1] - vld1.32 {d4,d5,d6,d7}, [r3]! - - vmlal.u32 q10,d28,d2[0] - vld1.64 {q13}, [r6, :128]! - vmlal.u32 q11,d28,d2[1] - vmlal.u32 q12,d28,d3[0] - vmlal.u32 q13,d28,d3[1] - - bne .LNEON_inner - - vmlal.u32 q6,d29,d4[0] - add r6,sp,#16 - vmlal.u32 q7,d29,d4[1] - sub r1,r1,r5,lsl#2 @ rewind r1 - vmlal.u32 q8,d29,d5[0] - vld1.64 {q5}, [sp,:128] - vmlal.u32 q9,d29,d5[1] - subs r9,r9,#1 - - vmlal.u32 q10,d29,d6[0] - vst1.64 {q6,q7}, [r7,:256]! - vmlal.u32 q11,d29,d6[1] - vld1.64 {q6}, [r6, :128]! - vshr.u64 d10,d10,#16 - vst1.64 {q8,q9}, [r7,:256]! - vmlal.u32 q12,d29,d7[0] - vld1.64 {q7,q8}, [r6, :256]! - vmlal.u32 q13,d29,d7[1] - - vst1.64 {q10,q11}, [r7,:256]! - vadd.u64 d10,d10,d11 - vst1.64 {q12,q13}, [r7,:256]! - vshr.u64 d10,d10,#16 - - bne .LNEON_outer - - mov r7,sp - mov r8,r5 - -.LNEON_tail: - vadd.u64 d12,d12,d10 - vld1.64 {q9,q10}, [r6, :256]! - vshr.u64 d10,d12,#16 - vadd.u64 d13,d13,d10 - vld1.64 {q11,q12}, [r6, :256]! - vshr.u64 d10,d13,#16 - vld1.64 {q13}, [r6, :128]! - vzip.16 d12,d13 - -.LNEON_tail2: - vadd.u64 d14,d14,d10 - vst1.32 {d12[0]}, [r7, :32]! - vshr.u64 d10,d14,#16 - vadd.u64 d15,d15,d10 - vshr.u64 d10,d15,#16 - vzip.16 d14,d15 - - vadd.u64 d16,d16,d10 - vst1.32 {d14[0]}, [r7, :32]! - vshr.u64 d10,d16,#16 - vadd.u64 d17,d17,d10 - vshr.u64 d10,d17,#16 - vzip.16 d16,d17 - - vadd.u64 d18,d18,d10 - vst1.32 {d16[0]}, [r7, :32]! - vshr.u64 d10,d18,#16 - vadd.u64 d19,d19,d10 - vshr.u64 d10,d19,#16 - vzip.16 d18,d19 - - vadd.u64 d20,d20,d10 - vst1.32 {d18[0]}, [r7, :32]! - vshr.u64 d10,d20,#16 - vadd.u64 d21,d21,d10 - vshr.u64 d10,d21,#16 - vzip.16 d20,d21 - - vadd.u64 d22,d22,d10 - vst1.32 {d20[0]}, [r7, :32]! - vshr.u64 d10,d22,#16 - vadd.u64 d23,d23,d10 - vshr.u64 d10,d23,#16 - vzip.16 d22,d23 - - vadd.u64 d24,d24,d10 - vst1.32 {d22[0]}, [r7, :32]! - vshr.u64 d10,d24,#16 - vadd.u64 d25,d25,d10 - vld1.64 {q6}, [r6, :128]! - vshr.u64 d10,d25,#16 - vzip.16 d24,d25 - - vadd.u64 d26,d26,d10 - vst1.32 {d24[0]}, [r7, :32]! - vshr.u64 d10,d26,#16 - vadd.u64 d27,d27,d10 - vld1.64 {q7,q8}, [r6, :256]! - vshr.u64 d10,d27,#16 - vzip.16 d26,d27 - subs r8,r8,#8 - vst1.32 {d26[0]}, [r7, :32]! 
- - bne .LNEON_tail - - vst1.32 {d10[0]}, [r7, :32] @ top-most bit - sub r3,r3,r5,lsl#2 @ rewind r3 - subs r1,sp,#0 @ clear carry flag - add r2,sp,r5,lsl#2 - -.LNEON_sub: - ldmia r1!, {r4,r5,r6,r7} - ldmia r3!, {r8,r9,r10,r11} - sbcs r8, r4,r8 - sbcs r9, r5,r9 - sbcs r10,r6,r10 - sbcs r11,r7,r11 - teq r1,r2 @ preserves carry - stmia r0!, {r8,r9,r10,r11} - bne .LNEON_sub - - ldr r10, [r1] @ load top-most bit - veor q0,q0,q0 - sub r11,r2,sp @ this is num*4 - veor q1,q1,q1 - mov r1,sp - sub r0,r0,r11 @ rewind r0 - mov r3,r2 @ second 3/4th of frame - sbcs r10,r10,#0 @ result is carry flag - -.LNEON_copy_n_zap: - ldmia r1!, {r4,r5,r6,r7} - ldmia r0, {r8,r9,r10,r11} - movcc r8, r4 - vst1.64 {q0,q1}, [r3,:256]! @ wipe - movcc r9, r5 - movcc r10,r6 - vst1.64 {q0,q1}, [r3,:256]! @ wipe - movcc r11,r7 - ldmia r1, {r4,r5,r6,r7} - stmia r0!, {r8,r9,r10,r11} - sub r1,r1,#16 - ldmia r0, {r8,r9,r10,r11} - movcc r8, r4 - vst1.64 {q0,q1}, [r1,:256]! @ wipe - movcc r9, r5 - movcc r10,r6 - vst1.64 {q0,q1}, [r3,:256]! @ wipe - movcc r11,r7 - teq r1,r2 @ preserves carry - stmia r0!, {r8,r9,r10,r11} - bne .LNEON_copy_n_zap - - sub sp,ip,#96 - vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11} - bx lr @ .word 0xe12fff1e -.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon -#endif -.byte 77,111,110,116,103,111,109,101,114,121,32,109,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 2 -#if __ARM_MAX_ARCH__>=7 -.comm OPENSSL_armcap_P,4,4 -.hidden OPENSSL_armcap_P -#endif -#endif diff --git a/packager/third_party/boringssl/linux-arm/crypto/chacha/chacha-armv4.S b/packager/third_party/boringssl/linux-arm/crypto/chacha/chacha-armv4.S index 19a4d2c4ff..6c947734fe 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/chacha/chacha-armv4.S +++ b/packager/third_party/boringssl/linux-arm/crypto/chacha/chacha-armv4.S @@ -2,8 +2,10 @@ #include .text -#if defined(__thumb2__) +#if defined(__thumb2__) || defined(__clang__) .syntax unified +#endif +#if defined(__thumb2__) .thumb #else .code 32 @@ -1457,7 +1459,7 @@ ChaCha20_neon: ldrb r9,[r12],#1 @ read input subs r11,r11,#1 eor r8,r8,r9 - strb r8,[r14],#1 @ store ouput + strb r8,[r14],#1 @ store output bne .Loop_tail_neon .Ldone_neon: diff --git a/packager/third_party/boringssl/linux-arm/crypto/aes/aes-armv4.S b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/aes-armv4.S similarity index 98% rename from packager/third_party/boringssl/linux-arm/crypto/aes/aes-armv4.S rename to packager/third_party/boringssl/linux-arm/crypto/fipsmodule/aes-armv4.S index bc11e3f139..d401fc78f1 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/aes/aes-armv4.S +++ b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/aes-armv4.S @@ -1,4 +1,11 @@ #if defined(__arm__) +@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +@ +@ Licensed under the OpenSSL license (the "License"). You may not use +@ this file except in compliance with the License. You can obtain a copy +@ in the file LICENSE in the source distribution or at +@ https://www.openssl.org/source/license.html + @ ==================================================================== @ Written by Andy Polyakov for the OpenSSL @@ -32,7 +39,6 @@ @ Profiler-assisted and platform-specific optimization resulted in 16% @ improvement on Cortex A8 core and ~21.5 cycles per byte. 
-#if defined(__arm__) #ifndef __KERNEL__ # include #else @@ -40,15 +46,12 @@ #endif .text -#if __ARM_ARCH__<7 -.code 32 -#else +#if defined(__thumb2__) && !defined(__APPLE__) .syntax unified -# if defined(__thumb2__) && !defined(__APPLE__) .thumb -# else +#else .code 32 -# endif +#undef __thumb2__ #endif .type AES_Te,%object @@ -164,10 +167,10 @@ AES_Te: .type asm_AES_encrypt,%function .align 5 asm_AES_encrypt: -#if __ARM_ARCH__<7 +#ifndef __thumb2__ sub r3,pc,#8 @ asm_AES_encrypt #else - adr r3,asm_AES_encrypt + adr r3,. #endif stmdb sp!,{r1,r4-r12,lr} #ifdef __APPLE__ @@ -415,19 +418,19 @@ _armv4_AES_encrypt: .align 5 asm_AES_set_encrypt_key: _armv4_AES_set_encrypt_key: -#if __ARM_ARCH__<7 +#ifndef __thumb2__ sub r3,pc,#8 @ asm_AES_set_encrypt_key #else - adr r3,asm_AES_set_encrypt_key + adr r3,. #endif teq r0,#0 -#if __ARM_ARCH__>=7 +#ifdef __thumb2__ itt eq @ Thumb2 thing, sanity check in ARM #endif moveq r0,#-1 beq .Labrt teq r2,#0 -#if __ARM_ARCH__>=7 +#ifdef __thumb2__ itt eq @ Thumb2 thing, sanity check in ARM #endif moveq r0,#-1 @@ -438,7 +441,7 @@ _armv4_AES_set_encrypt_key: teq r1,#192 beq .Lok teq r1,#256 -#if __ARM_ARCH__>=7 +#ifdef __thumb2__ itt ne @ Thumb2 thing, sanity check in ARM #endif movne r0,#-1 @@ -599,7 +602,7 @@ _armv4_AES_set_encrypt_key: str r2,[r11,#-16] subs r12,r12,#1 str r3,[r11,#-12] -#if __ARM_ARCH__>=7 +#ifdef __thumb2__ itt eq @ Thumb2 thing, sanity check in ARM #endif subeq r2,r11,#216 @@ -671,7 +674,7 @@ _armv4_AES_set_encrypt_key: str r2,[r11,#-24] subs r12,r12,#1 str r3,[r11,#-20] -#if __ARM_ARCH__>=7 +#ifdef __thumb2__ itt eq @ Thumb2 thing, sanity check in ARM #endif subeq r2,r11,#256 @@ -744,7 +747,7 @@ _armv4_AES_set_enc2dec_key: ldr r12,[r0,#240] mov r7,r0 @ input add r8,r0,r12,lsl#4 - mov r11,r1 @ ouput + mov r11,r1 @ output add r10,r1,r12,lsl#4 str r12,[r1,#240] @@ -939,10 +942,10 @@ AES_Td: .type asm_AES_decrypt,%function .align 5 asm_AES_decrypt: -#if __ARM_ARCH__<7 +#ifndef __thumb2__ sub r3,pc,#8 @ asm_AES_decrypt #else - adr r3,asm_AES_decrypt + adr r3,. 
#endif stmdb sp!,{r1,r4-r12,lr} #ifdef __APPLE__ @@ -1195,6 +1198,4 @@ _armv4_AES_decrypt: .byte 65,69,83,32,102,111,114,32,65,82,77,118,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 .align 2 - -#endif #endif diff --git a/packager/third_party/boringssl/linux-arm/crypto/aes/aesv8-armx32.S b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/aesv8-armx32.S similarity index 94% rename from packager/third_party/boringssl/linux-arm/crypto/aes/aesv8-armx32.S rename to packager/third_party/boringssl/linux-arm/crypto/fipsmodule/aesv8-armx32.S index 95a2ea4dc9..7c7ef19c79 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/aes/aesv8-armx32.S +++ b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/aesv8-armx32.S @@ -3,20 +3,21 @@ #if __ARM_MAX_ARCH__>=7 .text -.arch armv7-a +.arch armv7-a @ don't confuse not-so-latest binutils with argv8 :-) .fpu neon .code 32 +#undef __thumb2__ .align 5 .Lrcon: .long 0x01,0x01,0x01,0x01 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d @ rotate-n-splat .long 0x1b,0x1b,0x1b,0x1b -.globl aes_v8_set_encrypt_key -.hidden aes_v8_set_encrypt_key -.type aes_v8_set_encrypt_key,%function +.globl aes_hw_set_encrypt_key +.hidden aes_hw_set_encrypt_key +.type aes_hw_set_encrypt_key,%function .align 5 -aes_v8_set_encrypt_key: +aes_hw_set_encrypt_key: .Lenc_key: mov r3,#-1 cmp r0,#0 @@ -181,13 +182,13 @@ aes_v8_set_encrypt_key: mov r0,r3 @ return value bx lr -.size aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key +.size aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key -.globl aes_v8_set_decrypt_key -.hidden aes_v8_set_decrypt_key -.type aes_v8_set_decrypt_key,%function +.globl aes_hw_set_decrypt_key +.hidden aes_hw_set_decrypt_key +.type aes_hw_set_decrypt_key,%function .align 5 -aes_v8_set_decrypt_key: +aes_hw_set_decrypt_key: stmdb sp!,{r4,lr} bl .Lenc_key @@ -220,12 +221,12 @@ aes_v8_set_decrypt_key: eor r0,r0,r0 @ return value .Ldec_key_abort: ldmia sp!,{r4,pc} -.size aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key -.globl aes_v8_encrypt -.hidden aes_v8_encrypt -.type aes_v8_encrypt,%function +.size aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key +.globl aes_hw_encrypt +.hidden aes_hw_encrypt +.type aes_hw_encrypt,%function .align 5 -aes_v8_encrypt: +aes_hw_encrypt: ldr r3,[r2,#240] vld1.32 {q0},[r2]! vld1.8 {q2},[r0] @@ -250,12 +251,12 @@ aes_v8_encrypt: vst1.8 {q2},[r1] bx lr -.size aes_v8_encrypt,.-aes_v8_encrypt -.globl aes_v8_decrypt -.hidden aes_v8_decrypt -.type aes_v8_decrypt,%function +.size aes_hw_encrypt,.-aes_hw_encrypt +.globl aes_hw_decrypt +.hidden aes_hw_decrypt +.type aes_hw_decrypt,%function .align 5 -aes_v8_decrypt: +aes_hw_decrypt: ldr r3,[r2,#240] vld1.32 {q0},[r2]! 
vld1.8 {q2},[r0] @@ -280,12 +281,12 @@ aes_v8_decrypt: vst1.8 {q2},[r1] bx lr -.size aes_v8_decrypt,.-aes_v8_decrypt -.globl aes_v8_cbc_encrypt -.hidden aes_v8_cbc_encrypt -.type aes_v8_cbc_encrypt,%function +.size aes_hw_decrypt,.-aes_hw_decrypt +.globl aes_hw_cbc_encrypt +.hidden aes_hw_cbc_encrypt +.type aes_hw_cbc_encrypt,%function .align 5 -aes_v8_cbc_encrypt: +aes_hw_cbc_encrypt: mov ip,sp stmdb sp!,{r4,r5,r6,r7,r8,lr} vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so @@ -573,12 +574,12 @@ aes_v8_cbc_encrypt: .Lcbc_abort: vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} ldmia sp!,{r4,r5,r6,r7,r8,pc} -.size aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt -.globl aes_v8_ctr32_encrypt_blocks -.hidden aes_v8_ctr32_encrypt_blocks -.type aes_v8_ctr32_encrypt_blocks,%function +.size aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt +.globl aes_hw_ctr32_encrypt_blocks +.hidden aes_hw_ctr32_encrypt_blocks +.type aes_hw_ctr32_encrypt_blocks,%function .align 5 -aes_v8_ctr32_encrypt_blocks: +aes_hw_ctr32_encrypt_blocks: mov ip,sp stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,lr} vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so @@ -757,6 +758,6 @@ aes_v8_ctr32_encrypt_blocks: .Lctr32_done: vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,pc} -.size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks +.size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks #endif #endif diff --git a/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/armv4-mont.S b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/armv4-mont.S new file mode 100644 index 0000000000..e77a9ea613 --- /dev/null +++ b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/armv4-mont.S @@ -0,0 +1,956 @@ +#if defined(__arm__) +#include + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +#if __ARM_MAX_ARCH__>=7 +.align 5 +.LOPENSSL_armcap: +.word OPENSSL_armcap_P-.Lbn_mul_mont +#endif + +.globl bn_mul_mont +.hidden bn_mul_mont +.type bn_mul_mont,%function + +.align 5 +bn_mul_mont: +.Lbn_mul_mont: + ldr ip,[sp,#4] @ load num + stmdb sp!,{r0,r2} @ sp points at argument block +#if __ARM_MAX_ARCH__>=7 + tst ip,#7 + bne .Lialu + adr r0,.Lbn_mul_mont + ldr r2,.LOPENSSL_armcap + ldr r0,[r0,r2] +#ifdef __APPLE__ + ldr r0,[r0] +#endif + tst r0,#ARMV7_NEON @ NEON available? 
+ ldmia sp, {r0,r2} + beq .Lialu + add sp,sp,#8 + b bn_mul8x_mont_neon +.align 4 +.Lialu: +#endif + cmp ip,#2 + mov r0,ip @ load num +#ifdef __thumb2__ + ittt lt +#endif + movlt r0,#0 + addlt sp,sp,#2*4 + blt .Labrt + + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ save 10 registers + + mov r0,r0,lsl#2 @ rescale r0 for byte count + sub sp,sp,r0 @ alloca(4*num) + sub sp,sp,#4 @ +extra dword + sub r0,r0,#4 @ "num=num-1" + add r4,r2,r0 @ &bp[num-1] + + add r0,sp,r0 @ r0 to point at &tp[num-1] + ldr r8,[r0,#14*4] @ &n0 + ldr r2,[r2] @ bp[0] + ldr r5,[r1],#4 @ ap[0],ap++ + ldr r6,[r3],#4 @ np[0],np++ + ldr r8,[r8] @ *n0 + str r4,[r0,#15*4] @ save &bp[num] + + umull r10,r11,r5,r2 @ ap[0]*bp[0] + str r8,[r0,#14*4] @ save n0 value + mul r8,r10,r8 @ "tp[0]"*n0 + mov r12,#0 + umlal r10,r12,r6,r8 @ np[0]*n0+"t[0]" + mov r4,sp + +.L1st: + ldr r5,[r1],#4 @ ap[j],ap++ + mov r10,r11 + ldr r6,[r3],#4 @ np[j],np++ + mov r11,#0 + umlal r10,r11,r5,r2 @ ap[j]*bp[0] + mov r14,#0 + umlal r12,r14,r6,r8 @ np[j]*n0 + adds r12,r12,r10 + str r12,[r4],#4 @ tp[j-1]=,tp++ + adc r12,r14,#0 + cmp r4,r0 + bne .L1st + + adds r12,r12,r11 + ldr r4,[r0,#13*4] @ restore bp + mov r14,#0 + ldr r8,[r0,#14*4] @ restore n0 + adc r14,r14,#0 + str r12,[r0] @ tp[num-1]= + mov r7,sp + str r14,[r0,#4] @ tp[num]= + +.Louter: + sub r7,r0,r7 @ "original" r0-1 value + sub r1,r1,r7 @ "rewind" ap to &ap[1] + ldr r2,[r4,#4]! @ *(++bp) + sub r3,r3,r7 @ "rewind" np to &np[1] + ldr r5,[r1,#-4] @ ap[0] + ldr r10,[sp] @ tp[0] + ldr r6,[r3,#-4] @ np[0] + ldr r7,[sp,#4] @ tp[1] + + mov r11,#0 + umlal r10,r11,r5,r2 @ ap[0]*bp[i]+tp[0] + str r4,[r0,#13*4] @ save bp + mul r8,r10,r8 + mov r12,#0 + umlal r10,r12,r6,r8 @ np[0]*n0+"tp[0]" + mov r4,sp + +.Linner: + ldr r5,[r1],#4 @ ap[j],ap++ + adds r10,r11,r7 @ +=tp[j] + ldr r6,[r3],#4 @ np[j],np++ + mov r11,#0 + umlal r10,r11,r5,r2 @ ap[j]*bp[i] + mov r14,#0 + umlal r12,r14,r6,r8 @ np[j]*n0 + adc r11,r11,#0 + ldr r7,[r4,#8] @ tp[j+1] + adds r12,r12,r10 + str r12,[r4],#4 @ tp[j-1]=,tp++ + adc r12,r14,#0 + cmp r4,r0 + bne .Linner + + adds r12,r12,r11 + mov r14,#0 + ldr r4,[r0,#13*4] @ restore bp + adc r14,r14,#0 + ldr r8,[r0,#14*4] @ restore n0 + adds r12,r12,r7 + ldr r7,[r0,#15*4] @ restore &bp[num] + adc r14,r14,#0 + str r12,[r0] @ tp[num-1]= + str r14,[r0,#4] @ tp[num]= + + cmp r4,r7 +#ifdef __thumb2__ + itt ne +#endif + movne r7,sp + bne .Louter + + ldr r2,[r0,#12*4] @ pull rp + mov r5,sp + add r0,r0,#4 @ r0 to point at &tp[num] + sub r5,r0,r5 @ "original" num value + mov r4,sp @ "rewind" r4 + mov r1,r4 @ "borrow" r1 + sub r3,r3,r5 @ "rewind" r3 to &np[0] + + subs r7,r7,r7 @ "clear" carry flag +.Lsub: ldr r7,[r4],#4 + ldr r6,[r3],#4 + sbcs r7,r7,r6 @ tp[j]-np[j] + str r7,[r2],#4 @ rp[j]= + teq r4,r0 @ preserve carry + bne .Lsub + sbcs r14,r14,#0 @ upmost carry + mov r4,sp @ "rewind" r4 + sub r2,r2,r5 @ "rewind" r2 + + and r1,r4,r14 + bic r3,r2,r14 + orr r1,r1,r3 @ ap=borrow?tp:rp + +.Lcopy: ldr r7,[r1],#4 @ copy or in-place refresh + str sp,[r4],#4 @ zap tp + str r7,[r2],#4 + cmp r4,r0 + bne .Lcopy + + mov sp,r0 + add sp,sp,#4 @ skip over tp[num+1] + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ restore registers + add sp,sp,#2*4 @ skip over {r0,r2} + mov r0,#1 +.Labrt: +#if __ARM_ARCH__>=5 + bx lr @ bx lr +#else + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size bn_mul_mont,.-bn_mul_mont +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.type bn_mul8x_mont_neon,%function +.align 5 +bn_mul8x_mont_neon: + mov ip,sp + stmdb 
sp!,{r4,r5,r6,r7,r8,r9,r10,r11} + vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so + ldmia ip,{r4,r5} @ load rest of parameter block + mov ip,sp + + cmp r5,#8 + bhi .LNEON_8n + + @ special case for r5==8, everything is in register bank... + + vld1.32 {d28[0]}, [r2,:32]! + veor d8,d8,d8 + sub r7,sp,r5,lsl#4 + vld1.32 {d0,d1,d2,d3}, [r1]! @ can't specify :32 :-( + and r7,r7,#-64 + vld1.32 {d30[0]}, [r4,:32] + mov sp,r7 @ alloca + vzip.16 d28,d8 + + vmull.u32 q6,d28,d0[0] + vmull.u32 q7,d28,d0[1] + vmull.u32 q8,d28,d1[0] + vshl.i64 d29,d13,#16 + vmull.u32 q9,d28,d1[1] + + vadd.u64 d29,d29,d12 + veor d8,d8,d8 + vmul.u32 d29,d29,d30 + + vmull.u32 q10,d28,d2[0] + vld1.32 {d4,d5,d6,d7}, [r3]! + vmull.u32 q11,d28,d2[1] + vmull.u32 q12,d28,d3[0] + vzip.16 d29,d8 + vmull.u32 q13,d28,d3[1] + + vmlal.u32 q6,d29,d4[0] + sub r9,r5,#1 + vmlal.u32 q7,d29,d4[1] + vmlal.u32 q8,d29,d5[0] + vmlal.u32 q9,d29,d5[1] + + vmlal.u32 q10,d29,d6[0] + vmov q5,q6 + vmlal.u32 q11,d29,d6[1] + vmov q6,q7 + vmlal.u32 q12,d29,d7[0] + vmov q7,q8 + vmlal.u32 q13,d29,d7[1] + vmov q8,q9 + vmov q9,q10 + vshr.u64 d10,d10,#16 + vmov q10,q11 + vmov q11,q12 + vadd.u64 d10,d10,d11 + vmov q12,q13 + veor q13,q13 + vshr.u64 d10,d10,#16 + + b .LNEON_outer8 + +.align 4 +.LNEON_outer8: + vld1.32 {d28[0]}, [r2,:32]! + veor d8,d8,d8 + vzip.16 d28,d8 + vadd.u64 d12,d12,d10 + + vmlal.u32 q6,d28,d0[0] + vmlal.u32 q7,d28,d0[1] + vmlal.u32 q8,d28,d1[0] + vshl.i64 d29,d13,#16 + vmlal.u32 q9,d28,d1[1] + + vadd.u64 d29,d29,d12 + veor d8,d8,d8 + subs r9,r9,#1 + vmul.u32 d29,d29,d30 + + vmlal.u32 q10,d28,d2[0] + vmlal.u32 q11,d28,d2[1] + vmlal.u32 q12,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q13,d28,d3[1] + + vmlal.u32 q6,d29,d4[0] + vmlal.u32 q7,d29,d4[1] + vmlal.u32 q8,d29,d5[0] + vmlal.u32 q9,d29,d5[1] + + vmlal.u32 q10,d29,d6[0] + vmov q5,q6 + vmlal.u32 q11,d29,d6[1] + vmov q6,q7 + vmlal.u32 q12,d29,d7[0] + vmov q7,q8 + vmlal.u32 q13,d29,d7[1] + vmov q8,q9 + vmov q9,q10 + vshr.u64 d10,d10,#16 + vmov q10,q11 + vmov q11,q12 + vadd.u64 d10,d10,d11 + vmov q12,q13 + veor q13,q13 + vshr.u64 d10,d10,#16 + + bne .LNEON_outer8 + + vadd.u64 d12,d12,d10 + mov r7,sp + vshr.u64 d10,d12,#16 + mov r8,r5 + vadd.u64 d13,d13,d10 + add r6,sp,#96 + vshr.u64 d10,d13,#16 + vzip.16 d12,d13 + + b .LNEON_tail_entry + +.align 4 +.LNEON_8n: + veor q6,q6,q6 + sub r7,sp,#128 + veor q7,q7,q7 + sub r7,r7,r5,lsl#4 + veor q8,q8,q8 + and r7,r7,#-64 + veor q9,q9,q9 + mov sp,r7 @ alloca + veor q10,q10,q10 + add r7,r7,#256 + veor q11,q11,q11 + sub r8,r5,#8 + veor q12,q12,q12 + veor q13,q13,q13 + +.LNEON_8n_init: + vst1.64 {q6,q7},[r7,:256]! + subs r8,r8,#8 + vst1.64 {q8,q9},[r7,:256]! + vst1.64 {q10,q11},[r7,:256]! + vst1.64 {q12,q13},[r7,:256]! + bne .LNEON_8n_init + + add r6,sp,#256 + vld1.32 {d0,d1,d2,d3},[r1]! + add r10,sp,#8 + vld1.32 {d30[0]},[r4,:32] + mov r9,r5 + b .LNEON_8n_outer + +.align 4 +.LNEON_8n_outer: + vld1.32 {d28[0]},[r2,:32]! @ *b++ + veor d8,d8,d8 + vzip.16 d28,d8 + add r7,sp,#128 + vld1.32 {d4,d5,d6,d7},[r3]! + + vmlal.u32 q6,d28,d0[0] + vmlal.u32 q7,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q8,d28,d1[0] + vshl.i64 d29,d13,#16 + vmlal.u32 q9,d28,d1[1] + vadd.u64 d29,d29,d12 + vmlal.u32 q10,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q11,d28,d2[1] + vst1.32 {d28},[sp,:64] @ put aside smashed b[8*i+0] + vmlal.u32 q12,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q13,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! 
@ *b++ + vmlal.u32 q6,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q7,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q8,d29,d5[0] + vshr.u64 d12,d12,#16 + vmlal.u32 q9,d29,d5[1] + vmlal.u32 q10,d29,d6[0] + vadd.u64 d12,d12,d13 + vmlal.u32 q11,d29,d6[1] + vshr.u64 d12,d12,#16 + vmlal.u32 q12,d29,d7[0] + vmlal.u32 q13,d29,d7[1] + vadd.u64 d14,d14,d12 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+0] + vmlal.u32 q7,d28,d0[0] + vld1.64 {q6},[r6,:128]! + vmlal.u32 q8,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q9,d28,d1[0] + vshl.i64 d29,d15,#16 + vmlal.u32 q10,d28,d1[1] + vadd.u64 d29,d29,d14 + vmlal.u32 q11,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q12,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+1] + vmlal.u32 q13,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q6,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q7,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q8,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q9,d29,d5[0] + vshr.u64 d14,d14,#16 + vmlal.u32 q10,d29,d5[1] + vmlal.u32 q11,d29,d6[0] + vadd.u64 d14,d14,d15 + vmlal.u32 q12,d29,d6[1] + vshr.u64 d14,d14,#16 + vmlal.u32 q13,d29,d7[0] + vmlal.u32 q6,d29,d7[1] + vadd.u64 d16,d16,d14 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+1] + vmlal.u32 q8,d28,d0[0] + vld1.64 {q7},[r6,:128]! + vmlal.u32 q9,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q10,d28,d1[0] + vshl.i64 d29,d17,#16 + vmlal.u32 q11,d28,d1[1] + vadd.u64 d29,d29,d16 + vmlal.u32 q12,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q13,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+2] + vmlal.u32 q6,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q7,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q8,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q9,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q10,d29,d5[0] + vshr.u64 d16,d16,#16 + vmlal.u32 q11,d29,d5[1] + vmlal.u32 q12,d29,d6[0] + vadd.u64 d16,d16,d17 + vmlal.u32 q13,d29,d6[1] + vshr.u64 d16,d16,#16 + vmlal.u32 q6,d29,d7[0] + vmlal.u32 q7,d29,d7[1] + vadd.u64 d18,d18,d16 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+2] + vmlal.u32 q9,d28,d0[0] + vld1.64 {q8},[r6,:128]! + vmlal.u32 q10,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q11,d28,d1[0] + vshl.i64 d29,d19,#16 + vmlal.u32 q12,d28,d1[1] + vadd.u64 d29,d29,d18 + vmlal.u32 q13,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q6,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+3] + vmlal.u32 q7,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q8,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q9,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q10,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q11,d29,d5[0] + vshr.u64 d18,d18,#16 + vmlal.u32 q12,d29,d5[1] + vmlal.u32 q13,d29,d6[0] + vadd.u64 d18,d18,d19 + vmlal.u32 q6,d29,d6[1] + vshr.u64 d18,d18,#16 + vmlal.u32 q7,d29,d7[0] + vmlal.u32 q8,d29,d7[1] + vadd.u64 d20,d20,d18 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+3] + vmlal.u32 q10,d28,d0[0] + vld1.64 {q9},[r6,:128]! + vmlal.u32 q11,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q12,d28,d1[0] + vshl.i64 d29,d21,#16 + vmlal.u32 q13,d28,d1[1] + vadd.u64 d29,d29,d20 + vmlal.u32 q6,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q7,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+4] + vmlal.u32 q8,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q9,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! 
@ *b++ + vmlal.u32 q10,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q11,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q12,d29,d5[0] + vshr.u64 d20,d20,#16 + vmlal.u32 q13,d29,d5[1] + vmlal.u32 q6,d29,d6[0] + vadd.u64 d20,d20,d21 + vmlal.u32 q7,d29,d6[1] + vshr.u64 d20,d20,#16 + vmlal.u32 q8,d29,d7[0] + vmlal.u32 q9,d29,d7[1] + vadd.u64 d22,d22,d20 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+4] + vmlal.u32 q11,d28,d0[0] + vld1.64 {q10},[r6,:128]! + vmlal.u32 q12,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q13,d28,d1[0] + vshl.i64 d29,d23,#16 + vmlal.u32 q6,d28,d1[1] + vadd.u64 d29,d29,d22 + vmlal.u32 q7,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q8,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+5] + vmlal.u32 q9,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q10,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q11,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q12,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q13,d29,d5[0] + vshr.u64 d22,d22,#16 + vmlal.u32 q6,d29,d5[1] + vmlal.u32 q7,d29,d6[0] + vadd.u64 d22,d22,d23 + vmlal.u32 q8,d29,d6[1] + vshr.u64 d22,d22,#16 + vmlal.u32 q9,d29,d7[0] + vmlal.u32 q10,d29,d7[1] + vadd.u64 d24,d24,d22 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+5] + vmlal.u32 q12,d28,d0[0] + vld1.64 {q11},[r6,:128]! + vmlal.u32 q13,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q6,d28,d1[0] + vshl.i64 d29,d25,#16 + vmlal.u32 q7,d28,d1[1] + vadd.u64 d29,d29,d24 + vmlal.u32 q8,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q9,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+6] + vmlal.u32 q10,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q11,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q12,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q13,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q6,d29,d5[0] + vshr.u64 d24,d24,#16 + vmlal.u32 q7,d29,d5[1] + vmlal.u32 q8,d29,d6[0] + vadd.u64 d24,d24,d25 + vmlal.u32 q9,d29,d6[1] + vshr.u64 d24,d24,#16 + vmlal.u32 q10,d29,d7[0] + vmlal.u32 q11,d29,d7[1] + vadd.u64 d26,d26,d24 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+6] + vmlal.u32 q13,d28,d0[0] + vld1.64 {q12},[r6,:128]! + vmlal.u32 q6,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q7,d28,d1[0] + vshl.i64 d29,d27,#16 + vmlal.u32 q8,d28,d1[1] + vadd.u64 d29,d29,d26 + vmlal.u32 q9,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q10,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+7] + vmlal.u32 q11,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q12,d28,d3[1] + vld1.32 {d28},[sp,:64] @ pull smashed b[8*i+0] + vmlal.u32 q13,d29,d4[0] + vld1.32 {d0,d1,d2,d3},[r1]! + vmlal.u32 q6,d29,d4[1] + vmlal.u32 q7,d29,d5[0] + vshr.u64 d26,d26,#16 + vmlal.u32 q8,d29,d5[1] + vmlal.u32 q9,d29,d6[0] + vadd.u64 d26,d26,d27 + vmlal.u32 q10,d29,d6[1] + vshr.u64 d26,d26,#16 + vmlal.u32 q11,d29,d7[0] + vmlal.u32 q12,d29,d7[1] + vadd.u64 d12,d12,d26 + vst1.32 {d29},[r10,:64] @ put aside smashed m[8*i+7] + add r10,sp,#8 @ rewind + sub r8,r5,#8 + b .LNEON_8n_inner + +.align 4 +.LNEON_8n_inner: + subs r8,r8,#8 + vmlal.u32 q6,d28,d0[0] + vld1.64 {q13},[r6,:128] + vmlal.u32 q7,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+0] + vmlal.u32 q8,d28,d1[0] + vld1.32 {d4,d5,d6,d7},[r3]! + vmlal.u32 q9,d28,d1[1] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q10,d28,d2[0] + vmlal.u32 q11,d28,d2[1] + vmlal.u32 q12,d28,d3[0] + vmlal.u32 q13,d28,d3[1] + vld1.32 {d28},[r10,:64]! 
@ pull smashed b[8*i+1] + vmlal.u32 q6,d29,d4[0] + vmlal.u32 q7,d29,d4[1] + vmlal.u32 q8,d29,d5[0] + vmlal.u32 q9,d29,d5[1] + vmlal.u32 q10,d29,d6[0] + vmlal.u32 q11,d29,d6[1] + vmlal.u32 q12,d29,d7[0] + vmlal.u32 q13,d29,d7[1] + vst1.64 {q6},[r7,:128]! + vmlal.u32 q7,d28,d0[0] + vld1.64 {q6},[r6,:128] + vmlal.u32 q8,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+1] + vmlal.u32 q9,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q10,d28,d1[1] + vmlal.u32 q11,d28,d2[0] + vmlal.u32 q12,d28,d2[1] + vmlal.u32 q13,d28,d3[0] + vmlal.u32 q6,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+2] + vmlal.u32 q7,d29,d4[0] + vmlal.u32 q8,d29,d4[1] + vmlal.u32 q9,d29,d5[0] + vmlal.u32 q10,d29,d5[1] + vmlal.u32 q11,d29,d6[0] + vmlal.u32 q12,d29,d6[1] + vmlal.u32 q13,d29,d7[0] + vmlal.u32 q6,d29,d7[1] + vst1.64 {q7},[r7,:128]! + vmlal.u32 q8,d28,d0[0] + vld1.64 {q7},[r6,:128] + vmlal.u32 q9,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+2] + vmlal.u32 q10,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q11,d28,d1[1] + vmlal.u32 q12,d28,d2[0] + vmlal.u32 q13,d28,d2[1] + vmlal.u32 q6,d28,d3[0] + vmlal.u32 q7,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+3] + vmlal.u32 q8,d29,d4[0] + vmlal.u32 q9,d29,d4[1] + vmlal.u32 q10,d29,d5[0] + vmlal.u32 q11,d29,d5[1] + vmlal.u32 q12,d29,d6[0] + vmlal.u32 q13,d29,d6[1] + vmlal.u32 q6,d29,d7[0] + vmlal.u32 q7,d29,d7[1] + vst1.64 {q8},[r7,:128]! + vmlal.u32 q9,d28,d0[0] + vld1.64 {q8},[r6,:128] + vmlal.u32 q10,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+3] + vmlal.u32 q11,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q12,d28,d1[1] + vmlal.u32 q13,d28,d2[0] + vmlal.u32 q6,d28,d2[1] + vmlal.u32 q7,d28,d3[0] + vmlal.u32 q8,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+4] + vmlal.u32 q9,d29,d4[0] + vmlal.u32 q10,d29,d4[1] + vmlal.u32 q11,d29,d5[0] + vmlal.u32 q12,d29,d5[1] + vmlal.u32 q13,d29,d6[0] + vmlal.u32 q6,d29,d6[1] + vmlal.u32 q7,d29,d7[0] + vmlal.u32 q8,d29,d7[1] + vst1.64 {q9},[r7,:128]! + vmlal.u32 q10,d28,d0[0] + vld1.64 {q9},[r6,:128] + vmlal.u32 q11,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+4] + vmlal.u32 q12,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q13,d28,d1[1] + vmlal.u32 q6,d28,d2[0] + vmlal.u32 q7,d28,d2[1] + vmlal.u32 q8,d28,d3[0] + vmlal.u32 q9,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+5] + vmlal.u32 q10,d29,d4[0] + vmlal.u32 q11,d29,d4[1] + vmlal.u32 q12,d29,d5[0] + vmlal.u32 q13,d29,d5[1] + vmlal.u32 q6,d29,d6[0] + vmlal.u32 q7,d29,d6[1] + vmlal.u32 q8,d29,d7[0] + vmlal.u32 q9,d29,d7[1] + vst1.64 {q10},[r7,:128]! + vmlal.u32 q11,d28,d0[0] + vld1.64 {q10},[r6,:128] + vmlal.u32 q12,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+5] + vmlal.u32 q13,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q6,d28,d1[1] + vmlal.u32 q7,d28,d2[0] + vmlal.u32 q8,d28,d2[1] + vmlal.u32 q9,d28,d3[0] + vmlal.u32 q10,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+6] + vmlal.u32 q11,d29,d4[0] + vmlal.u32 q12,d29,d4[1] + vmlal.u32 q13,d29,d5[0] + vmlal.u32 q6,d29,d5[1] + vmlal.u32 q7,d29,d6[0] + vmlal.u32 q8,d29,d6[1] + vmlal.u32 q9,d29,d7[0] + vmlal.u32 q10,d29,d7[1] + vst1.64 {q11},[r7,:128]! + vmlal.u32 q12,d28,d0[0] + vld1.64 {q11},[r6,:128] + vmlal.u32 q13,d28,d0[1] + vld1.32 {d29},[r10,:64]! 
@ pull smashed m[8*i+6] + vmlal.u32 q6,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q7,d28,d1[1] + vmlal.u32 q8,d28,d2[0] + vmlal.u32 q9,d28,d2[1] + vmlal.u32 q10,d28,d3[0] + vmlal.u32 q11,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+7] + vmlal.u32 q12,d29,d4[0] + vmlal.u32 q13,d29,d4[1] + vmlal.u32 q6,d29,d5[0] + vmlal.u32 q7,d29,d5[1] + vmlal.u32 q8,d29,d6[0] + vmlal.u32 q9,d29,d6[1] + vmlal.u32 q10,d29,d7[0] + vmlal.u32 q11,d29,d7[1] + vst1.64 {q12},[r7,:128]! + vmlal.u32 q13,d28,d0[0] + vld1.64 {q12},[r6,:128] + vmlal.u32 q6,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+7] + vmlal.u32 q7,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q8,d28,d1[1] + vmlal.u32 q9,d28,d2[0] + vmlal.u32 q10,d28,d2[1] + vmlal.u32 q11,d28,d3[0] + vmlal.u32 q12,d28,d3[1] + it eq + subeq r1,r1,r5,lsl#2 @ rewind + vmlal.u32 q13,d29,d4[0] + vld1.32 {d28},[sp,:64] @ pull smashed b[8*i+0] + vmlal.u32 q6,d29,d4[1] + vld1.32 {d0,d1,d2,d3},[r1]! + vmlal.u32 q7,d29,d5[0] + add r10,sp,#8 @ rewind + vmlal.u32 q8,d29,d5[1] + vmlal.u32 q9,d29,d6[0] + vmlal.u32 q10,d29,d6[1] + vmlal.u32 q11,d29,d7[0] + vst1.64 {q13},[r7,:128]! + vmlal.u32 q12,d29,d7[1] + + bne .LNEON_8n_inner + add r6,sp,#128 + vst1.64 {q6,q7},[r7,:256]! + veor q2,q2,q2 @ d4-d5 + vst1.64 {q8,q9},[r7,:256]! + veor q3,q3,q3 @ d6-d7 + vst1.64 {q10,q11},[r7,:256]! + vst1.64 {q12},[r7,:128] + + subs r9,r9,#8 + vld1.64 {q6,q7},[r6,:256]! + vld1.64 {q8,q9},[r6,:256]! + vld1.64 {q10,q11},[r6,:256]! + vld1.64 {q12,q13},[r6,:256]! + + itt ne + subne r3,r3,r5,lsl#2 @ rewind + bne .LNEON_8n_outer + + add r7,sp,#128 + vst1.64 {q2,q3}, [sp,:256]! @ start wiping stack frame + vshr.u64 d10,d12,#16 + vst1.64 {q2,q3},[sp,:256]! + vadd.u64 d13,d13,d10 + vst1.64 {q2,q3}, [sp,:256]! + vshr.u64 d10,d13,#16 + vst1.64 {q2,q3}, [sp,:256]! + vzip.16 d12,d13 + + mov r8,r5 + b .LNEON_tail_entry + +.align 4 +.LNEON_tail: + vadd.u64 d12,d12,d10 + vshr.u64 d10,d12,#16 + vld1.64 {q8,q9}, [r6, :256]! + vadd.u64 d13,d13,d10 + vld1.64 {q10,q11}, [r6, :256]! + vshr.u64 d10,d13,#16 + vld1.64 {q12,q13}, [r6, :256]! + vzip.16 d12,d13 + +.LNEON_tail_entry: + vadd.u64 d14,d14,d10 + vst1.32 {d12[0]}, [r7, :32]! + vshr.u64 d10,d14,#16 + vadd.u64 d15,d15,d10 + vshr.u64 d10,d15,#16 + vzip.16 d14,d15 + vadd.u64 d16,d16,d10 + vst1.32 {d14[0]}, [r7, :32]! + vshr.u64 d10,d16,#16 + vadd.u64 d17,d17,d10 + vshr.u64 d10,d17,#16 + vzip.16 d16,d17 + vadd.u64 d18,d18,d10 + vst1.32 {d16[0]}, [r7, :32]! + vshr.u64 d10,d18,#16 + vadd.u64 d19,d19,d10 + vshr.u64 d10,d19,#16 + vzip.16 d18,d19 + vadd.u64 d20,d20,d10 + vst1.32 {d18[0]}, [r7, :32]! + vshr.u64 d10,d20,#16 + vadd.u64 d21,d21,d10 + vshr.u64 d10,d21,#16 + vzip.16 d20,d21 + vadd.u64 d22,d22,d10 + vst1.32 {d20[0]}, [r7, :32]! + vshr.u64 d10,d22,#16 + vadd.u64 d23,d23,d10 + vshr.u64 d10,d23,#16 + vzip.16 d22,d23 + vadd.u64 d24,d24,d10 + vst1.32 {d22[0]}, [r7, :32]! + vshr.u64 d10,d24,#16 + vadd.u64 d25,d25,d10 + vshr.u64 d10,d25,#16 + vzip.16 d24,d25 + vadd.u64 d26,d26,d10 + vst1.32 {d24[0]}, [r7, :32]! + vshr.u64 d10,d26,#16 + vadd.u64 d27,d27,d10 + vshr.u64 d10,d27,#16 + vzip.16 d26,d27 + vld1.64 {q6,q7}, [r6, :256]! + subs r8,r8,#8 + vst1.32 {d26[0]}, [r7, :32]! 
+ bne .LNEON_tail + + vst1.32 {d10[0]}, [r7, :32] @ top-most bit + sub r3,r3,r5,lsl#2 @ rewind r3 + subs r1,sp,#0 @ clear carry flag + add r2,sp,r5,lsl#2 + +.LNEON_sub: + ldmia r1!, {r4,r5,r6,r7} + ldmia r3!, {r8,r9,r10,r11} + sbcs r8, r4,r8 + sbcs r9, r5,r9 + sbcs r10,r6,r10 + sbcs r11,r7,r11 + teq r1,r2 @ preserves carry + stmia r0!, {r8,r9,r10,r11} + bne .LNEON_sub + + ldr r10, [r1] @ load top-most bit + mov r11,sp + veor q0,q0,q0 + sub r11,r2,r11 @ this is num*4 + veor q1,q1,q1 + mov r1,sp + sub r0,r0,r11 @ rewind r0 + mov r3,r2 @ second 3/4th of frame + sbcs r10,r10,#0 @ result is carry flag + +.LNEON_copy_n_zap: + ldmia r1!, {r4,r5,r6,r7} + ldmia r0, {r8,r9,r10,r11} + it cc + movcc r8, r4 + vst1.64 {q0,q1}, [r3,:256]! @ wipe + itt cc + movcc r9, r5 + movcc r10,r6 + vst1.64 {q0,q1}, [r3,:256]! @ wipe + it cc + movcc r11,r7 + ldmia r1, {r4,r5,r6,r7} + stmia r0!, {r8,r9,r10,r11} + sub r1,r1,#16 + ldmia r0, {r8,r9,r10,r11} + it cc + movcc r8, r4 + vst1.64 {q0,q1}, [r1,:256]! @ wipe + itt cc + movcc r9, r5 + movcc r10,r6 + vst1.64 {q0,q1}, [r3,:256]! @ wipe + it cc + movcc r11,r7 + teq r1,r2 @ preserves carry + stmia r0!, {r8,r9,r10,r11} + bne .LNEON_copy_n_zap + + mov sp,ip + vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11} + bx lr @ bx lr +.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon +#endif +.byte 77,111,110,116,103,111,109,101,114,121,32,109,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#if __ARM_MAX_ARCH__>=7 +.comm OPENSSL_armcap_P,4,4 +.hidden OPENSSL_armcap_P +#endif +#endif diff --git a/packager/third_party/boringssl/linux-arm/crypto/aes/bsaes-armv7.S b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/bsaes-armv7.S similarity index 98% rename from packager/third_party/boringssl/linux-arm/crypto/aes/bsaes-armv7.S rename to packager/third_party/boringssl/linux-arm/crypto/fipsmodule/bsaes-armv7.S index abb414d549..f9c6de73ff 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/aes/bsaes-armv7.S +++ b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/bsaes-armv7.S @@ -1,4 +1,11 @@ #if defined(__arm__) +@ Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. +@ +@ Licensed under the OpenSSL license (the "License"). You may not use +@ this file except in compliance with the License. You can obtain a copy +@ in the file LICENSE in the source distribution or at +@ https://www.openssl.org/source/license.html + @ ==================================================================== @ Written by Andy Polyakov for the OpenSSL @@ -77,12 +84,13 @@ .thumb #else .code 32 +# undef __thumb2__ #endif .type _bsaes_decrypt8,%function .align 4 _bsaes_decrypt8: - adr r6,_bsaes_decrypt8 + adr r6,. vldmia r4!, {q9} @ round 0 key #ifdef __APPLE__ adr r6,.LM0ISR @@ -573,7 +581,7 @@ _bsaes_const: .type _bsaes_encrypt8,%function .align 4 _bsaes_encrypt8: - adr r6,_bsaes_encrypt8 + adr r6,. vldmia r4!, {q9} @ round 0 key #ifdef __APPLE__ adr r6,.LM0SR @@ -1008,7 +1016,7 @@ _bsaes_encrypt8_bitslice: .type _bsaes_key_convert,%function .align 4 _bsaes_key_convert: - adr r6,_bsaes_key_convert + adr r6,. vld1.8 {q7}, [r4]! 
@ load round 0 key #ifdef __APPLE__ adr r6,.LM0 @@ -1313,7 +1321,7 @@ bsaes_cbc_encrypt: vmov q4,q15 @ just in case ensure that IV vmov q5,q0 @ and input are preserved bl AES_decrypt - vld1.8 {q0}, [r9,:64] @ load result + vld1.8 {q0}, [r9] @ load result veor q0, q0, q4 @ ^= IV vmov q15, q5 @ q5 holds input vst1.8 {q0}, [r10] @ write output @@ -1843,8 +1851,6 @@ bsaes_xts_encrypt: b .Lxts_enc_done .align 4 .Lxts_enc_6: - vst1.64 {q14}, [r0,:128] @ next round tweak - veor q4, q4, q12 #ifndef BSAES_ASM_EXTENDED_KEY add r4, sp, #0x90 @ pass key schedule @@ -1880,8 +1886,6 @@ bsaes_xts_encrypt: .align 5 .Lxts_enc_5: - vst1.64 {q13}, [r0,:128] @ next round tweak - veor q3, q3, q11 #ifndef BSAES_ASM_EXTENDED_KEY add r4, sp, #0x90 @ pass key schedule @@ -1910,8 +1914,6 @@ bsaes_xts_encrypt: b .Lxts_enc_done .align 4 .Lxts_enc_4: - vst1.64 {q12}, [r0,:128] @ next round tweak - veor q2, q2, q10 #ifndef BSAES_ASM_EXTENDED_KEY add r4, sp, #0x90 @ pass key schedule @@ -1937,8 +1939,6 @@ bsaes_xts_encrypt: b .Lxts_enc_done .align 4 .Lxts_enc_3: - vst1.64 {q11}, [r0,:128] @ next round tweak - veor q1, q1, q9 #ifndef BSAES_ASM_EXTENDED_KEY add r4, sp, #0x90 @ pass key schedule @@ -1963,8 +1963,6 @@ bsaes_xts_encrypt: b .Lxts_enc_done .align 4 .Lxts_enc_2: - vst1.64 {q10}, [r0,:128] @ next round tweak - veor q0, q0, q8 #ifndef BSAES_ASM_EXTENDED_KEY add r4, sp, #0x90 @ pass key schedule @@ -1987,7 +1985,7 @@ bsaes_xts_encrypt: .align 4 .Lxts_enc_1: mov r0, sp - veor q0, q8 + veor q0, q0, q8 mov r1, sp vst1.8 {q0}, [sp,:128] mov r2, r10 @@ -2376,8 +2374,6 @@ bsaes_xts_decrypt: b .Lxts_dec_done .align 4 .Lxts_dec_5: - vst1.64 {q13}, [r0,:128] @ next round tweak - veor q3, q3, q11 #ifndef BSAES_ASM_EXTENDED_KEY add r4, sp, #0x90 @ pass key schedule @@ -2406,8 +2402,6 @@ bsaes_xts_decrypt: b .Lxts_dec_done .align 4 .Lxts_dec_4: - vst1.64 {q12}, [r0,:128] @ next round tweak - veor q2, q2, q10 #ifndef BSAES_ASM_EXTENDED_KEY add r4, sp, #0x90 @ pass key schedule @@ -2433,8 +2427,6 @@ bsaes_xts_decrypt: b .Lxts_dec_done .align 4 .Lxts_dec_3: - vst1.64 {q11}, [r0,:128] @ next round tweak - veor q1, q1, q9 #ifndef BSAES_ASM_EXTENDED_KEY add r4, sp, #0x90 @ pass key schedule @@ -2459,8 +2451,6 @@ bsaes_xts_decrypt: b .Lxts_dec_done .align 4 .Lxts_dec_2: - vst1.64 {q10}, [r0,:128] @ next round tweak - veor q0, q0, q8 #ifndef BSAES_ASM_EXTENDED_KEY add r4, sp, #0x90 @ pass key schedule @@ -2483,12 +2473,12 @@ bsaes_xts_decrypt: .align 4 .Lxts_dec_1: mov r0, sp - veor q0, q8 + veor q0, q0, q8 mov r1, sp vst1.8 {q0}, [sp,:128] + mov r5, r2 @ preserve magic mov r2, r10 mov r4, r3 @ preserve fp - mov r5, r2 @ preserve magic bl AES_decrypt diff --git a/packager/third_party/boringssl/linux-arm/crypto/modes/ghash-armv4.S b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/ghash-armv4.S similarity index 95% rename from packager/third_party/boringssl/linux-arm/crypto/modes/ghash-armv4.S rename to packager/third_party/boringssl/linux-arm/crypto/fipsmodule/ghash-armv4.S index 791b28906c..5f8b50d551 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/modes/ghash-armv4.S +++ b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/ghash-armv4.S @@ -1,10 +1,15 @@ #if defined(__arm__) #include -.syntax unified - .text +#if defined(__thumb2__) || defined(__clang__) +.syntax unified +#endif +#if defined(__thumb2__) +.thumb +#else .code 32 +#endif #ifdef __clang__ #define ldrplb ldrbpl @@ -22,20 +27,28 @@ rem_4bit: .type rem_4bit_get,%function rem_4bit_get: - sub r2,pc,#8 - sub r2,r2,#32 @ &rem_4bit +#if 
defined(__thumb2__) + adr r2,rem_4bit +#else + sub r2,pc,#8+32 @ &rem_4bit +#endif b .Lrem_4bit_got nop + nop .size rem_4bit_get,.-rem_4bit_get .globl gcm_ghash_4bit .hidden gcm_ghash_4bit .type gcm_ghash_4bit,%function +.align 4 gcm_ghash_4bit: - sub r12,pc,#8 +#if defined(__thumb2__) + adr r12,rem_4bit +#else + sub r12,pc,#8+48 @ &rem_4bit +#endif add r3,r2,r3 @ r3 to point at the end stmdb sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr} @ save r3/end too - sub r12,r12,#48 @ &rem_4bit ldmia r12,{r4,r5,r6,r7,r8,r9,r10,r11} @ copy rem_4bit ... stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} @ ... to stack @@ -82,7 +95,10 @@ gcm_ghash_4bit: eor r5,r5,r6,lsl#28 ldrh r8,[sp,r12] @ rem_4bit[rem] eor r6,r10,r6,lsr#4 - ldrbpl r12,[r2,r3] +#ifdef __thumb2__ + it pl +#endif + ldrplb r12,[r2,r3] eor r6,r6,r7,lsl#28 eor r7,r11,r7,lsr#4 @@ -92,15 +108,24 @@ gcm_ghash_4bit: add r14,r14,r14 ldmia r11,{r8,r9,r10,r11} @ load Htbl[nhi] eor r4,r8,r4,lsr#4 - ldrbpl r8,[r0,r3] +#ifdef __thumb2__ + it pl +#endif + ldrplb r8,[r0,r3] eor r4,r4,r5,lsl#28 eor r5,r9,r5,lsr#4 ldrh r9,[sp,r14] eor r5,r5,r6,lsl#28 eor r6,r10,r6,lsr#4 eor r6,r6,r7,lsl#28 +#ifdef __thumb2__ + it pl +#endif eorpl r12,r12,r8 eor r7,r11,r7,lsr#4 +#ifdef __thumb2__ + itt pl +#endif andpl r14,r12,#0xf0 andpl r12,r12,#0x0f eor r7,r7,r9,lsl#16 @ ^= rem_4bit[rem] @@ -138,7 +163,11 @@ gcm_ghash_4bit: strb r10,[r0,#8+1] strb r11,[r0,#8] #endif - ldrbne r12,[r2,#15] + +#ifdef __thumb2__ + it ne +#endif + ldrneb r12,[r2,#15] #if __ARM_ARCH__>=7 && defined(__ARMEL__) rev r6,r6 str r6,[r0,#4] @@ -226,7 +255,10 @@ gcm_gmult_4bit: eor r5,r5,r6,lsl#28 ldrh r8,[r2,r12] @ rem_4bit[rem] eor r6,r10,r6,lsr#4 - ldrbpl r12,[r0,r3] +#ifdef __thumb2__ + it pl +#endif + ldrplb r12,[r0,r3] eor r6,r6,r7,lsl#28 eor r7,r11,r7,lsr#4 @@ -243,6 +275,9 @@ gcm_gmult_4bit: eor r6,r10,r6,lsr#4 eor r6,r6,r7,lsl#28 eor r7,r11,r7,lsr#4 +#ifdef __thumb2__ + itt pl +#endif andpl r14,r12,#0xf0 andpl r12,r12,#0x0f eor r7,r7,r8,lsl#16 @ ^= rem_4bit[rem] diff --git a/packager/third_party/boringssl/linux-arm/crypto/modes/ghashv8-armx32.S b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/ghashv8-armx32.S similarity index 99% rename from packager/third_party/boringssl/linux-arm/crypto/modes/ghashv8-armx32.S rename to packager/third_party/boringssl/linux-arm/crypto/fipsmodule/ghashv8-armx32.S index 0e1e631486..e83a9c7313 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/modes/ghashv8-armx32.S +++ b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/ghashv8-armx32.S @@ -4,6 +4,7 @@ .text .fpu neon .code 32 +#undef __thumb2__ .globl gcm_init_v8 .hidden gcm_init_v8 .type gcm_init_v8,%function diff --git a/packager/third_party/boringssl/linux-arm/crypto/sha/sha1-armv4-large.S b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/sha1-armv4-large.S similarity index 88% rename from packager/third_party/boringssl/linux-arm/crypto/sha/sha1-armv4-large.S rename to packager/third_party/boringssl/linux-arm/crypto/fipsmodule/sha1-armv4-large.S index 36955faf19..a5d88f71e2 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/sha/sha1-armv4-large.S +++ b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/sha1-armv4-large.S @@ -2,7 +2,12 @@ #include .text +#if defined(__thumb2__) +.syntax unified +.thumb +#else .code 32 +#endif .globl sha1_block_data_order .hidden sha1_block_data_order @@ -11,7 +16,8 @@ .align 5 sha1_block_data_order: #if __ARM_MAX_ARCH__>=7 - sub r3,pc,#8 @ sha1_block_data_order +.Lsha1_block: + adr r3,.Lsha1_block ldr r12,.LOPENSSL_armcap ldr 
r12,[r3,r12] @ OPENSSL_armcap_P #ifdef __APPLE__ @@ -158,7 +164,12 @@ sha1_block_data_order: eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) str r9,[r14,#-4]! add r3,r3,r10 @ E+=F_00_19(B,C,D) +#if defined(__thumb2__) + mov r12,sp + teq r14,r12 +#else teq r14,sp +#endif bne .L_00_15 @ [((11+4)*5+2)*3] sub sp,sp,#25*4 #if __ARM_ARCH__<7 @@ -338,7 +349,12 @@ sha1_block_data_order: @ F_xx_xx add r3,r3,r9 @ E+=X[i] add r3,r3,r10 @ E+=F_20_39(B,C,D) +#if defined(__thumb2__) + mov r12,sp + teq r14,r12 +#else teq r14,sp @ preserve carry +#endif bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4] bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes @@ -430,7 +446,12 @@ sha1_block_data_order: add r3,r3,r9 @ E+=X[i] add r3,r3,r10 @ E+=F_40_59(B,C,D) add r3,r3,r11,ror#2 +#if defined(__thumb2__) + mov r12,sp + teq r14,r12 +#else teq r14,sp +#endif bne .L_40_59 @ [+((12+5)*5+2)*4] ldr r8,.LK_60_79 @@ -466,7 +487,7 @@ sha1_block_data_order: .LK_60_79:.word 0xca62c1d6 #if __ARM_MAX_ARCH__>=7 .LOPENSSL_armcap: -.word OPENSSL_armcap_P-sha1_block_data_order +.word OPENSSL_armcap_P-.Lsha1_block #endif .byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 2 @@ -484,12 +505,12 @@ sha1_block_data_order_neon: @ dmb @ errata #451034 on early Cortex A8 @ vstmdb sp!,{d8-d15} @ ABI specification says so mov r14,sp - sub sp,sp,#64 @ alloca + sub r12,sp,#64 adr r8,.LK_00_19 - bic sp,sp,#15 @ align for 128-bit stores + bic r12,r12,#15 @ align for 128-bit stores ldmia r0,{r3,r4,r5,r6,r7} @ load context - mov r12,sp + mov sp,r12 @ alloca vld1.8 {q0,q1},[r1]! @ handles unaligned veor q15,q15,q15 @@ -1182,6 +1203,7 @@ sha1_block_data_order_neon: sub r12,r12,#64 teq r1,r2 sub r8,r8,#16 + it eq subeq r1,r1,#64 vld1.8 {q0,q1},[r1]! 
ldr r9,[sp,#4] @@ -1311,10 +1333,13 @@ sha1_block_data_order_neon: add r4,r4,r10 add r5,r5,r11 add r6,r6,r12 + it eq moveq sp,r14 add r7,r7,r9 + it ne ldrne r9,[sp] stmia r0,{r3,r4,r5,r6,r7} + itt ne addne r12,sp,#3*16 bne .Loop_neon @@ -1323,6 +1348,13 @@ sha1_block_data_order_neon: .size sha1_block_data_order_neon,.-sha1_block_data_order_neon #endif #if __ARM_MAX_ARCH__>=7 + +# if defined(__thumb2__) +# define INST(a,b,c,d) .byte c,d|0xf,a,b +# else +# define INST(a,b,c,d) .byte a,b,c,d|0x10 +# endif + .type sha1_block_data_order_armv8,%function .align 5 sha1_block_data_order_armv8: @@ -1352,98 +1384,98 @@ sha1_block_data_order_armv8: vadd.i32 q13,q8,q5 vrev32.8 q7,q7 -.byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 0 -.byte 0x68,0x0c,0x02,0xf2 @ sha1c q0,q1,q12 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 0 + INST(0x68,0x0c,0x02,0xe2) @ sha1c q0,q1,q12 vadd.i32 q12,q8,q6 -.byte 0x4c,0x8c,0x3a,0xf2 @ sha1su0 q4,q5,q6 -.byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 1 -.byte 0x6a,0x0c,0x06,0xf2 @ sha1c q0,q3,q13 + INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 1 + INST(0x6a,0x0c,0x06,0xe2) @ sha1c q0,q3,q13 vadd.i32 q13,q8,q7 -.byte 0x8e,0x83,0xba,0xf3 @ sha1su1 q4,q7 -.byte 0x4e,0xac,0x3c,0xf2 @ sha1su0 q5,q6,q7 -.byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 2 -.byte 0x68,0x0c,0x04,0xf2 @ sha1c q0,q2,q12 + INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 + INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 2 + INST(0x68,0x0c,0x04,0xe2) @ sha1c q0,q2,q12 vadd.i32 q12,q8,q4 -.byte 0x88,0xa3,0xba,0xf3 @ sha1su1 q5,q4 -.byte 0x48,0xcc,0x3e,0xf2 @ sha1su0 q6,q7,q4 -.byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 3 -.byte 0x6a,0x0c,0x06,0xf2 @ sha1c q0,q3,q13 + INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 + INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 3 + INST(0x6a,0x0c,0x06,0xe2) @ sha1c q0,q3,q13 vadd.i32 q13,q9,q5 -.byte 0x8a,0xc3,0xba,0xf3 @ sha1su1 q6,q5 -.byte 0x4a,0xec,0x38,0xf2 @ sha1su0 q7,q4,q5 -.byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 4 -.byte 0x68,0x0c,0x04,0xf2 @ sha1c q0,q2,q12 + INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 + INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 4 + INST(0x68,0x0c,0x04,0xe2) @ sha1c q0,q2,q12 vadd.i32 q12,q9,q6 -.byte 0x8c,0xe3,0xba,0xf3 @ sha1su1 q7,q6 -.byte 0x4c,0x8c,0x3a,0xf2 @ sha1su0 q4,q5,q6 -.byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 5 -.byte 0x6a,0x0c,0x16,0xf2 @ sha1p q0,q3,q13 + INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 + INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 5 + INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 vadd.i32 q13,q9,q7 -.byte 0x8e,0x83,0xba,0xf3 @ sha1su1 q4,q7 -.byte 0x4e,0xac,0x3c,0xf2 @ sha1su0 q5,q6,q7 -.byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 6 -.byte 0x68,0x0c,0x14,0xf2 @ sha1p q0,q2,q12 + INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 + INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 6 + INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 vadd.i32 q12,q9,q4 -.byte 0x88,0xa3,0xba,0xf3 @ sha1su1 q5,q4 -.byte 0x48,0xcc,0x3e,0xf2 @ sha1su0 q6,q7,q4 -.byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 7 -.byte 0x6a,0x0c,0x16,0xf2 @ sha1p q0,q3,q13 + INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 + INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 7 + INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 vadd.i32 q13,q9,q5 -.byte 0x8a,0xc3,0xba,0xf3 @ sha1su1 q6,q5 -.byte 0x4a,0xec,0x38,0xf2 @ sha1su0 q7,q4,q5 
-.byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 8 -.byte 0x68,0x0c,0x14,0xf2 @ sha1p q0,q2,q12 + INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 + INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 8 + INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 vadd.i32 q12,q10,q6 -.byte 0x8c,0xe3,0xba,0xf3 @ sha1su1 q7,q6 -.byte 0x4c,0x8c,0x3a,0xf2 @ sha1su0 q4,q5,q6 -.byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 9 -.byte 0x6a,0x0c,0x16,0xf2 @ sha1p q0,q3,q13 + INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 + INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 9 + INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 vadd.i32 q13,q10,q7 -.byte 0x8e,0x83,0xba,0xf3 @ sha1su1 q4,q7 -.byte 0x4e,0xac,0x3c,0xf2 @ sha1su0 q5,q6,q7 -.byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 10 -.byte 0x68,0x0c,0x24,0xf2 @ sha1m q0,q2,q12 + INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 + INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 10 + INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12 vadd.i32 q12,q10,q4 -.byte 0x88,0xa3,0xba,0xf3 @ sha1su1 q5,q4 -.byte 0x48,0xcc,0x3e,0xf2 @ sha1su0 q6,q7,q4 -.byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 11 -.byte 0x6a,0x0c,0x26,0xf2 @ sha1m q0,q3,q13 + INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 + INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 11 + INST(0x6a,0x0c,0x26,0xe2) @ sha1m q0,q3,q13 vadd.i32 q13,q10,q5 -.byte 0x8a,0xc3,0xba,0xf3 @ sha1su1 q6,q5 -.byte 0x4a,0xec,0x38,0xf2 @ sha1su0 q7,q4,q5 -.byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 12 -.byte 0x68,0x0c,0x24,0xf2 @ sha1m q0,q2,q12 + INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 + INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 12 + INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12 vadd.i32 q12,q10,q6 -.byte 0x8c,0xe3,0xba,0xf3 @ sha1su1 q7,q6 -.byte 0x4c,0x8c,0x3a,0xf2 @ sha1su0 q4,q5,q6 -.byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 13 -.byte 0x6a,0x0c,0x26,0xf2 @ sha1m q0,q3,q13 + INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 + INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 13 + INST(0x6a,0x0c,0x26,0xe2) @ sha1m q0,q3,q13 vadd.i32 q13,q11,q7 -.byte 0x8e,0x83,0xba,0xf3 @ sha1su1 q4,q7 -.byte 0x4e,0xac,0x3c,0xf2 @ sha1su0 q5,q6,q7 -.byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 14 -.byte 0x68,0x0c,0x24,0xf2 @ sha1m q0,q2,q12 + INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 + INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 14 + INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12 vadd.i32 q12,q11,q4 -.byte 0x88,0xa3,0xba,0xf3 @ sha1su1 q5,q4 -.byte 0x48,0xcc,0x3e,0xf2 @ sha1su0 q6,q7,q4 -.byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 15 -.byte 0x6a,0x0c,0x16,0xf2 @ sha1p q0,q3,q13 + INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 + INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 15 + INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 vadd.i32 q13,q11,q5 -.byte 0x8a,0xc3,0xba,0xf3 @ sha1su1 q6,q5 -.byte 0x4a,0xec,0x38,0xf2 @ sha1su0 q7,q4,q5 -.byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 16 -.byte 0x68,0x0c,0x14,0xf2 @ sha1p q0,q2,q12 + INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 + INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 16 + INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 vadd.i32 q12,q11,q6 -.byte 0x8c,0xe3,0xba,0xf3 @ sha1su1 q7,q6 -.byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 17 -.byte 0x6a,0x0c,0x16,0xf2 @ sha1p q0,q3,q13 + INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 + 
INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 17 + INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 vadd.i32 q13,q11,q7 -.byte 0xc0,0x62,0xb9,0xf3 @ sha1h q3,q0 @ 18 -.byte 0x68,0x0c,0x14,0xf2 @ sha1p q0,q2,q12 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 18 + INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 -.byte 0xc0,0x42,0xb9,0xf3 @ sha1h q2,q0 @ 19 -.byte 0x6a,0x0c,0x16,0xf2 @ sha1p q0,q3,q13 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 19 + INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 vadd.i32 q1,q1,q2 vadd.i32 q0,q0,q14 diff --git a/packager/third_party/boringssl/linux-arm/crypto/sha/sha256-armv4.S b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/sha256-armv4.S similarity index 99% rename from packager/third_party/boringssl/linux-arm/crypto/sha/sha256-armv4.S rename to packager/third_party/boringssl/linux-arm/crypto/fipsmodule/sha256-armv4.S index 6040041322..f37fd7c7cf 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/sha/sha256-armv4.S +++ b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/sha256-armv4.S @@ -1,4 +1,11 @@ #if defined(__arm__) +@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +@ +@ Licensed under the OpenSSL license (the "License"). You may not use +@ this file except in compliance with the License. You can obtain a copy +@ in the file LICENSE in the source distribution or at +@ https://www.openssl.org/source/license.html + @ ==================================================================== @ Written by Andy Polyakov for the OpenSSL @@ -45,16 +52,11 @@ #endif .text -#if __ARM_ARCH__<7 -.code 32 -#else +#if defined(__thumb2__) .syntax unified -# if defined(__thumb2__) && !defined(__APPLE__) -# define adrl adr .thumb -# else +#else .code 32 -# endif #endif .type K256,%object @@ -89,10 +91,10 @@ K256: .type sha256_block_data_order,%function sha256_block_data_order: .Lsha256_block_data_order: -#if __ARM_ARCH__<7 +#if __ARM_ARCH__<7 && !defined(__thumb2__) sub r3,pc,#8 @ sha256_block_data_order #else - adr r3,sha256_block_data_order + adr r3,.Lsha256_block_data_order #endif #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) ldr r12,.LOPENSSL_armcap @@ -1878,13 +1880,14 @@ sha256_block_data_order: .globl sha256_block_data_order_neon .hidden sha256_block_data_order_neon .type sha256_block_data_order_neon,%function -.align 4 +.align 5 +.skip 16 sha256_block_data_order_neon: .LNEON: stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} sub r11,sp,#16*4+16 - adrl r14,K256 + adr r14,K256 bic r11,r11,#15 @ align for 128-bit stores mov r12,sp mov sp,r11 @ alloca @@ -2660,7 +2663,7 @@ sha256_block_data_order_neon: #endif #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) -# if defined(__thumb2__) && !defined(__APPLE__) +# if defined(__thumb2__) # define INST(a,b,c,d) .byte c,d|0xc,a,b # else # define INST(a,b,c,d) .byte a,b,c,d @@ -2671,16 +2674,11 @@ sha256_block_data_order_neon: sha256_block_data_order_armv8: .LARMv8: vld1.32 {q0,q1},[r0] -# ifdef __APPLE__ sub r3,r3,#256+32 -# elif defined(__thumb2__) - adr r3,.LARMv8 - sub r3,r3,#.LARMv8-K256 -# else - adrl r3,K256 -# endif add r2,r1,r2,lsl#6 @ len to point at the end of inp + b .Loop_v8 +.align 4 .Loop_v8: vld1.8 {q8,q9},[r1]! vld1.8 {q10,q11},[r1]! 
diff --git a/packager/third_party/boringssl/linux-arm/crypto/sha/sha512-armv4.S b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/sha512-armv4.S similarity index 98% rename from packager/third_party/boringssl/linux-arm/crypto/sha/sha512-armv4.S rename to packager/third_party/boringssl/linux-arm/crypto/fipsmodule/sha512-armv4.S index 93a7bf8566..bbeddf9220 100644 --- a/packager/third_party/boringssl/linux-arm/crypto/sha/sha512-armv4.S +++ b/packager/third_party/boringssl/linux-arm/crypto/fipsmodule/sha512-armv4.S @@ -1,4 +1,11 @@ #if defined(__arm__) +@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +@ +@ Licensed under the OpenSSL license (the "License"). You may not use +@ this file except in compliance with the License. You can obtain a copy +@ in the file LICENSE in the source distribution or at +@ https://www.openssl.org/source/license.html + @ ==================================================================== @ Written by Andy Polyakov for the OpenSSL @@ -68,16 +75,12 @@ #endif .text -#if __ARM_ARCH__<7 || defined(__APPLE__) -.code 32 -#else +#if defined(__thumb2__) .syntax unified -# ifdef __thumb2__ -# define adrl adr .thumb -# else +# define adrl adr +#else .code 32 -# endif #endif .type K512,%object @@ -137,10 +140,10 @@ K512: .type sha512_block_data_order,%function sha512_block_data_order: .Lsha512_block_data_order: -#if __ARM_ARCH__<7 +#if __ARM_ARCH__<7 && !defined(__thumb2__) sub r3,pc,#8 @ sha512_block_data_order #else - adr r3,sha512_block_data_order + adr r3,.Lsha512_block_data_order #endif #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) ldr r12,.LOPENSSL_armcap diff --git a/packager/third_party/boringssl/linux-x86/crypto/chacha/chacha-x86.S b/packager/third_party/boringssl/linux-x86/crypto/chacha/chacha-x86.S index d3c39ace9b..519081bb98 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/chacha/chacha-x86.S +++ b/packager/third_party/boringssl/linux-x86/crypto/chacha/chacha-x86.S @@ -1,5 +1,4 @@ #if defined(__i386__) -.file "chacha-x86.S" .text .globl ChaCha20_ctr32 .hidden ChaCha20_ctr32 diff --git a/packager/third_party/boringssl/linux-x86/crypto/aes/aes-586.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/aes-586.S similarity index 99% rename from packager/third_party/boringssl/linux-x86/crypto/aes/aes-586.S rename to packager/third_party/boringssl/linux-x86/crypto/fipsmodule/aes-586.S index 74282748ce..319ed627f5 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/aes/aes-586.S +++ b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/aes-586.S @@ -1,5 +1,4 @@ #if defined(__i386__) -.file "aes-586.S" .text .hidden _x86_AES_encrypt_compact .type _x86_AES_encrypt_compact,@function diff --git a/packager/third_party/boringssl/linux-x86/crypto/aes/aesni-x86.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/aesni-x86.S similarity index 99% rename from packager/third_party/boringssl/linux-x86/crypto/aes/aesni-x86.S rename to packager/third_party/boringssl/linux-x86/crypto/fipsmodule/aesni-x86.S index aec110d4b1..cc53fa46df 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/aes/aesni-x86.S +++ b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/aesni-x86.S @@ -1,5 +1,4 @@ #if defined(__i386__) -.file "src/crypto/aes/asm/aesni-x86.S" .text .globl aesni_encrypt .hidden aesni_encrypt diff --git a/packager/third_party/boringssl/linux-x86/crypto/bn/bn-586.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/bn-586.S similarity index 99% rename from 
packager/third_party/boringssl/linux-x86/crypto/bn/bn-586.S rename to packager/third_party/boringssl/linux-x86/crypto/fipsmodule/bn-586.S index 773beff9c1..cc067f717e 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/bn/bn-586.S +++ b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/bn-586.S @@ -1,5 +1,4 @@ #if defined(__i386__) -.file "src/crypto/bn/asm/bn-586.S" .text .globl bn_mul_add_words .hidden bn_mul_add_words diff --git a/packager/third_party/boringssl/linux-x86/crypto/bn/co-586.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/co-586.S similarity index 99% rename from packager/third_party/boringssl/linux-x86/crypto/bn/co-586.S rename to packager/third_party/boringssl/linux-x86/crypto/fipsmodule/co-586.S index e41c3a1dff..56834d0a6f 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/bn/co-586.S +++ b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/co-586.S @@ -1,5 +1,4 @@ #if defined(__i386__) -.file "src/crypto/bn/asm/co-586.S" .text .globl bn_mul_comba8 .hidden bn_mul_comba8 diff --git a/packager/third_party/boringssl/linux-x86/crypto/modes/ghash-x86.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/ghash-x86.S similarity index 81% rename from packager/third_party/boringssl/linux-x86/crypto/modes/ghash-x86.S rename to packager/third_party/boringssl/linux-x86/crypto/fipsmodule/ghash-x86.S index 28720889a1..a384d9a039 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/modes/ghash-x86.S +++ b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/ghash-x86.S @@ -1,211 +1,5 @@ #if defined(__i386__) -.file "ghash-x86.S" .text -.globl gcm_gmult_4bit_x86 -.hidden gcm_gmult_4bit_x86 -.type gcm_gmult_4bit_x86,@function -.align 16 -gcm_gmult_4bit_x86: -.L_gcm_gmult_4bit_x86_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - subl $84,%esp - movl 104(%esp),%edi - movl 108(%esp),%esi - movl (%edi),%ebp - movl 4(%edi),%edx - movl 8(%edi),%ecx - movl 12(%edi),%ebx - movl $0,16(%esp) - movl $471859200,20(%esp) - movl $943718400,24(%esp) - movl $610271232,28(%esp) - movl $1887436800,32(%esp) - movl $1822425088,36(%esp) - movl $1220542464,40(%esp) - movl $1423966208,44(%esp) - movl $3774873600,48(%esp) - movl $4246732800,52(%esp) - movl $3644850176,56(%esp) - movl $3311403008,60(%esp) - movl $2441084928,64(%esp) - movl $2376073216,68(%esp) - movl $2847932416,72(%esp) - movl $3051356160,76(%esp) - movl %ebp,(%esp) - movl %edx,4(%esp) - movl %ecx,8(%esp) - movl %ebx,12(%esp) - shrl $20,%ebx - andl $240,%ebx - movl 4(%esi,%ebx,1),%ebp - movl (%esi,%ebx,1),%edx - movl 12(%esi,%ebx,1),%ecx - movl 8(%esi,%ebx,1),%ebx - xorl %eax,%eax - movl $15,%edi - jmp .L000x86_loop -.align 16 -.L000x86_loop: - movb %bl,%al - shrdl $4,%ecx,%ebx - andb $15,%al - shrdl $4,%edx,%ecx - shrdl $4,%ebp,%edx - shrl $4,%ebp - xorl 16(%esp,%eax,4),%ebp - movb (%esp,%edi,1),%al - andb $240,%al - xorl 8(%esi,%eax,1),%ebx - xorl 12(%esi,%eax,1),%ecx - xorl (%esi,%eax,1),%edx - xorl 4(%esi,%eax,1),%ebp - decl %edi - js .L001x86_break - movb %bl,%al - shrdl $4,%ecx,%ebx - andb $15,%al - shrdl $4,%edx,%ecx - shrdl $4,%ebp,%edx - shrl $4,%ebp - xorl 16(%esp,%eax,4),%ebp - movb (%esp,%edi,1),%al - shlb $4,%al - xorl 8(%esi,%eax,1),%ebx - xorl 12(%esi,%eax,1),%ecx - xorl (%esi,%eax,1),%edx - xorl 4(%esi,%eax,1),%ebp - jmp .L000x86_loop -.align 16 -.L001x86_break: - bswap %ebx - bswap %ecx - bswap %edx - bswap %ebp - movl 104(%esp),%edi - movl %ebx,12(%edi) - movl %ecx,8(%edi) - movl %edx,4(%edi) - movl %ebp,(%edi) - addl $84,%esp - popl %edi - 
popl %esi - popl %ebx - popl %ebp - ret -.size gcm_gmult_4bit_x86,.-.L_gcm_gmult_4bit_x86_begin -.globl gcm_ghash_4bit_x86 -.hidden gcm_ghash_4bit_x86 -.type gcm_ghash_4bit_x86,@function -.align 16 -gcm_ghash_4bit_x86: -.L_gcm_ghash_4bit_x86_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - subl $84,%esp - movl 104(%esp),%ebx - movl 108(%esp),%esi - movl 112(%esp),%edi - movl 116(%esp),%ecx - addl %edi,%ecx - movl %ecx,116(%esp) - movl (%ebx),%ebp - movl 4(%ebx),%edx - movl 8(%ebx),%ecx - movl 12(%ebx),%ebx - movl $0,16(%esp) - movl $471859200,20(%esp) - movl $943718400,24(%esp) - movl $610271232,28(%esp) - movl $1887436800,32(%esp) - movl $1822425088,36(%esp) - movl $1220542464,40(%esp) - movl $1423966208,44(%esp) - movl $3774873600,48(%esp) - movl $4246732800,52(%esp) - movl $3644850176,56(%esp) - movl $3311403008,60(%esp) - movl $2441084928,64(%esp) - movl $2376073216,68(%esp) - movl $2847932416,72(%esp) - movl $3051356160,76(%esp) -.align 16 -.L002x86_outer_loop: - xorl 12(%edi),%ebx - xorl 8(%edi),%ecx - xorl 4(%edi),%edx - xorl (%edi),%ebp - movl %ebx,12(%esp) - movl %ecx,8(%esp) - movl %edx,4(%esp) - movl %ebp,(%esp) - shrl $20,%ebx - andl $240,%ebx - movl 4(%esi,%ebx,1),%ebp - movl (%esi,%ebx,1),%edx - movl 12(%esi,%ebx,1),%ecx - movl 8(%esi,%ebx,1),%ebx - xorl %eax,%eax - movl $15,%edi - jmp .L003x86_loop -.align 16 -.L003x86_loop: - movb %bl,%al - shrdl $4,%ecx,%ebx - andb $15,%al - shrdl $4,%edx,%ecx - shrdl $4,%ebp,%edx - shrl $4,%ebp - xorl 16(%esp,%eax,4),%ebp - movb (%esp,%edi,1),%al - andb $240,%al - xorl 8(%esi,%eax,1),%ebx - xorl 12(%esi,%eax,1),%ecx - xorl (%esi,%eax,1),%edx - xorl 4(%esi,%eax,1),%ebp - decl %edi - js .L004x86_break - movb %bl,%al - shrdl $4,%ecx,%ebx - andb $15,%al - shrdl $4,%edx,%ecx - shrdl $4,%ebp,%edx - shrl $4,%ebp - xorl 16(%esp,%eax,4),%ebp - movb (%esp,%edi,1),%al - shlb $4,%al - xorl 8(%esi,%eax,1),%ebx - xorl 12(%esi,%eax,1),%ecx - xorl (%esi,%eax,1),%edx - xorl 4(%esi,%eax,1),%ebp - jmp .L003x86_loop -.align 16 -.L004x86_break: - bswap %ebx - bswap %ecx - bswap %edx - bswap %ebp - movl 112(%esp),%edi - leal 16(%edi),%edi - cmpl 116(%esp),%edi - movl %edi,112(%esp) - jb .L002x86_outer_loop - movl 104(%esp),%edi - movl %ebx,12(%edi) - movl %ecx,8(%edi) - movl %edx,4(%edi) - movl %ebp,(%edi) - addl $84,%esp - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size gcm_ghash_4bit_x86,.-.L_gcm_ghash_4bit_x86_begin .globl gcm_gmult_4bit_mmx .hidden gcm_gmult_4bit_mmx .type gcm_gmult_4bit_mmx,@function @@ -218,10 +12,10 @@ gcm_gmult_4bit_mmx: pushl %edi movl 20(%esp),%edi movl 24(%esp),%esi - call .L005pic_point -.L005pic_point: + call .L000pic_point +.L000pic_point: popl %eax - leal .Lrem_4bit-.L005pic_point(%eax),%eax + leal .Lrem_4bit-.L000pic_point(%eax),%eax movzbl 15(%edi),%ebx xorl %ecx,%ecx movl %ebx,%edx @@ -232,9 +26,9 @@ gcm_gmult_4bit_mmx: movq 8(%esi,%ecx,1),%mm0 movq (%esi,%ecx,1),%mm1 movd %mm0,%ebx - jmp .L006mmx_loop + jmp .L001mmx_loop .align 16 -.L006mmx_loop: +.L001mmx_loop: psrlq $4,%mm0 andl $15,%ebx movq %mm1,%mm2 @@ -248,7 +42,7 @@ gcm_gmult_4bit_mmx: pxor (%esi,%edx,1),%mm1 movl %ecx,%edx pxor %mm2,%mm0 - js .L007mmx_break + js .L002mmx_break shlb $4,%cl andl $15,%ebx psrlq $4,%mm0 @@ -261,9 +55,9 @@ gcm_gmult_4bit_mmx: movd %mm0,%ebx pxor (%esi,%ecx,1),%mm1 pxor %mm2,%mm0 - jmp .L006mmx_loop + jmp .L001mmx_loop .align 16 -.L007mmx_break: +.L002mmx_break: shlb $4,%cl andl $15,%ebx psrlq $4,%mm0 @@ -321,10 +115,10 @@ gcm_ghash_4bit_mmx: movl 28(%esp),%ecx movl 32(%esp),%edx movl %esp,%ebp - call .L008pic_point 
-.L008pic_point: + call .L003pic_point +.L003pic_point: popl %esi - leal .Lrem_8bit-.L008pic_point(%esi),%esi + leal .Lrem_8bit-.L003pic_point(%esi),%esi subl $544,%esp andl $-64,%esp subl $16,%esp @@ -563,7 +357,7 @@ gcm_ghash_4bit_mmx: movl 8(%eax),%ebx movl 12(%eax),%edx .align 16 -.L009outer: +.L004outer: xorl 12(%ecx),%edx xorl 8(%ecx),%ebx pxor (%ecx),%mm6 @@ -898,7 +692,7 @@ gcm_ghash_4bit_mmx: pshufw $27,%mm6,%mm6 bswap %ebx cmpl 552(%esp),%ecx - jne .L009outer + jne .L004outer movl 544(%esp),%eax movl %edx,12(%eax) movl %ebx,8(%eax) @@ -919,10 +713,10 @@ gcm_init_clmul: .L_gcm_init_clmul_begin: movl 4(%esp),%edx movl 8(%esp),%eax - call .L010pic -.L010pic: + call .L005pic +.L005pic: popl %ecx - leal .Lbswap-.L010pic(%ecx),%ecx + leal .Lbswap-.L005pic(%ecx),%ecx movdqu (%eax),%xmm2 pshufd $78,%xmm2,%xmm2 pshufd $255,%xmm2,%xmm4 @@ -989,10 +783,10 @@ gcm_gmult_clmul: .L_gcm_gmult_clmul_begin: movl 4(%esp),%eax movl 8(%esp),%edx - call .L011pic -.L011pic: + call .L006pic +.L006pic: popl %ecx - leal .Lbswap-.L011pic(%ecx),%ecx + leal .Lbswap-.L006pic(%ecx),%ecx movdqu (%eax),%xmm0 movdqa (%ecx),%xmm5 movups (%edx),%xmm2 @@ -1049,16 +843,16 @@ gcm_ghash_clmul: movl 24(%esp),%edx movl 28(%esp),%esi movl 32(%esp),%ebx - call .L012pic -.L012pic: + call .L007pic +.L007pic: popl %ecx - leal .Lbswap-.L012pic(%ecx),%ecx + leal .Lbswap-.L007pic(%ecx),%ecx movdqu (%eax),%xmm0 movdqa (%ecx),%xmm5 movdqu (%edx),%xmm2 .byte 102,15,56,0,197 subl $16,%ebx - jz .L013odd_tail + jz .L008odd_tail movdqu (%esi),%xmm3 movdqu 16(%esi),%xmm6 .byte 102,15,56,0,221 @@ -1075,10 +869,10 @@ gcm_ghash_clmul: movups 16(%edx),%xmm2 nop subl $32,%ebx - jbe .L014even_tail - jmp .L015mod_loop + jbe .L009even_tail + jmp .L010mod_loop .align 32 -.L015mod_loop: +.L010mod_loop: pshufd $78,%xmm0,%xmm4 movdqa %xmm0,%xmm1 pxor %xmm0,%xmm4 @@ -1133,8 +927,8 @@ gcm_ghash_clmul: .byte 102,15,58,68,221,0 leal 32(%esi),%esi subl $32,%ebx - ja .L015mod_loop -.L014even_tail: + ja .L010mod_loop +.L009even_tail: pshufd $78,%xmm0,%xmm4 movdqa %xmm0,%xmm1 pxor %xmm0,%xmm4 @@ -1173,9 +967,9 @@ gcm_ghash_clmul: psrlq $1,%xmm0 pxor %xmm1,%xmm0 testl %ebx,%ebx - jnz .L016done + jnz .L011done movups (%edx),%xmm2 -.L013odd_tail: +.L008odd_tail: movdqu (%esi),%xmm3 .byte 102,15,56,0,221 pxor %xmm3,%xmm0 @@ -1214,7 +1008,7 @@ gcm_ghash_clmul: pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 -.L016done: +.L011done: .byte 102,15,56,0,197 movdqu %xmm0,(%eax) popl %edi diff --git a/packager/third_party/boringssl/linux-x86/crypto/md5/md5-586.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/md5-586.S similarity index 99% rename from packager/third_party/boringssl/linux-x86/crypto/md5/md5-586.S rename to packager/third_party/boringssl/linux-x86/crypto/fipsmodule/md5-586.S index 734b941a82..7237f95bec 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/md5/md5-586.S +++ b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/md5-586.S @@ -1,5 +1,4 @@ #if defined(__i386__) -.file "src/crypto/md5/asm/md5-586.S" .text .globl md5_block_asm_data_order .hidden md5_block_asm_data_order diff --git a/packager/third_party/boringssl/linux-x86/crypto/sha/sha1-586.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/sha1-586.S similarity index 99% rename from packager/third_party/boringssl/linux-x86/crypto/sha/sha1-586.S rename to packager/third_party/boringssl/linux-x86/crypto/fipsmodule/sha1-586.S index 58d0bc1277..2c022ec4af 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/sha/sha1-586.S +++ 
b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/sha1-586.S @@ -1,5 +1,4 @@ #if defined(__i386__) -.file "sha1-586.S" .text .globl sha1_block_data_order .hidden sha1_block_data_order diff --git a/packager/third_party/boringssl/linux-x86/crypto/sha/sha256-586.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/sha256-586.S similarity index 99% rename from packager/third_party/boringssl/linux-x86/crypto/sha/sha256-586.S rename to packager/third_party/boringssl/linux-x86/crypto/fipsmodule/sha256-586.S index 38acbd8374..984758f3b2 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/sha/sha256-586.S +++ b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/sha256-586.S @@ -1,5 +1,4 @@ #if defined(__i386__) -.file "sha512-586.S" .text .globl sha256_block_data_order .hidden sha256_block_data_order diff --git a/packager/third_party/boringssl/linux-x86/crypto/sha/sha512-586.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/sha512-586.S similarity index 99% rename from packager/third_party/boringssl/linux-x86/crypto/sha/sha512-586.S rename to packager/third_party/boringssl/linux-x86/crypto/fipsmodule/sha512-586.S index a9284000b3..3617ce48b4 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/sha/sha512-586.S +++ b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/sha512-586.S @@ -1,5 +1,4 @@ #if defined(__i386__) -.file "sha512-586.S" .text .globl sha512_block_data_order .hidden sha512_block_data_order diff --git a/packager/third_party/boringssl/linux-x86/crypto/aes/vpaes-x86.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/vpaes-x86.S similarity index 99% rename from packager/third_party/boringssl/linux-x86/crypto/aes/vpaes-x86.S rename to packager/third_party/boringssl/linux-x86/crypto/fipsmodule/vpaes-x86.S index 9aede39484..0417b7e353 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/aes/vpaes-x86.S +++ b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/vpaes-x86.S @@ -1,5 +1,4 @@ #if defined(__i386__) -.file "vpaes-x86.S" .text .align 64 .L_vpaes_consts: diff --git a/packager/third_party/boringssl/linux-x86/crypto/bn/x86-mont.S b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/x86-mont.S similarity index 86% rename from packager/third_party/boringssl/linux-x86/crypto/bn/x86-mont.S rename to packager/third_party/boringssl/linux-x86/crypto/fipsmodule/x86-mont.S index 1569b2cff1..3fb668826b 100644 --- a/packager/third_party/boringssl/linux-x86/crypto/bn/x86-mont.S +++ b/packager/third_party/boringssl/linux-x86/crypto/fipsmodule/x86-mont.S @@ -1,5 +1,4 @@ #if defined(__i386__) -.file "src/crypto/bn/asm/x86-mont.S" .text .globl bn_mul_mont .hidden bn_mul_mont @@ -17,39 +16,54 @@ bn_mul_mont: jl .L000just_leave leal 20(%esp),%esi leal 24(%esp),%edx - movl %esp,%ebp addl $2,%edi negl %edi - leal -32(%esp,%edi,4),%esp + leal -32(%esp,%edi,4),%ebp negl %edi - movl %esp,%eax + movl %ebp,%eax subl %edx,%eax andl $2047,%eax - subl %eax,%esp - xorl %esp,%edx + subl %eax,%ebp + xorl %ebp,%edx andl $2048,%edx xorl $2048,%edx - subl %edx,%esp - andl $-64,%esp + subl %edx,%ebp + andl $-64,%ebp + movl %esp,%eax + subl %ebp,%eax + andl $-4096,%eax + movl %esp,%edx + leal (%ebp,%eax,1),%esp + movl (%esp),%eax + cmpl %ebp,%esp + ja .L001page_walk + jmp .L002page_walk_done +.align 16 +.L001page_walk: + leal -4096(%esp),%esp + movl (%esp),%eax + cmpl %ebp,%esp + ja .L001page_walk +.L002page_walk_done: movl (%esi),%eax movl 4(%esi),%ebx movl 8(%esi),%ecx - movl 12(%esi),%edx + movl 12(%esi),%ebp movl 
16(%esi),%esi movl (%esi),%esi movl %eax,4(%esp) movl %ebx,8(%esp) movl %ecx,12(%esp) - movl %edx,16(%esp) + movl %ebp,16(%esp) movl %esi,20(%esp) leal -3(%edi),%ebx - movl %ebp,24(%esp) - call .L001PIC_me_up -.L001PIC_me_up: + movl %edx,24(%esp) + call .L003PIC_me_up +.L003PIC_me_up: popl %eax - leal OPENSSL_ia32cap_P-.L001PIC_me_up(%eax),%eax + leal OPENSSL_ia32cap_P-.L003PIC_me_up(%eax),%eax btl $26,(%eax) - jnc .L002non_sse2 + jnc .L004non_sse2 movl $-1,%eax movd %eax,%mm7 movl 8(%esp),%esi @@ -73,7 +87,7 @@ bn_mul_mont: psrlq $32,%mm3 incl %ecx .align 16 -.L0031st: +.L0051st: pmuludq %mm4,%mm0 pmuludq %mm5,%mm1 paddq %mm0,%mm2 @@ -88,7 +102,7 @@ bn_mul_mont: psrlq $32,%mm3 leal 1(%ecx),%ecx cmpl %ebx,%ecx - jl .L0031st + jl .L0051st pmuludq %mm4,%mm0 pmuludq %mm5,%mm1 paddq %mm0,%mm2 @@ -102,7 +116,7 @@ bn_mul_mont: paddq %mm2,%mm3 movq %mm3,32(%esp,%ebx,4) incl %edx -.L004outer: +.L006outer: xorl %ecx,%ecx movd (%edi,%edx,4),%mm4 movd (%esi),%mm5 @@ -124,7 +138,7 @@ bn_mul_mont: paddq %mm6,%mm2 incl %ecx decl %ebx -.L005inner: +.L007inner: pmuludq %mm4,%mm0 pmuludq %mm5,%mm1 paddq %mm0,%mm2 @@ -141,7 +155,7 @@ bn_mul_mont: paddq %mm6,%mm2 decl %ebx leal 1(%ecx),%ecx - jnz .L005inner + jnz .L007inner movl %ecx,%ebx pmuludq %mm4,%mm0 pmuludq %mm5,%mm1 @@ -159,11 +173,11 @@ bn_mul_mont: movq %mm3,32(%esp,%ebx,4) leal 1(%edx),%edx cmpl %ebx,%edx - jle .L004outer + jle .L006outer emms - jmp .L006common_tail + jmp .L008common_tail .align 16 -.L002non_sse2: +.L004non_sse2: movl 8(%esp),%esi leal 1(%ebx),%ebp movl 12(%esp),%edi @@ -174,12 +188,12 @@ bn_mul_mont: leal 4(%edi,%ebx,4),%eax orl %edx,%ebp movl (%edi),%edi - jz .L007bn_sqr_mont + jz .L009bn_sqr_mont movl %eax,28(%esp) movl (%esi),%eax xorl %edx,%edx .align 16 -.L008mull: +.L010mull: movl %edx,%ebp mull %edi addl %eax,%ebp @@ -188,7 +202,7 @@ bn_mul_mont: movl (%esi,%ecx,4),%eax cmpl %ebx,%ecx movl %ebp,28(%esp,%ecx,4) - jl .L008mull + jl .L010mull movl %edx,%ebp mull %edi movl 20(%esp),%edi @@ -206,9 +220,9 @@ bn_mul_mont: movl 4(%esi),%eax adcl $0,%edx incl %ecx - jmp .L0092ndmadd + jmp .L0112ndmadd .align 16 -.L0101stmadd: +.L0121stmadd: movl %edx,%ebp mull %edi addl 32(%esp,%ecx,4),%ebp @@ -219,7 +233,7 @@ bn_mul_mont: adcl $0,%edx cmpl %ebx,%ecx movl %ebp,28(%esp,%ecx,4) - jl .L0101stmadd + jl .L0121stmadd movl %edx,%ebp mull %edi addl 32(%esp,%ebx,4),%eax @@ -242,7 +256,7 @@ bn_mul_mont: adcl $0,%edx movl $1,%ecx .align 16 -.L0092ndmadd: +.L0112ndmadd: movl %edx,%ebp mull %edi addl 32(%esp,%ecx,4),%ebp @@ -253,7 +267,7 @@ bn_mul_mont: adcl $0,%edx cmpl %ebx,%ecx movl %ebp,24(%esp,%ecx,4) - jl .L0092ndmadd + jl .L0112ndmadd movl %edx,%ebp mull %edi addl 32(%esp,%ebx,4),%ebp @@ -269,16 +283,16 @@ bn_mul_mont: movl %edx,32(%esp,%ebx,4) cmpl 28(%esp),%ecx movl %eax,36(%esp,%ebx,4) - je .L006common_tail + je .L008common_tail movl (%ecx),%edi movl 8(%esp),%esi movl %ecx,12(%esp) xorl %ecx,%ecx xorl %edx,%edx movl (%esi),%eax - jmp .L0101stmadd + jmp .L0121stmadd .align 16 -.L007bn_sqr_mont: +.L009bn_sqr_mont: movl %ebx,(%esp) movl %ecx,12(%esp) movl %edi,%eax @@ -289,7 +303,7 @@ bn_mul_mont: andl $1,%ebx incl %ecx .align 16 -.L011sqr: +.L013sqr: movl (%esi,%ecx,4),%eax movl %edx,%ebp mull %edi @@ -301,7 +315,7 @@ bn_mul_mont: cmpl (%esp),%ecx movl %eax,%ebx movl %ebp,28(%esp,%ecx,4) - jl .L011sqr + jl .L013sqr movl (%esi,%ecx,4),%eax movl %edx,%ebp mull %edi @@ -325,7 +339,7 @@ bn_mul_mont: movl 4(%esi),%eax movl $1,%ecx .align 16 -.L0123rdmadd: +.L0143rdmadd: movl %edx,%ebp mull %edi addl 32(%esp,%ecx,4),%ebp @@ -344,7 +358,7 @@ 
bn_mul_mont: adcl $0,%edx cmpl %ebx,%ecx movl %ebp,24(%esp,%ecx,4) - jl .L0123rdmadd + jl .L0143rdmadd movl %edx,%ebp mull %edi addl 32(%esp,%ebx,4),%ebp @@ -360,7 +374,7 @@ bn_mul_mont: movl %edx,32(%esp,%ebx,4) cmpl %ebx,%ecx movl %eax,36(%esp,%ebx,4) - je .L006common_tail + je .L008common_tail movl 4(%esi,%ecx,4),%edi leal 1(%ecx),%ecx movl %edi,%eax @@ -372,12 +386,12 @@ bn_mul_mont: xorl %ebp,%ebp cmpl %ebx,%ecx leal 1(%ecx),%ecx - je .L013sqrlast + je .L015sqrlast movl %edx,%ebx shrl $1,%edx andl $1,%ebx .align 16 -.L014sqradd: +.L016sqradd: movl (%esi,%ecx,4),%eax movl %edx,%ebp mull %edi @@ -393,13 +407,13 @@ bn_mul_mont: cmpl (%esp),%ecx movl %ebp,28(%esp,%ecx,4) movl %eax,%ebx - jle .L014sqradd + jle .L016sqradd movl %edx,%ebp addl %edx,%edx shrl $31,%ebp addl %ebx,%edx adcl $0,%ebp -.L013sqrlast: +.L015sqrlast: movl 20(%esp),%edi movl 16(%esp),%esi imull 32(%esp),%edi @@ -414,9 +428,9 @@ bn_mul_mont: adcl $0,%edx movl $1,%ecx movl 4(%esi),%eax - jmp .L0123rdmadd + jmp .L0143rdmadd .align 16 -.L006common_tail: +.L008common_tail: movl 16(%esp),%ebp movl 4(%esp),%edi leal 32(%esp),%esi @@ -424,25 +438,26 @@ bn_mul_mont: movl %ebx,%ecx xorl %edx,%edx .align 16 -.L015sub: +.L017sub: sbbl (%ebp,%edx,4),%eax movl %eax,(%edi,%edx,4) decl %ecx movl 4(%esi,%edx,4),%eax leal 1(%edx),%edx - jge .L015sub + jge .L017sub sbbl $0,%eax + andl %eax,%esi + notl %eax + movl %edi,%ebp + andl %eax,%ebp + orl %ebp,%esi .align 16 -.L016copy: - movl (%esi,%ebx,4),%edx - movl (%edi,%ebx,4),%ebp - xorl %ebp,%edx - andl %eax,%edx - xorl %ebp,%edx - movl %ecx,(%esi,%ebx,4) - movl %edx,(%edi,%ebx,4) +.L018copy: + movl (%esi,%ebx,4),%eax + movl %eax,(%edi,%ebx,4) + movl %ecx,32(%esp,%ebx,4) decl %ebx - jge .L016copy + jge .L018copy movl 24(%esp),%esp movl $1,%eax .L000just_leave: diff --git a/packager/third_party/boringssl/linux-x86/crypto/rc4/rc4-586.S b/packager/third_party/boringssl/linux-x86/crypto/rc4/rc4-586.S deleted file mode 100644 index d245589eca..0000000000 --- a/packager/third_party/boringssl/linux-x86/crypto/rc4/rc4-586.S +++ /dev/null @@ -1,350 +0,0 @@ -#if defined(__i386__) -.file "rc4-586.S" -.text -.globl asm_RC4 -.hidden asm_RC4 -.type asm_RC4,@function -.align 16 -asm_RC4: -.L_asm_RC4_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 20(%esp),%edi - movl 24(%esp),%edx - movl 28(%esp),%esi - movl 32(%esp),%ebp - xorl %eax,%eax - xorl %ebx,%ebx - cmpl $0,%edx - je .L000abort - movb (%edi),%al - movb 4(%edi),%bl - addl $8,%edi - leal (%esi,%edx,1),%ecx - subl %esi,%ebp - movl %ecx,24(%esp) - incb %al - cmpl $-1,256(%edi) - je .L001RC4_CHAR - movl (%edi,%eax,4),%ecx - andl $-4,%edx - jz .L002loop1 - movl %ebp,32(%esp) - testl $-8,%edx - jz .L003go4loop4 - call .L004PIC_me_up -.L004PIC_me_up: - popl %ebp - leal OPENSSL_ia32cap_P-.L004PIC_me_up(%ebp),%ebp - btl $26,(%ebp) - jnc .L003go4loop4 - movl 32(%esp),%ebp - andl $-8,%edx - leal -8(%esi,%edx,1),%edx - movl %edx,-4(%edi) - addb %cl,%bl - movl (%edi,%ebx,4),%edx - movl %ecx,(%edi,%ebx,4) - movl %edx,(%edi,%eax,4) - incl %eax - addl %ecx,%edx - movzbl %al,%eax - movzbl %dl,%edx - movq (%esi),%mm0 - movl (%edi,%eax,4),%ecx - movd (%edi,%edx,4),%mm2 - jmp .L005loop_mmx_enter -.align 16 -.L006loop_mmx: - addb %cl,%bl - psllq $56,%mm1 - movl (%edi,%ebx,4),%edx - movl %ecx,(%edi,%ebx,4) - movl %edx,(%edi,%eax,4) - incl %eax - addl %ecx,%edx - movzbl %al,%eax - movzbl %dl,%edx - pxor %mm1,%mm2 - movq (%esi),%mm0 - movq %mm2,-8(%ebp,%esi,1) - movl (%edi,%eax,4),%ecx - movd (%edi,%edx,4),%mm2 -.L005loop_mmx_enter: - addb %cl,%bl - 
movl (%edi,%ebx,4),%edx - movl %ecx,(%edi,%ebx,4) - movl %edx,(%edi,%eax,4) - incl %eax - addl %ecx,%edx - movzbl %al,%eax - movzbl %dl,%edx - pxor %mm0,%mm2 - movl (%edi,%eax,4),%ecx - movd (%edi,%edx,4),%mm1 - addb %cl,%bl - psllq $8,%mm1 - movl (%edi,%ebx,4),%edx - movl %ecx,(%edi,%ebx,4) - movl %edx,(%edi,%eax,4) - incl %eax - addl %ecx,%edx - movzbl %al,%eax - movzbl %dl,%edx - pxor %mm1,%mm2 - movl (%edi,%eax,4),%ecx - movd (%edi,%edx,4),%mm1 - addb %cl,%bl - psllq $16,%mm1 - movl (%edi,%ebx,4),%edx - movl %ecx,(%edi,%ebx,4) - movl %edx,(%edi,%eax,4) - incl %eax - addl %ecx,%edx - movzbl %al,%eax - movzbl %dl,%edx - pxor %mm1,%mm2 - movl (%edi,%eax,4),%ecx - movd (%edi,%edx,4),%mm1 - addb %cl,%bl - psllq $24,%mm1 - movl (%edi,%ebx,4),%edx - movl %ecx,(%edi,%ebx,4) - movl %edx,(%edi,%eax,4) - incl %eax - addl %ecx,%edx - movzbl %al,%eax - movzbl %dl,%edx - pxor %mm1,%mm2 - movl (%edi,%eax,4),%ecx - movd (%edi,%edx,4),%mm1 - addb %cl,%bl - psllq $32,%mm1 - movl (%edi,%ebx,4),%edx - movl %ecx,(%edi,%ebx,4) - movl %edx,(%edi,%eax,4) - incl %eax - addl %ecx,%edx - movzbl %al,%eax - movzbl %dl,%edx - pxor %mm1,%mm2 - movl (%edi,%eax,4),%ecx - movd (%edi,%edx,4),%mm1 - addb %cl,%bl - psllq $40,%mm1 - movl (%edi,%ebx,4),%edx - movl %ecx,(%edi,%ebx,4) - movl %edx,(%edi,%eax,4) - incl %eax - addl %ecx,%edx - movzbl %al,%eax - movzbl %dl,%edx - pxor %mm1,%mm2 - movl (%edi,%eax,4),%ecx - movd (%edi,%edx,4),%mm1 - addb %cl,%bl - psllq $48,%mm1 - movl (%edi,%ebx,4),%edx - movl %ecx,(%edi,%ebx,4) - movl %edx,(%edi,%eax,4) - incl %eax - addl %ecx,%edx - movzbl %al,%eax - movzbl %dl,%edx - pxor %mm1,%mm2 - movl (%edi,%eax,4),%ecx - movd (%edi,%edx,4),%mm1 - movl %ebx,%edx - xorl %ebx,%ebx - movb %dl,%bl - cmpl -4(%edi),%esi - leal 8(%esi),%esi - jb .L006loop_mmx - psllq $56,%mm1 - pxor %mm1,%mm2 - movq %mm2,-8(%ebp,%esi,1) - emms - cmpl 24(%esp),%esi - je .L007done - jmp .L002loop1 -.align 16 -.L003go4loop4: - leal -4(%esi,%edx,1),%edx - movl %edx,28(%esp) -.L008loop4: - addb %cl,%bl - movl (%edi,%ebx,4),%edx - movl %ecx,(%edi,%ebx,4) - movl %edx,(%edi,%eax,4) - addl %ecx,%edx - incb %al - andl $255,%edx - movl (%edi,%eax,4),%ecx - movl (%edi,%edx,4),%ebp - addb %cl,%bl - movl (%edi,%ebx,4),%edx - movl %ecx,(%edi,%ebx,4) - movl %edx,(%edi,%eax,4) - addl %ecx,%edx - incb %al - andl $255,%edx - rorl $8,%ebp - movl (%edi,%eax,4),%ecx - orl (%edi,%edx,4),%ebp - addb %cl,%bl - movl (%edi,%ebx,4),%edx - movl %ecx,(%edi,%ebx,4) - movl %edx,(%edi,%eax,4) - addl %ecx,%edx - incb %al - andl $255,%edx - rorl $8,%ebp - movl (%edi,%eax,4),%ecx - orl (%edi,%edx,4),%ebp - addb %cl,%bl - movl (%edi,%ebx,4),%edx - movl %ecx,(%edi,%ebx,4) - movl %edx,(%edi,%eax,4) - addl %ecx,%edx - incb %al - andl $255,%edx - rorl $8,%ebp - movl 32(%esp),%ecx - orl (%edi,%edx,4),%ebp - rorl $8,%ebp - xorl (%esi),%ebp - cmpl 28(%esp),%esi - movl %ebp,(%ecx,%esi,1) - leal 4(%esi),%esi - movl (%edi,%eax,4),%ecx - jb .L008loop4 - cmpl 24(%esp),%esi - je .L007done - movl 32(%esp),%ebp -.align 16 -.L002loop1: - addb %cl,%bl - movl (%edi,%ebx,4),%edx - movl %ecx,(%edi,%ebx,4) - movl %edx,(%edi,%eax,4) - addl %ecx,%edx - incb %al - andl $255,%edx - movl (%edi,%edx,4),%edx - xorb (%esi),%dl - leal 1(%esi),%esi - movl (%edi,%eax,4),%ecx - cmpl 24(%esp),%esi - movb %dl,-1(%ebp,%esi,1) - jb .L002loop1 - jmp .L007done -.align 16 -.L001RC4_CHAR: - movzbl (%edi,%eax,1),%ecx -.L009cloop1: - addb %cl,%bl - movzbl (%edi,%ebx,1),%edx - movb %cl,(%edi,%ebx,1) - movb %dl,(%edi,%eax,1) - addb %cl,%dl - movzbl (%edi,%edx,1),%edx - addb $1,%al - xorb 
(%esi),%dl - leal 1(%esi),%esi - movzbl (%edi,%eax,1),%ecx - cmpl 24(%esp),%esi - movb %dl,-1(%ebp,%esi,1) - jb .L009cloop1 -.L007done: - decb %al - movl %ebx,-4(%edi) - movb %al,-8(%edi) -.L000abort: - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size asm_RC4,.-.L_asm_RC4_begin -.globl asm_RC4_set_key -.hidden asm_RC4_set_key -.type asm_RC4_set_key,@function -.align 16 -asm_RC4_set_key: -.L_asm_RC4_set_key_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 20(%esp),%edi - movl 24(%esp),%ebp - movl 28(%esp),%esi - call .L010PIC_me_up -.L010PIC_me_up: - popl %edx - leal OPENSSL_ia32cap_P-.L010PIC_me_up(%edx),%edx - leal 8(%edi),%edi - leal (%esi,%ebp,1),%esi - negl %ebp - xorl %eax,%eax - movl %ebp,-4(%edi) - btl $20,(%edx) - jc .L011c1stloop -.align 16 -.L012w1stloop: - movl %eax,(%edi,%eax,4) - addb $1,%al - jnc .L012w1stloop - xorl %ecx,%ecx - xorl %edx,%edx -.align 16 -.L013w2ndloop: - movl (%edi,%ecx,4),%eax - addb (%esi,%ebp,1),%dl - addb %al,%dl - addl $1,%ebp - movl (%edi,%edx,4),%ebx - jnz .L014wnowrap - movl -4(%edi),%ebp -.L014wnowrap: - movl %eax,(%edi,%edx,4) - movl %ebx,(%edi,%ecx,4) - addb $1,%cl - jnc .L013w2ndloop - jmp .L015exit -.align 16 -.L011c1stloop: - movb %al,(%edi,%eax,1) - addb $1,%al - jnc .L011c1stloop - xorl %ecx,%ecx - xorl %edx,%edx - xorl %ebx,%ebx -.align 16 -.L016c2ndloop: - movb (%edi,%ecx,1),%al - addb (%esi,%ebp,1),%dl - addb %al,%dl - addl $1,%ebp - movb (%edi,%edx,1),%bl - jnz .L017cnowrap - movl -4(%edi),%ebp -.L017cnowrap: - movb %al,(%edi,%edx,1) - movb %bl,(%edi,%ecx,1) - addb $1,%cl - jnc .L016c2ndloop - movl $-1,256(%edi) -.L015exit: - xorl %eax,%eax - movl %eax,-8(%edi) - movl %eax,-4(%edi) - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size asm_RC4_set_key,.-.L_asm_RC4_set_key_begin -#endif diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/bn/rsaz-avx2.S b/packager/third_party/boringssl/linux-x86_64/crypto/bn/rsaz-avx2.S deleted file mode 100644 index cd334d95a6..0000000000 --- a/packager/third_party/boringssl/linux-x86_64/crypto/bn/rsaz-avx2.S +++ /dev/null @@ -1,34 +0,0 @@ -#if defined(__x86_64__) -.text - -.globl rsaz_avx2_eligible -.hidden rsaz_avx2_eligible -.type rsaz_avx2_eligible,@function -rsaz_avx2_eligible: - xorl %eax,%eax - .byte 0xf3,0xc3 -.size rsaz_avx2_eligible,.-rsaz_avx2_eligible - -.globl rsaz_1024_sqr_avx2 -.hidden rsaz_1024_sqr_avx2 -.globl rsaz_1024_mul_avx2 -.hidden rsaz_1024_mul_avx2 -.globl rsaz_1024_norm2red_avx2 -.hidden rsaz_1024_norm2red_avx2 -.globl rsaz_1024_red2norm_avx2 -.hidden rsaz_1024_red2norm_avx2 -.globl rsaz_1024_scatter5_avx2 -.hidden rsaz_1024_scatter5_avx2 -.globl rsaz_1024_gather5_avx2 -.hidden rsaz_1024_gather5_avx2 -.type rsaz_1024_sqr_avx2,@function -rsaz_1024_sqr_avx2: -rsaz_1024_mul_avx2: -rsaz_1024_norm2red_avx2: -rsaz_1024_red2norm_avx2: -rsaz_1024_scatter5_avx2: -rsaz_1024_gather5_avx2: -.byte 0x0f,0x0b - .byte 0xf3,0xc3 -.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 -#endif diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/bn/rsaz-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/bn/rsaz-x86_64.S deleted file mode 100644 index 21531d1c65..0000000000 --- a/packager/third_party/boringssl/linux-x86_64/crypto/bn/rsaz-x86_64.S +++ /dev/null @@ -1,1229 +0,0 @@ -#if defined(__x86_64__) -.text - -.extern OPENSSL_ia32cap_P -.hidden OPENSSL_ia32cap_P - -.globl rsaz_512_sqr -.hidden rsaz_512_sqr -.type rsaz_512_sqr,@function -.align 32 -rsaz_512_sqr: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - 
- subq $128+24,%rsp -.Lsqr_body: - movq %rdx,%rbp - movq (%rsi),%rdx - movq 8(%rsi),%rax - movq %rcx,128(%rsp) - jmp .Loop_sqr - -.align 32 -.Loop_sqr: - movl %r8d,128+8(%rsp) - - movq %rdx,%rbx - mulq %rdx - movq %rax,%r8 - movq 16(%rsi),%rax - movq %rdx,%r9 - - mulq %rbx - addq %rax,%r9 - movq 24(%rsi),%rax - movq %rdx,%r10 - adcq $0,%r10 - - mulq %rbx - addq %rax,%r10 - movq 32(%rsi),%rax - movq %rdx,%r11 - adcq $0,%r11 - - mulq %rbx - addq %rax,%r11 - movq 40(%rsi),%rax - movq %rdx,%r12 - adcq $0,%r12 - - mulq %rbx - addq %rax,%r12 - movq 48(%rsi),%rax - movq %rdx,%r13 - adcq $0,%r13 - - mulq %rbx - addq %rax,%r13 - movq 56(%rsi),%rax - movq %rdx,%r14 - adcq $0,%r14 - - mulq %rbx - addq %rax,%r14 - movq %rbx,%rax - movq %rdx,%r15 - adcq $0,%r15 - - addq %r8,%r8 - movq %r9,%rcx - adcq %r9,%r9 - - mulq %rax - movq %rax,(%rsp) - addq %rdx,%r8 - adcq $0,%r9 - - movq %r8,8(%rsp) - shrq $63,%rcx - - - movq 8(%rsi),%r8 - movq 16(%rsi),%rax - mulq %r8 - addq %rax,%r10 - movq 24(%rsi),%rax - movq %rdx,%rbx - adcq $0,%rbx - - mulq %r8 - addq %rax,%r11 - movq 32(%rsi),%rax - adcq $0,%rdx - addq %rbx,%r11 - movq %rdx,%rbx - adcq $0,%rbx - - mulq %r8 - addq %rax,%r12 - movq 40(%rsi),%rax - adcq $0,%rdx - addq %rbx,%r12 - movq %rdx,%rbx - adcq $0,%rbx - - mulq %r8 - addq %rax,%r13 - movq 48(%rsi),%rax - adcq $0,%rdx - addq %rbx,%r13 - movq %rdx,%rbx - adcq $0,%rbx - - mulq %r8 - addq %rax,%r14 - movq 56(%rsi),%rax - adcq $0,%rdx - addq %rbx,%r14 - movq %rdx,%rbx - adcq $0,%rbx - - mulq %r8 - addq %rax,%r15 - movq %r8,%rax - adcq $0,%rdx - addq %rbx,%r15 - movq %rdx,%r8 - movq %r10,%rdx - adcq $0,%r8 - - addq %rdx,%rdx - leaq (%rcx,%r10,2),%r10 - movq %r11,%rbx - adcq %r11,%r11 - - mulq %rax - addq %rax,%r9 - adcq %rdx,%r10 - adcq $0,%r11 - - movq %r9,16(%rsp) - movq %r10,24(%rsp) - shrq $63,%rbx - - - movq 16(%rsi),%r9 - movq 24(%rsi),%rax - mulq %r9 - addq %rax,%r12 - movq 32(%rsi),%rax - movq %rdx,%rcx - adcq $0,%rcx - - mulq %r9 - addq %rax,%r13 - movq 40(%rsi),%rax - adcq $0,%rdx - addq %rcx,%r13 - movq %rdx,%rcx - adcq $0,%rcx - - mulq %r9 - addq %rax,%r14 - movq 48(%rsi),%rax - adcq $0,%rdx - addq %rcx,%r14 - movq %rdx,%rcx - adcq $0,%rcx - - mulq %r9 - movq %r12,%r10 - leaq (%rbx,%r12,2),%r12 - addq %rax,%r15 - movq 56(%rsi),%rax - adcq $0,%rdx - addq %rcx,%r15 - movq %rdx,%rcx - adcq $0,%rcx - - mulq %r9 - shrq $63,%r10 - addq %rax,%r8 - movq %r9,%rax - adcq $0,%rdx - addq %rcx,%r8 - movq %rdx,%r9 - adcq $0,%r9 - - movq %r13,%rcx - leaq (%r10,%r13,2),%r13 - - mulq %rax - addq %rax,%r11 - adcq %rdx,%r12 - adcq $0,%r13 - - movq %r11,32(%rsp) - movq %r12,40(%rsp) - shrq $63,%rcx - - - movq 24(%rsi),%r10 - movq 32(%rsi),%rax - mulq %r10 - addq %rax,%r14 - movq 40(%rsi),%rax - movq %rdx,%rbx - adcq $0,%rbx - - mulq %r10 - addq %rax,%r15 - movq 48(%rsi),%rax - adcq $0,%rdx - addq %rbx,%r15 - movq %rdx,%rbx - adcq $0,%rbx - - mulq %r10 - movq %r14,%r12 - leaq (%rcx,%r14,2),%r14 - addq %rax,%r8 - movq 56(%rsi),%rax - adcq $0,%rdx - addq %rbx,%r8 - movq %rdx,%rbx - adcq $0,%rbx - - mulq %r10 - shrq $63,%r12 - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - addq %rbx,%r9 - movq %rdx,%r10 - adcq $0,%r10 - - movq %r15,%rbx - leaq (%r12,%r15,2),%r15 - - mulq %rax - addq %rax,%r13 - adcq %rdx,%r14 - adcq $0,%r15 - - movq %r13,48(%rsp) - movq %r14,56(%rsp) - shrq $63,%rbx - - - movq 32(%rsi),%r11 - movq 40(%rsi),%rax - mulq %r11 - addq %rax,%r8 - movq 48(%rsi),%rax - movq %rdx,%rcx - adcq $0,%rcx - - mulq %r11 - addq %rax,%r9 - movq 56(%rsi),%rax - adcq $0,%rdx - movq %r8,%r12 - leaq (%rbx,%r8,2),%r8 - 
addq %rcx,%r9 - movq %rdx,%rcx - adcq $0,%rcx - - mulq %r11 - shrq $63,%r12 - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - addq %rcx,%r10 - movq %rdx,%r11 - adcq $0,%r11 - - movq %r9,%rcx - leaq (%r12,%r9,2),%r9 - - mulq %rax - addq %rax,%r15 - adcq %rdx,%r8 - adcq $0,%r9 - - movq %r15,64(%rsp) - movq %r8,72(%rsp) - shrq $63,%rcx - - - movq 40(%rsi),%r12 - movq 48(%rsi),%rax - mulq %r12 - addq %rax,%r10 - movq 56(%rsi),%rax - movq %rdx,%rbx - adcq $0,%rbx - - mulq %r12 - addq %rax,%r11 - movq %r12,%rax - movq %r10,%r15 - leaq (%rcx,%r10,2),%r10 - adcq $0,%rdx - shrq $63,%r15 - addq %rbx,%r11 - movq %rdx,%r12 - adcq $0,%r12 - - movq %r11,%rbx - leaq (%r15,%r11,2),%r11 - - mulq %rax - addq %rax,%r9 - adcq %rdx,%r10 - adcq $0,%r11 - - movq %r9,80(%rsp) - movq %r10,88(%rsp) - - - movq 48(%rsi),%r13 - movq 56(%rsi),%rax - mulq %r13 - addq %rax,%r12 - movq %r13,%rax - movq %rdx,%r13 - adcq $0,%r13 - - xorq %r14,%r14 - shlq $1,%rbx - adcq %r12,%r12 - adcq %r13,%r13 - adcq %r14,%r14 - - mulq %rax - addq %rax,%r11 - adcq %rdx,%r12 - adcq $0,%r13 - - movq %r11,96(%rsp) - movq %r12,104(%rsp) - - - movq 56(%rsi),%rax - mulq %rax - addq %rax,%r13 - adcq $0,%rdx - - addq %rdx,%r14 - - movq %r13,112(%rsp) - movq %r14,120(%rsp) - - movq (%rsp),%r8 - movq 8(%rsp),%r9 - movq 16(%rsp),%r10 - movq 24(%rsp),%r11 - movq 32(%rsp),%r12 - movq 40(%rsp),%r13 - movq 48(%rsp),%r14 - movq 56(%rsp),%r15 - - call __rsaz_512_reduce - - addq 64(%rsp),%r8 - adcq 72(%rsp),%r9 - adcq 80(%rsp),%r10 - adcq 88(%rsp),%r11 - adcq 96(%rsp),%r12 - adcq 104(%rsp),%r13 - adcq 112(%rsp),%r14 - adcq 120(%rsp),%r15 - sbbq %rcx,%rcx - - call __rsaz_512_subtract - - movq %r8,%rdx - movq %r9,%rax - movl 128+8(%rsp),%r8d - movq %rdi,%rsi - - decl %r8d - jnz .Loop_sqr - - leaq 128+24+48(%rsp),%rax - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -.Lsqr_epilogue: - .byte 0xf3,0xc3 -.size rsaz_512_sqr,.-rsaz_512_sqr -.globl rsaz_512_mul -.hidden rsaz_512_mul -.type rsaz_512_mul,@function -.align 32 -rsaz_512_mul: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - - subq $128+24,%rsp -.Lmul_body: -.byte 102,72,15,110,199 -.byte 102,72,15,110,201 - movq %r8,128(%rsp) - movq (%rdx),%rbx - movq %rdx,%rbp - call __rsaz_512_mul - -.byte 102,72,15,126,199 -.byte 102,72,15,126,205 - - movq (%rsp),%r8 - movq 8(%rsp),%r9 - movq 16(%rsp),%r10 - movq 24(%rsp),%r11 - movq 32(%rsp),%r12 - movq 40(%rsp),%r13 - movq 48(%rsp),%r14 - movq 56(%rsp),%r15 - - call __rsaz_512_reduce - addq 64(%rsp),%r8 - adcq 72(%rsp),%r9 - adcq 80(%rsp),%r10 - adcq 88(%rsp),%r11 - adcq 96(%rsp),%r12 - adcq 104(%rsp),%r13 - adcq 112(%rsp),%r14 - adcq 120(%rsp),%r15 - sbbq %rcx,%rcx - - call __rsaz_512_subtract - - leaq 128+24+48(%rsp),%rax - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -.Lmul_epilogue: - .byte 0xf3,0xc3 -.size rsaz_512_mul,.-rsaz_512_mul -.globl rsaz_512_mul_gather4 -.hidden rsaz_512_mul_gather4 -.type rsaz_512_mul_gather4,@function -.align 32 -rsaz_512_mul_gather4: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - - subq $152,%rsp -.Lmul_gather4_body: - movd %r9d,%xmm8 - movdqa .Linc+16(%rip),%xmm1 - movdqa .Linc(%rip),%xmm0 - - pshufd $0,%xmm8,%xmm8 - movdqa %xmm1,%xmm7 - movdqa %xmm1,%xmm2 - paddd %xmm0,%xmm1 - pcmpeqd %xmm8,%xmm0 - movdqa %xmm7,%xmm3 - paddd %xmm1,%xmm2 - pcmpeqd %xmm8,%xmm1 - 
movdqa %xmm7,%xmm4 - paddd %xmm2,%xmm3 - pcmpeqd %xmm8,%xmm2 - movdqa %xmm7,%xmm5 - paddd %xmm3,%xmm4 - pcmpeqd %xmm8,%xmm3 - movdqa %xmm7,%xmm6 - paddd %xmm4,%xmm5 - pcmpeqd %xmm8,%xmm4 - paddd %xmm5,%xmm6 - pcmpeqd %xmm8,%xmm5 - paddd %xmm6,%xmm7 - pcmpeqd %xmm8,%xmm6 - pcmpeqd %xmm8,%xmm7 - - movdqa 0(%rdx),%xmm8 - movdqa 16(%rdx),%xmm9 - movdqa 32(%rdx),%xmm10 - movdqa 48(%rdx),%xmm11 - pand %xmm0,%xmm8 - movdqa 64(%rdx),%xmm12 - pand %xmm1,%xmm9 - movdqa 80(%rdx),%xmm13 - pand %xmm2,%xmm10 - movdqa 96(%rdx),%xmm14 - pand %xmm3,%xmm11 - movdqa 112(%rdx),%xmm15 - leaq 128(%rdx),%rbp - pand %xmm4,%xmm12 - pand %xmm5,%xmm13 - pand %xmm6,%xmm14 - pand %xmm7,%xmm15 - por %xmm10,%xmm8 - por %xmm11,%xmm9 - por %xmm12,%xmm8 - por %xmm13,%xmm9 - por %xmm14,%xmm8 - por %xmm15,%xmm9 - - por %xmm9,%xmm8 - pshufd $0x4e,%xmm8,%xmm9 - por %xmm9,%xmm8 -.byte 102,76,15,126,195 - - movq %r8,128(%rsp) - movq %rdi,128+8(%rsp) - movq %rcx,128+16(%rsp) - - movq (%rsi),%rax - movq 8(%rsi),%rcx - mulq %rbx - movq %rax,(%rsp) - movq %rcx,%rax - movq %rdx,%r8 - - mulq %rbx - addq %rax,%r8 - movq 16(%rsi),%rax - movq %rdx,%r9 - adcq $0,%r9 - - mulq %rbx - addq %rax,%r9 - movq 24(%rsi),%rax - movq %rdx,%r10 - adcq $0,%r10 - - mulq %rbx - addq %rax,%r10 - movq 32(%rsi),%rax - movq %rdx,%r11 - adcq $0,%r11 - - mulq %rbx - addq %rax,%r11 - movq 40(%rsi),%rax - movq %rdx,%r12 - adcq $0,%r12 - - mulq %rbx - addq %rax,%r12 - movq 48(%rsi),%rax - movq %rdx,%r13 - adcq $0,%r13 - - mulq %rbx - addq %rax,%r13 - movq 56(%rsi),%rax - movq %rdx,%r14 - adcq $0,%r14 - - mulq %rbx - addq %rax,%r14 - movq (%rsi),%rax - movq %rdx,%r15 - adcq $0,%r15 - - leaq 8(%rsp),%rdi - movl $7,%ecx - jmp .Loop_mul_gather - -.align 32 -.Loop_mul_gather: - movdqa 0(%rbp),%xmm8 - movdqa 16(%rbp),%xmm9 - movdqa 32(%rbp),%xmm10 - movdqa 48(%rbp),%xmm11 - pand %xmm0,%xmm8 - movdqa 64(%rbp),%xmm12 - pand %xmm1,%xmm9 - movdqa 80(%rbp),%xmm13 - pand %xmm2,%xmm10 - movdqa 96(%rbp),%xmm14 - pand %xmm3,%xmm11 - movdqa 112(%rbp),%xmm15 - leaq 128(%rbp),%rbp - pand %xmm4,%xmm12 - pand %xmm5,%xmm13 - pand %xmm6,%xmm14 - pand %xmm7,%xmm15 - por %xmm10,%xmm8 - por %xmm11,%xmm9 - por %xmm12,%xmm8 - por %xmm13,%xmm9 - por %xmm14,%xmm8 - por %xmm15,%xmm9 - - por %xmm9,%xmm8 - pshufd $0x4e,%xmm8,%xmm9 - por %xmm9,%xmm8 -.byte 102,76,15,126,195 - - mulq %rbx - addq %rax,%r8 - movq 8(%rsi),%rax - movq %r8,(%rdi) - movq %rdx,%r8 - adcq $0,%r8 - - mulq %rbx - addq %rax,%r9 - movq 16(%rsi),%rax - adcq $0,%rdx - addq %r9,%r8 - movq %rdx,%r9 - adcq $0,%r9 - - mulq %rbx - addq %rax,%r10 - movq 24(%rsi),%rax - adcq $0,%rdx - addq %r10,%r9 - movq %rdx,%r10 - adcq $0,%r10 - - mulq %rbx - addq %rax,%r11 - movq 32(%rsi),%rax - adcq $0,%rdx - addq %r11,%r10 - movq %rdx,%r11 - adcq $0,%r11 - - mulq %rbx - addq %rax,%r12 - movq 40(%rsi),%rax - adcq $0,%rdx - addq %r12,%r11 - movq %rdx,%r12 - adcq $0,%r12 - - mulq %rbx - addq %rax,%r13 - movq 48(%rsi),%rax - adcq $0,%rdx - addq %r13,%r12 - movq %rdx,%r13 - adcq $0,%r13 - - mulq %rbx - addq %rax,%r14 - movq 56(%rsi),%rax - adcq $0,%rdx - addq %r14,%r13 - movq %rdx,%r14 - adcq $0,%r14 - - mulq %rbx - addq %rax,%r15 - movq (%rsi),%rax - adcq $0,%rdx - addq %r15,%r14 - movq %rdx,%r15 - adcq $0,%r15 - - leaq 8(%rdi),%rdi - - decl %ecx - jnz .Loop_mul_gather - - movq %r8,(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - movq %r14,48(%rdi) - movq %r15,56(%rdi) - - movq 128+8(%rsp),%rdi - movq 128+16(%rsp),%rbp - - movq (%rsp),%r8 - movq 8(%rsp),%r9 - movq 16(%rsp),%r10 - movq 
24(%rsp),%r11 - movq 32(%rsp),%r12 - movq 40(%rsp),%r13 - movq 48(%rsp),%r14 - movq 56(%rsp),%r15 - - call __rsaz_512_reduce - addq 64(%rsp),%r8 - adcq 72(%rsp),%r9 - adcq 80(%rsp),%r10 - adcq 88(%rsp),%r11 - adcq 96(%rsp),%r12 - adcq 104(%rsp),%r13 - adcq 112(%rsp),%r14 - adcq 120(%rsp),%r15 - sbbq %rcx,%rcx - - call __rsaz_512_subtract - - leaq 128+24+48(%rsp),%rax - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -.Lmul_gather4_epilogue: - .byte 0xf3,0xc3 -.size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4 -.globl rsaz_512_mul_scatter4 -.hidden rsaz_512_mul_scatter4 -.type rsaz_512_mul_scatter4,@function -.align 32 -rsaz_512_mul_scatter4: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - - movl %r9d,%r9d - subq $128+24,%rsp -.Lmul_scatter4_body: - leaq (%r8,%r9,8),%r8 -.byte 102,72,15,110,199 -.byte 102,72,15,110,202 -.byte 102,73,15,110,208 - movq %rcx,128(%rsp) - - movq %rdi,%rbp - movq (%rdi),%rbx - call __rsaz_512_mul - -.byte 102,72,15,126,199 -.byte 102,72,15,126,205 - - movq (%rsp),%r8 - movq 8(%rsp),%r9 - movq 16(%rsp),%r10 - movq 24(%rsp),%r11 - movq 32(%rsp),%r12 - movq 40(%rsp),%r13 - movq 48(%rsp),%r14 - movq 56(%rsp),%r15 - - call __rsaz_512_reduce - addq 64(%rsp),%r8 - adcq 72(%rsp),%r9 - adcq 80(%rsp),%r10 - adcq 88(%rsp),%r11 - adcq 96(%rsp),%r12 - adcq 104(%rsp),%r13 - adcq 112(%rsp),%r14 - adcq 120(%rsp),%r15 -.byte 102,72,15,126,214 - sbbq %rcx,%rcx - - call __rsaz_512_subtract - - movq %r8,0(%rsi) - movq %r9,128(%rsi) - movq %r10,256(%rsi) - movq %r11,384(%rsi) - movq %r12,512(%rsi) - movq %r13,640(%rsi) - movq %r14,768(%rsi) - movq %r15,896(%rsi) - - leaq 128+24+48(%rsp),%rax - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -.Lmul_scatter4_epilogue: - .byte 0xf3,0xc3 -.size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4 -.globl rsaz_512_mul_by_one -.hidden rsaz_512_mul_by_one -.type rsaz_512_mul_by_one,@function -.align 32 -rsaz_512_mul_by_one: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - - subq $128+24,%rsp -.Lmul_by_one_body: - movq %rdx,%rbp - movq %rcx,128(%rsp) - - movq (%rsi),%r8 - pxor %xmm0,%xmm0 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - movq 48(%rsi),%r14 - movq 56(%rsi),%r15 - - movdqa %xmm0,(%rsp) - movdqa %xmm0,16(%rsp) - movdqa %xmm0,32(%rsp) - movdqa %xmm0,48(%rsp) - movdqa %xmm0,64(%rsp) - movdqa %xmm0,80(%rsp) - movdqa %xmm0,96(%rsp) - call __rsaz_512_reduce - movq %r8,(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - movq %r14,48(%rdi) - movq %r15,56(%rdi) - - leaq 128+24+48(%rsp),%rax - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -.Lmul_by_one_epilogue: - .byte 0xf3,0xc3 -.size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one -.type __rsaz_512_reduce,@function -.align 32 -__rsaz_512_reduce: - movq %r8,%rbx - imulq 128+8(%rsp),%rbx - movq 0(%rbp),%rax - movl $8,%ecx - jmp .Lreduction_loop - -.align 32 -.Lreduction_loop: - mulq %rbx - movq 8(%rbp),%rax - negq %r8 - movq %rdx,%r8 - adcq $0,%r8 - - mulq %rbx - addq %rax,%r9 - movq 16(%rbp),%rax - adcq $0,%rdx - addq %r9,%r8 - movq %rdx,%r9 - adcq $0,%r9 - - mulq %rbx - addq %rax,%r10 - movq 24(%rbp),%rax - adcq $0,%rdx - addq 
%r10,%r9 - movq %rdx,%r10 - adcq $0,%r10 - - mulq %rbx - addq %rax,%r11 - movq 32(%rbp),%rax - adcq $0,%rdx - addq %r11,%r10 - movq 128+8(%rsp),%rsi - - - adcq $0,%rdx - movq %rdx,%r11 - - mulq %rbx - addq %rax,%r12 - movq 40(%rbp),%rax - adcq $0,%rdx - imulq %r8,%rsi - addq %r12,%r11 - movq %rdx,%r12 - adcq $0,%r12 - - mulq %rbx - addq %rax,%r13 - movq 48(%rbp),%rax - adcq $0,%rdx - addq %r13,%r12 - movq %rdx,%r13 - adcq $0,%r13 - - mulq %rbx - addq %rax,%r14 - movq 56(%rbp),%rax - adcq $0,%rdx - addq %r14,%r13 - movq %rdx,%r14 - adcq $0,%r14 - - mulq %rbx - movq %rsi,%rbx - addq %rax,%r15 - movq 0(%rbp),%rax - adcq $0,%rdx - addq %r15,%r14 - movq %rdx,%r15 - adcq $0,%r15 - - decl %ecx - jne .Lreduction_loop - - .byte 0xf3,0xc3 -.size __rsaz_512_reduce,.-__rsaz_512_reduce -.type __rsaz_512_subtract,@function -.align 32 -__rsaz_512_subtract: - movq %r8,(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - movq %r14,48(%rdi) - movq %r15,56(%rdi) - - movq 0(%rbp),%r8 - movq 8(%rbp),%r9 - negq %r8 - notq %r9 - andq %rcx,%r8 - movq 16(%rbp),%r10 - andq %rcx,%r9 - notq %r10 - movq 24(%rbp),%r11 - andq %rcx,%r10 - notq %r11 - movq 32(%rbp),%r12 - andq %rcx,%r11 - notq %r12 - movq 40(%rbp),%r13 - andq %rcx,%r12 - notq %r13 - movq 48(%rbp),%r14 - andq %rcx,%r13 - notq %r14 - movq 56(%rbp),%r15 - andq %rcx,%r14 - notq %r15 - andq %rcx,%r15 - - addq (%rdi),%r8 - adcq 8(%rdi),%r9 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - adcq 48(%rdi),%r14 - adcq 56(%rdi),%r15 - - movq %r8,(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - movq %r14,48(%rdi) - movq %r15,56(%rdi) - - .byte 0xf3,0xc3 -.size __rsaz_512_subtract,.-__rsaz_512_subtract -.type __rsaz_512_mul,@function -.align 32 -__rsaz_512_mul: - leaq 8(%rsp),%rdi - - movq (%rsi),%rax - mulq %rbx - movq %rax,(%rdi) - movq 8(%rsi),%rax - movq %rdx,%r8 - - mulq %rbx - addq %rax,%r8 - movq 16(%rsi),%rax - movq %rdx,%r9 - adcq $0,%r9 - - mulq %rbx - addq %rax,%r9 - movq 24(%rsi),%rax - movq %rdx,%r10 - adcq $0,%r10 - - mulq %rbx - addq %rax,%r10 - movq 32(%rsi),%rax - movq %rdx,%r11 - adcq $0,%r11 - - mulq %rbx - addq %rax,%r11 - movq 40(%rsi),%rax - movq %rdx,%r12 - adcq $0,%r12 - - mulq %rbx - addq %rax,%r12 - movq 48(%rsi),%rax - movq %rdx,%r13 - adcq $0,%r13 - - mulq %rbx - addq %rax,%r13 - movq 56(%rsi),%rax - movq %rdx,%r14 - adcq $0,%r14 - - mulq %rbx - addq %rax,%r14 - movq (%rsi),%rax - movq %rdx,%r15 - adcq $0,%r15 - - leaq 8(%rbp),%rbp - leaq 8(%rdi),%rdi - - movl $7,%ecx - jmp .Loop_mul - -.align 32 -.Loop_mul: - movq (%rbp),%rbx - mulq %rbx - addq %rax,%r8 - movq 8(%rsi),%rax - movq %r8,(%rdi) - movq %rdx,%r8 - adcq $0,%r8 - - mulq %rbx - addq %rax,%r9 - movq 16(%rsi),%rax - adcq $0,%rdx - addq %r9,%r8 - movq %rdx,%r9 - adcq $0,%r9 - - mulq %rbx - addq %rax,%r10 - movq 24(%rsi),%rax - adcq $0,%rdx - addq %r10,%r9 - movq %rdx,%r10 - adcq $0,%r10 - - mulq %rbx - addq %rax,%r11 - movq 32(%rsi),%rax - adcq $0,%rdx - addq %r11,%r10 - movq %rdx,%r11 - adcq $0,%r11 - - mulq %rbx - addq %rax,%r12 - movq 40(%rsi),%rax - adcq $0,%rdx - addq %r12,%r11 - movq %rdx,%r12 - adcq $0,%r12 - - mulq %rbx - addq %rax,%r13 - movq 48(%rsi),%rax - adcq $0,%rdx - addq %r13,%r12 - movq %rdx,%r13 - adcq $0,%r13 - - mulq %rbx - addq %rax,%r14 - movq 56(%rsi),%rax - adcq $0,%rdx - addq %r14,%r13 - movq %rdx,%r14 - leaq 8(%rbp),%rbp - adcq $0,%r14 - - mulq %rbx - addq %rax,%r15 - movq (%rsi),%rax - adcq $0,%rdx - 
addq %r15,%r14 - movq %rdx,%r15 - adcq $0,%r15 - - leaq 8(%rdi),%rdi - - decl %ecx - jnz .Loop_mul - - movq %r8,(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - movq %r14,48(%rdi) - movq %r15,56(%rdi) - - .byte 0xf3,0xc3 -.size __rsaz_512_mul,.-__rsaz_512_mul -.globl rsaz_512_scatter4 -.hidden rsaz_512_scatter4 -.type rsaz_512_scatter4,@function -.align 16 -rsaz_512_scatter4: - leaq (%rdi,%rdx,8),%rdi - movl $8,%r9d - jmp .Loop_scatter -.align 16 -.Loop_scatter: - movq (%rsi),%rax - leaq 8(%rsi),%rsi - movq %rax,(%rdi) - leaq 128(%rdi),%rdi - decl %r9d - jnz .Loop_scatter - .byte 0xf3,0xc3 -.size rsaz_512_scatter4,.-rsaz_512_scatter4 - -.globl rsaz_512_gather4 -.hidden rsaz_512_gather4 -.type rsaz_512_gather4,@function -.align 16 -rsaz_512_gather4: - movd %edx,%xmm8 - movdqa .Linc+16(%rip),%xmm1 - movdqa .Linc(%rip),%xmm0 - - pshufd $0,%xmm8,%xmm8 - movdqa %xmm1,%xmm7 - movdqa %xmm1,%xmm2 - paddd %xmm0,%xmm1 - pcmpeqd %xmm8,%xmm0 - movdqa %xmm7,%xmm3 - paddd %xmm1,%xmm2 - pcmpeqd %xmm8,%xmm1 - movdqa %xmm7,%xmm4 - paddd %xmm2,%xmm3 - pcmpeqd %xmm8,%xmm2 - movdqa %xmm7,%xmm5 - paddd %xmm3,%xmm4 - pcmpeqd %xmm8,%xmm3 - movdqa %xmm7,%xmm6 - paddd %xmm4,%xmm5 - pcmpeqd %xmm8,%xmm4 - paddd %xmm5,%xmm6 - pcmpeqd %xmm8,%xmm5 - paddd %xmm6,%xmm7 - pcmpeqd %xmm8,%xmm6 - pcmpeqd %xmm8,%xmm7 - movl $8,%r9d - jmp .Loop_gather -.align 16 -.Loop_gather: - movdqa 0(%rsi),%xmm8 - movdqa 16(%rsi),%xmm9 - movdqa 32(%rsi),%xmm10 - movdqa 48(%rsi),%xmm11 - pand %xmm0,%xmm8 - movdqa 64(%rsi),%xmm12 - pand %xmm1,%xmm9 - movdqa 80(%rsi),%xmm13 - pand %xmm2,%xmm10 - movdqa 96(%rsi),%xmm14 - pand %xmm3,%xmm11 - movdqa 112(%rsi),%xmm15 - leaq 128(%rsi),%rsi - pand %xmm4,%xmm12 - pand %xmm5,%xmm13 - pand %xmm6,%xmm14 - pand %xmm7,%xmm15 - por %xmm10,%xmm8 - por %xmm11,%xmm9 - por %xmm12,%xmm8 - por %xmm13,%xmm9 - por %xmm14,%xmm8 - por %xmm15,%xmm9 - - por %xmm9,%xmm8 - pshufd $0x4e,%xmm8,%xmm9 - por %xmm9,%xmm8 - movq %xmm8,(%rdi) - leaq 8(%rdi),%rdi - decl %r9d - jnz .Loop_gather - .byte 0xf3,0xc3 -.LSEH_end_rsaz_512_gather4: -.size rsaz_512_gather4,.-rsaz_512_gather4 - -.align 64 -.Linc: -.long 0,0, 1,1 -.long 2,2, 2,2 -#endif diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/chacha/chacha-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/chacha/chacha-x86_64.S index e994940a3f..62dc77999a 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/chacha/chacha-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/chacha/chacha-x86_64.S @@ -1,4 +1,4 @@ -#if defined(__x86_64__) +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) .text .extern OPENSSL_ia32cap_P @@ -23,6 +23,15 @@ .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe .Lsigma: .byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0 +.align 64 +.Lzeroz: +.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 +.Lfourz: +.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 +.Lincz: +.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 +.Lsixteen: +.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .globl ChaCha20_ctr32 .hidden ChaCha20_ctr32 @@ -42,6 +51,7 @@ ChaCha20_ctr32: pushq %r14 pushq %r15 subq $64+24,%rsp +.Lctr32_body: movdqu (%rcx),%xmm1 @@ -279,13 +289,14 @@ ChaCha20_ctr32: jnz .Loop_tail .Ldone: - addq $64+24,%rsp - popq %r15 - popq %r14 - popq %r13 - 
popq %r12 - popq %rbp - popq %rbx + leaq 64+24+48(%rsp),%rsi + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp .Lno_data: .byte 0xf3,0xc3 .size ChaCha20_ctr32,.-ChaCha20_ctr32 @@ -293,18 +304,12 @@ ChaCha20_ctr32: .align 32 ChaCha20_ssse3: .LChaCha20_ssse3: + movq %rsp,%r9 cmpq $128,%rdx ja .LChaCha20_4x .Ldo_sse3_after_all: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - - subq $64+24,%rsp + subq $64+8,%rsp movdqa .Lsigma(%rip),%xmm0 movdqu (%rcx),%xmm1 movdqu 16(%rcx),%xmm2 @@ -316,7 +321,7 @@ ChaCha20_ssse3: movdqa %xmm1,16(%rsp) movdqa %xmm2,32(%rsp) movdqa %xmm3,48(%rsp) - movl $10,%ebp + movq $10,%r8 jmp .Loop_ssse3 .align 32 @@ -326,7 +331,7 @@ ChaCha20_ssse3: movdqa 16(%rsp),%xmm1 movdqa 32(%rsp),%xmm2 paddd 48(%rsp),%xmm3 - movl $10,%ebp + movq $10,%r8 movdqa %xmm3,48(%rsp) jmp .Loop_ssse3 @@ -375,7 +380,7 @@ ChaCha20_ssse3: pshufd $78,%xmm2,%xmm2 pshufd $147,%xmm1,%xmm1 pshufd $57,%xmm3,%xmm3 - decl %ebp + decq %r8 jnz .Loop_ssse3 paddd 0(%rsp),%xmm0 paddd 16(%rsp),%xmm1 @@ -412,31 +417,27 @@ ChaCha20_ssse3: movdqa %xmm1,16(%rsp) movdqa %xmm2,32(%rsp) movdqa %xmm3,48(%rsp) - xorq %rbx,%rbx + xorq %r8,%r8 .Loop_tail_ssse3: - movzbl (%rsi,%rbx,1),%eax - movzbl (%rsp,%rbx,1),%ecx - leaq 1(%rbx),%rbx + movzbl (%rsi,%r8,1),%eax + movzbl (%rsp,%r8,1),%ecx + leaq 1(%r8),%r8 xorl %ecx,%eax - movb %al,-1(%rdi,%rbx,1) + movb %al,-1(%rdi,%r8,1) decq %rdx jnz .Loop_tail_ssse3 .Ldone_ssse3: - addq $64+24,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx + leaq (%r9),%rsp +.Lssse3_epilogue: .byte 0xf3,0xc3 .size ChaCha20_ssse3,.-ChaCha20_ssse3 .type ChaCha20_4x,@function .align 32 ChaCha20_4x: .LChaCha20_4x: + movq %rsp,%r9 movq %r10,%r11 shrq $32,%r10 testq $32,%r10 @@ -449,8 +450,7 @@ ChaCha20_4x: je .Ldo_sse3_after_all .Lproceed4x: - leaq -120(%rsp),%r11 - subq $0x148+0,%rsp + subq $0x140+8,%rsp movdqa .Lsigma(%rip),%xmm11 movdqu (%rcx),%xmm15 movdqu 16(%rcx),%xmm7 @@ -977,18 +977,18 @@ ChaCha20_4x: jnz .Loop_tail4x .Ldone4x: - addq $0x148+0,%rsp + leaq (%r9),%rsp +.L4x_epilogue: .byte 0xf3,0xc3 .size ChaCha20_4x,.-ChaCha20_4x .type ChaCha20_8x,@function .align 32 ChaCha20_8x: .LChaCha20_8x: - movq %rsp,%r10 + movq %rsp,%r9 subq $0x280+8,%rsp andq $-32,%rsp vzeroupper - movq %r10,640(%rsp) @@ -1579,7 +1579,8 @@ ChaCha20_8x: .Ldone8x: vzeroall - movq 640(%rsp),%rsp + leaq (%r9),%rsp +.L8x_epilogue: .byte 0xf3,0xc3 .size ChaCha20_8x,.-ChaCha20_8x #endif diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S new file mode 100644 index 0000000000..42e25f4817 --- /dev/null +++ b/packager/third_party/boringssl/linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S @@ -0,0 +1,3066 @@ +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +.data + +.align 16 +one: +.quad 1,0 +two: +.quad 2,0 +three: +.quad 3,0 +four: +.quad 4,0 +five: +.quad 5,0 +six: +.quad 6,0 +seven: +.quad 7,0 +eight: +.quad 8,0 + +OR_MASK: +.long 0x00000000,0x00000000,0x00000000,0x80000000 +poly: +.quad 0x1, 0xc200000000000000 +mask: +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d +con1: +.long 1,1,1,1 +con2: +.long 0x1b,0x1b,0x1b,0x1b +con3: +.byte -1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7 +and_mask: +.long 0,0xffffffff, 0xffffffff, 0xffffffff +.text +.type GFMUL,@function +.align 16 +GFMUL: +.cfi_startproc + vpclmulqdq 
$0x00,%xmm1,%xmm0,%xmm2 + vpclmulqdq $0x11,%xmm1,%xmm0,%xmm5 + vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 + vpclmulqdq $0x01,%xmm1,%xmm0,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $8,%xmm3,%xmm4 + vpsrldq $8,%xmm3,%xmm3 + vpxor %xmm4,%xmm2,%xmm2 + vpxor %xmm3,%xmm5,%xmm5 + + vpclmulqdq $0x10,poly(%rip),%xmm2,%xmm3 + vpshufd $78,%xmm2,%xmm4 + vpxor %xmm4,%xmm3,%xmm2 + + vpclmulqdq $0x10,poly(%rip),%xmm2,%xmm3 + vpshufd $78,%xmm2,%xmm4 + vpxor %xmm4,%xmm3,%xmm2 + + vpxor %xmm5,%xmm2,%xmm0 + .byte 0xf3,0xc3 +.cfi_endproc +.size GFMUL, .-GFMUL +.globl aesgcmsiv_htable_init +.hidden aesgcmsiv_htable_init +.type aesgcmsiv_htable_init,@function +.align 16 +aesgcmsiv_htable_init: +.cfi_startproc + vmovdqa (%rsi),%xmm0 + vmovdqa %xmm0,%xmm1 + vmovdqa %xmm0,(%rdi) + call GFMUL + vmovdqa %xmm0,16(%rdi) + call GFMUL + vmovdqa %xmm0,32(%rdi) + call GFMUL + vmovdqa %xmm0,48(%rdi) + call GFMUL + vmovdqa %xmm0,64(%rdi) + call GFMUL + vmovdqa %xmm0,80(%rdi) + call GFMUL + vmovdqa %xmm0,96(%rdi) + call GFMUL + vmovdqa %xmm0,112(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size aesgcmsiv_htable_init, .-aesgcmsiv_htable_init +.globl aesgcmsiv_htable6_init +.hidden aesgcmsiv_htable6_init +.type aesgcmsiv_htable6_init,@function +.align 16 +aesgcmsiv_htable6_init: +.cfi_startproc + vmovdqa (%rsi),%xmm0 + vmovdqa %xmm0,%xmm1 + vmovdqa %xmm0,(%rdi) + call GFMUL + vmovdqa %xmm0,16(%rdi) + call GFMUL + vmovdqa %xmm0,32(%rdi) + call GFMUL + vmovdqa %xmm0,48(%rdi) + call GFMUL + vmovdqa %xmm0,64(%rdi) + call GFMUL + vmovdqa %xmm0,80(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size aesgcmsiv_htable6_init, .-aesgcmsiv_htable6_init +.globl aesgcmsiv_htable_polyval +.hidden aesgcmsiv_htable_polyval +.type aesgcmsiv_htable_polyval,@function +.align 16 +aesgcmsiv_htable_polyval: +.cfi_startproc + testq %rdx,%rdx + jnz .Lhtable_polyval_start + .byte 0xf3,0xc3 + +.Lhtable_polyval_start: + vzeroall + + + + movq %rdx,%r11 + andq $127,%r11 + + jz .Lhtable_polyval_no_prefix + + vpxor %xmm9,%xmm9,%xmm9 + vmovdqa (%rcx),%xmm1 + subq %r11,%rdx + + subq $16,%r11 + + + vmovdqu (%rsi),%xmm0 + vpxor %xmm1,%xmm0,%xmm0 + + vpclmulqdq $0x01,(%rdi,%r11,1),%xmm0,%xmm5 + vpclmulqdq $0x00,(%rdi,%r11,1),%xmm0,%xmm3 + vpclmulqdq $0x11,(%rdi,%r11,1),%xmm0,%xmm4 + vpclmulqdq $0x10,(%rdi,%r11,1),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + leaq 16(%rsi),%rsi + testq %r11,%r11 + jnz .Lhtable_polyval_prefix_loop + jmp .Lhtable_polyval_prefix_complete + + +.align 64 +.Lhtable_polyval_prefix_loop: + subq $16,%r11 + + vmovdqu (%rsi),%xmm0 + + vpclmulqdq $0x00,(%rdi,%r11,1),%xmm0,%xmm6 + vpxor %xmm6,%xmm3,%xmm3 + vpclmulqdq $0x11,(%rdi,%r11,1),%xmm0,%xmm6 + vpxor %xmm6,%xmm4,%xmm4 + vpclmulqdq $0x01,(%rdi,%r11,1),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x10,(%rdi,%r11,1),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + testq %r11,%r11 + + leaq 16(%rsi),%rsi + + jnz .Lhtable_polyval_prefix_loop + +.Lhtable_polyval_prefix_complete: + vpsrldq $8,%xmm5,%xmm6 + vpslldq $8,%xmm5,%xmm5 + + vpxor %xmm6,%xmm4,%xmm9 + vpxor %xmm5,%xmm3,%xmm1 + + jmp .Lhtable_polyval_main_loop + +.Lhtable_polyval_no_prefix: + + + + + vpxor %xmm1,%xmm1,%xmm1 + vmovdqa (%rcx),%xmm9 + +.align 64 +.Lhtable_polyval_main_loop: + subq $0x80,%rdx + jb .Lhtable_polyval_out + + vmovdqu 112(%rsi),%xmm0 + + vpclmulqdq $0x01,(%rdi),%xmm0,%xmm5 + vpclmulqdq $0x00,(%rdi),%xmm0,%xmm3 + vpclmulqdq $0x11,(%rdi),%xmm0,%xmm4 + vpclmulqdq $0x10,(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + + vmovdqu 96(%rsi),%xmm0 + vpclmulqdq $0x01,16(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq 
$0x00,16(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm3,%xmm3 + vpclmulqdq $0x11,16(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm4,%xmm4 + vpclmulqdq $0x10,16(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + + + vmovdqu 80(%rsi),%xmm0 + + vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm7 + vpalignr $8,%xmm1,%xmm1,%xmm1 + + vpclmulqdq $0x01,32(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x00,32(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm3,%xmm3 + vpclmulqdq $0x11,32(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm4,%xmm4 + vpclmulqdq $0x10,32(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + + vpxor %xmm7,%xmm1,%xmm1 + + vmovdqu 64(%rsi),%xmm0 + + vpclmulqdq $0x01,48(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x00,48(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm3,%xmm3 + vpclmulqdq $0x11,48(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm4,%xmm4 + vpclmulqdq $0x10,48(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + + vmovdqu 48(%rsi),%xmm0 + + vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm7 + vpalignr $8,%xmm1,%xmm1,%xmm1 + + vpclmulqdq $0x01,64(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x00,64(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm3,%xmm3 + vpclmulqdq $0x11,64(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm4,%xmm4 + vpclmulqdq $0x10,64(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + + vpxor %xmm7,%xmm1,%xmm1 + + vmovdqu 32(%rsi),%xmm0 + + vpclmulqdq $0x01,80(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x00,80(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm3,%xmm3 + vpclmulqdq $0x11,80(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm4,%xmm4 + vpclmulqdq $0x10,80(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + + vpxor %xmm9,%xmm1,%xmm1 + + vmovdqu 16(%rsi),%xmm0 + + vpclmulqdq $0x01,96(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x00,96(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm3,%xmm3 + vpclmulqdq $0x11,96(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm4,%xmm4 + vpclmulqdq $0x10,96(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + + vmovdqu 0(%rsi),%xmm0 + vpxor %xmm1,%xmm0,%xmm0 + + vpclmulqdq $0x01,112(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x00,112(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm3,%xmm3 + vpclmulqdq $0x11,112(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm4,%xmm4 + vpclmulqdq $0x10,112(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + + vpsrldq $8,%xmm5,%xmm6 + vpslldq $8,%xmm5,%xmm5 + + vpxor %xmm6,%xmm4,%xmm9 + vpxor %xmm5,%xmm3,%xmm1 + + leaq 128(%rsi),%rsi + jmp .Lhtable_polyval_main_loop + + + +.Lhtable_polyval_out: + vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm6 + vpalignr $8,%xmm1,%xmm1,%xmm1 + vpxor %xmm6,%xmm1,%xmm1 + + vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm6 + vpalignr $8,%xmm1,%xmm1,%xmm1 + vpxor %xmm6,%xmm1,%xmm1 + vpxor %xmm9,%xmm1,%xmm1 + + vmovdqu %xmm1,(%rcx) + vzeroupper + .byte 0xf3,0xc3 +.cfi_endproc +.size aesgcmsiv_htable_polyval,.-aesgcmsiv_htable_polyval +.globl aesgcmsiv_polyval_horner +.hidden aesgcmsiv_polyval_horner +.type aesgcmsiv_polyval_horner,@function +.align 16 +aesgcmsiv_polyval_horner: +.cfi_startproc + testq %rcx,%rcx + jnz .Lpolyval_horner_start + .byte 0xf3,0xc3 + +.Lpolyval_horner_start: + + + + xorq %r10,%r10 + shlq $4,%rcx + + vmovdqa (%rsi),%xmm1 + vmovdqa (%rdi),%xmm0 + +.Lpolyval_horner_loop: + vpxor (%rdx,%r10,1),%xmm0,%xmm0 + call GFMUL + + addq $16,%r10 + cmpq %r10,%rcx + jne .Lpolyval_horner_loop + + + vmovdqa %xmm0,(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size aesgcmsiv_polyval_horner,.-aesgcmsiv_polyval_horner +.globl aes128gcmsiv_aes_ks +.hidden aes128gcmsiv_aes_ks +.type aes128gcmsiv_aes_ks,@function +.align 16 +aes128gcmsiv_aes_ks: +.cfi_startproc + vmovdqu (%rdi),%xmm1 + vmovdqa 
%xmm1,(%rsi) + + vmovdqa con1(%rip),%xmm0 + vmovdqa mask(%rip),%xmm15 + + movq $8,%rax + +.Lks128_loop: + addq $16,%rsi + subq $1,%rax + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpslldq $4,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpslldq $4,%xmm3,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpslldq $4,%xmm3,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm1,(%rsi) + jne .Lks128_loop + + vmovdqa con2(%rip),%xmm0 + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpslldq $4,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpslldq $4,%xmm3,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpslldq $4,%xmm3,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm1,16(%rsi) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslldq $4,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpslldq $4,%xmm3,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpslldq $4,%xmm3,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm1,32(%rsi) + .byte 0xf3,0xc3 +.cfi_endproc +.size aes128gcmsiv_aes_ks,.-aes128gcmsiv_aes_ks +.globl aes256gcmsiv_aes_ks +.hidden aes256gcmsiv_aes_ks +.type aes256gcmsiv_aes_ks,@function +.align 16 +aes256gcmsiv_aes_ks: +.cfi_startproc + vmovdqu (%rdi),%xmm1 + vmovdqu 16(%rdi),%xmm3 + vmovdqa %xmm1,(%rsi) + vmovdqa %xmm3,16(%rsi) + vmovdqa con1(%rip),%xmm0 + vmovdqa mask(%rip),%xmm15 + vpxor %xmm14,%xmm14,%xmm14 + movq $6,%rax + +.Lks256_loop: + addq $32,%rsi + subq $1,%rax + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm1,(%rsi) + vpshufd $0xff,%xmm1,%xmm2 + vaesenclast %xmm14,%xmm2,%xmm2 + vpsllq $32,%xmm3,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpshufb con3(%rip),%xmm3,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpxor %xmm2,%xmm3,%xmm3 + vmovdqa %xmm3,16(%rsi) + jne .Lks256_loop + + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpsllq $32,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm1,32(%rsi) + .byte 0xf3,0xc3 +.cfi_endproc +.globl aes128gcmsiv_aes_ks_enc_x1 +.hidden aes128gcmsiv_aes_ks_enc_x1 +.type aes128gcmsiv_aes_ks_enc_x1,@function +.align 16 +aes128gcmsiv_aes_ks_enc_x1: +.cfi_startproc + vmovdqa (%rcx),%xmm1 + vmovdqa 0(%rdi),%xmm4 + + vmovdqa %xmm1,(%rdx) + vpxor %xmm1,%xmm4,%xmm4 + + vmovdqa con1(%rip),%xmm0 + vmovdqa mask(%rip),%xmm15 + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,16(%rdx) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,32(%rdx) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,48(%rdx) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb 
con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,64(%rdx) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,80(%rdx) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,96(%rdx) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,112(%rdx) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,128(%rdx) + + + vmovdqa con2(%rip),%xmm0 + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,144(%rdx) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenclast %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,160(%rdx) + + + vmovdqa %xmm4,0(%rsi) + .byte 0xf3,0xc3 +.cfi_endproc +.size aes128gcmsiv_aes_ks_enc_x1,.-aes128gcmsiv_aes_ks_enc_x1 +.globl aes128gcmsiv_kdf +.hidden aes128gcmsiv_kdf +.type aes128gcmsiv_kdf,@function +.align 16 +aes128gcmsiv_kdf: +.cfi_startproc + + + + + vmovdqa (%rdx),%xmm1 + vmovdqa 0(%rdi),%xmm9 + vmovdqa and_mask(%rip),%xmm12 + vmovdqa one(%rip),%xmm13 + vpshufd $0x90,%xmm9,%xmm9 + vpand %xmm12,%xmm9,%xmm9 + vpaddd %xmm13,%xmm9,%xmm10 + vpaddd %xmm13,%xmm10,%xmm11 + vpaddd %xmm13,%xmm11,%xmm12 + + vpxor %xmm1,%xmm9,%xmm9 + vpxor %xmm1,%xmm10,%xmm10 + vpxor %xmm1,%xmm11,%xmm11 + vpxor %xmm1,%xmm12,%xmm12 + + vmovdqa 16(%rdx),%xmm1 + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + + vmovdqa 32(%rdx),%xmm2 + vaesenc %xmm2,%xmm9,%xmm9 + vaesenc %xmm2,%xmm10,%xmm10 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + + vmovdqa 48(%rdx),%xmm1 + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + + vmovdqa 64(%rdx),%xmm2 + vaesenc %xmm2,%xmm9,%xmm9 + vaesenc %xmm2,%xmm10,%xmm10 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + + vmovdqa 80(%rdx),%xmm1 + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + + vmovdqa 96(%rdx),%xmm2 + vaesenc %xmm2,%xmm9,%xmm9 + vaesenc %xmm2,%xmm10,%xmm10 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + + vmovdqa 112(%rdx),%xmm1 + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + + vmovdqa 128(%rdx),%xmm2 + vaesenc %xmm2,%xmm9,%xmm9 + vaesenc %xmm2,%xmm10,%xmm10 + vaesenc 
%xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + + vmovdqa 144(%rdx),%xmm1 + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + + vmovdqa 160(%rdx),%xmm2 + vaesenclast %xmm2,%xmm9,%xmm9 + vaesenclast %xmm2,%xmm10,%xmm10 + vaesenclast %xmm2,%xmm11,%xmm11 + vaesenclast %xmm2,%xmm12,%xmm12 + + + vmovdqa %xmm9,0(%rsi) + vmovdqa %xmm10,16(%rsi) + vmovdqa %xmm11,32(%rsi) + vmovdqa %xmm12,48(%rsi) + .byte 0xf3,0xc3 +.cfi_endproc +.size aes128gcmsiv_kdf,.-aes128gcmsiv_kdf +.globl aes128gcmsiv_enc_msg_x4 +.hidden aes128gcmsiv_enc_msg_x4 +.type aes128gcmsiv_enc_msg_x4,@function +.align 16 +aes128gcmsiv_enc_msg_x4: +.cfi_startproc + testq %r8,%r8 + jnz .L128_enc_msg_x4_start + .byte 0xf3,0xc3 + +.L128_enc_msg_x4_start: + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-24 + + shrq $4,%r8 + movq %r8,%r10 + shlq $62,%r10 + shrq $62,%r10 + + + vmovdqa (%rdx),%xmm15 + vpor OR_MASK(%rip),%xmm15,%xmm15 + + vmovdqu four(%rip),%xmm4 + vmovdqa %xmm15,%xmm0 + vpaddd one(%rip),%xmm15,%xmm1 + vpaddd two(%rip),%xmm15,%xmm2 + vpaddd three(%rip),%xmm15,%xmm3 + + shrq $2,%r8 + je .L128_enc_msg_x4_check_remainder + + subq $64,%rsi + subq $64,%rdi + +.L128_enc_msg_x4_loop1: + addq $64,%rsi + addq $64,%rdi + + vmovdqa %xmm0,%xmm5 + vmovdqa %xmm1,%xmm6 + vmovdqa %xmm2,%xmm7 + vmovdqa %xmm3,%xmm8 + + vpxor (%rcx),%xmm5,%xmm5 + vpxor (%rcx),%xmm6,%xmm6 + vpxor (%rcx),%xmm7,%xmm7 + vpxor (%rcx),%xmm8,%xmm8 + + vmovdqu 16(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vpaddd %xmm4,%xmm0,%xmm0 + vmovdqu 32(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vpaddd %xmm4,%xmm1,%xmm1 + vmovdqu 48(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vpaddd %xmm4,%xmm2,%xmm2 + vmovdqu 64(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vpaddd %xmm4,%xmm3,%xmm3 + + vmovdqu 80(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 96(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 112(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 128(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 144(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 160(%rcx),%xmm12 + vaesenclast %xmm12,%xmm5,%xmm5 + vaesenclast %xmm12,%xmm6,%xmm6 + vaesenclast %xmm12,%xmm7,%xmm7 + vaesenclast %xmm12,%xmm8,%xmm8 + + + + vpxor 0(%rdi),%xmm5,%xmm5 + vpxor 16(%rdi),%xmm6,%xmm6 + vpxor 32(%rdi),%xmm7,%xmm7 + vpxor 48(%rdi),%xmm8,%xmm8 + + subq $1,%r8 + + vmovdqu %xmm5,0(%rsi) + vmovdqu %xmm6,16(%rsi) + vmovdqu %xmm7,32(%rsi) + vmovdqu %xmm8,48(%rsi) + + jne .L128_enc_msg_x4_loop1 + + addq $64,%rsi + addq $64,%rdi + +.L128_enc_msg_x4_check_remainder: + cmpq $0,%r10 + je .L128_enc_msg_x4_out + +.L128_enc_msg_x4_loop2: + + + vmovdqa %xmm0,%xmm5 + vpaddd 
one(%rip),%xmm0,%xmm0 + + vpxor (%rcx),%xmm5,%xmm5 + vaesenc 16(%rcx),%xmm5,%xmm5 + vaesenc 32(%rcx),%xmm5,%xmm5 + vaesenc 48(%rcx),%xmm5,%xmm5 + vaesenc 64(%rcx),%xmm5,%xmm5 + vaesenc 80(%rcx),%xmm5,%xmm5 + vaesenc 96(%rcx),%xmm5,%xmm5 + vaesenc 112(%rcx),%xmm5,%xmm5 + vaesenc 128(%rcx),%xmm5,%xmm5 + vaesenc 144(%rcx),%xmm5,%xmm5 + vaesenclast 160(%rcx),%xmm5,%xmm5 + + + vpxor (%rdi),%xmm5,%xmm5 + vmovdqu %xmm5,(%rsi) + + addq $16,%rdi + addq $16,%rsi + + subq $1,%r10 + jne .L128_enc_msg_x4_loop2 + +.L128_enc_msg_x4_out: + popq %r13 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r13 + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + .byte 0xf3,0xc3 +.cfi_endproc +.size aes128gcmsiv_enc_msg_x4,.-aes128gcmsiv_enc_msg_x4 +.globl aes128gcmsiv_enc_msg_x8 +.hidden aes128gcmsiv_enc_msg_x8 +.type aes128gcmsiv_enc_msg_x8,@function +.align 16 +aes128gcmsiv_enc_msg_x8: +.cfi_startproc + testq %r8,%r8 + jnz .L128_enc_msg_x8_start + .byte 0xf3,0xc3 + +.L128_enc_msg_x8_start: + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-24 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-32 + movq %rsp,%rbp +.cfi_def_cfa_register rbp + + + subq $128,%rsp + andq $-64,%rsp + + shrq $4,%r8 + movq %r8,%r10 + shlq $61,%r10 + shrq $61,%r10 + + + vmovdqu (%rdx),%xmm1 + vpor OR_MASK(%rip),%xmm1,%xmm1 + + + vpaddd seven(%rip),%xmm1,%xmm0 + vmovdqu %xmm0,(%rsp) + vpaddd one(%rip),%xmm1,%xmm9 + vpaddd two(%rip),%xmm1,%xmm10 + vpaddd three(%rip),%xmm1,%xmm11 + vpaddd four(%rip),%xmm1,%xmm12 + vpaddd five(%rip),%xmm1,%xmm13 + vpaddd six(%rip),%xmm1,%xmm14 + vmovdqa %xmm1,%xmm0 + + shrq $3,%r8 + je .L128_enc_msg_x8_check_remainder + + subq $128,%rsi + subq $128,%rdi + +.L128_enc_msg_x8_loop1: + addq $128,%rsi + addq $128,%rdi + + vmovdqa %xmm0,%xmm1 + vmovdqa %xmm9,%xmm2 + vmovdqa %xmm10,%xmm3 + vmovdqa %xmm11,%xmm4 + vmovdqa %xmm12,%xmm5 + vmovdqa %xmm13,%xmm6 + vmovdqa %xmm14,%xmm7 + + vmovdqu (%rsp),%xmm8 + + vpxor (%rcx),%xmm1,%xmm1 + vpxor (%rcx),%xmm2,%xmm2 + vpxor (%rcx),%xmm3,%xmm3 + vpxor (%rcx),%xmm4,%xmm4 + vpxor (%rcx),%xmm5,%xmm5 + vpxor (%rcx),%xmm6,%xmm6 + vpxor (%rcx),%xmm7,%xmm7 + vpxor (%rcx),%xmm8,%xmm8 + + vmovdqu 16(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vmovdqu (%rsp),%xmm14 + vpaddd eight(%rip),%xmm14,%xmm14 + vmovdqu %xmm14,(%rsp) + vmovdqu 32(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpsubd one(%rip),%xmm14,%xmm14 + vmovdqu 48(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm0,%xmm0 + vmovdqu 64(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm9,%xmm9 + vmovdqu 80(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc 
%xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm10,%xmm10 + vmovdqu 96(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm11,%xmm11 + vmovdqu 112(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm12,%xmm12 + vmovdqu 128(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm13,%xmm13 + vmovdqu 144(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vmovdqu 160(%rcx),%xmm15 + vaesenclast %xmm15,%xmm1,%xmm1 + vaesenclast %xmm15,%xmm2,%xmm2 + vaesenclast %xmm15,%xmm3,%xmm3 + vaesenclast %xmm15,%xmm4,%xmm4 + vaesenclast %xmm15,%xmm5,%xmm5 + vaesenclast %xmm15,%xmm6,%xmm6 + vaesenclast %xmm15,%xmm7,%xmm7 + vaesenclast %xmm15,%xmm8,%xmm8 + + + + vpxor 0(%rdi),%xmm1,%xmm1 + vpxor 16(%rdi),%xmm2,%xmm2 + vpxor 32(%rdi),%xmm3,%xmm3 + vpxor 48(%rdi),%xmm4,%xmm4 + vpxor 64(%rdi),%xmm5,%xmm5 + vpxor 80(%rdi),%xmm6,%xmm6 + vpxor 96(%rdi),%xmm7,%xmm7 + vpxor 112(%rdi),%xmm8,%xmm8 + + decq %r8 + + vmovdqu %xmm1,0(%rsi) + vmovdqu %xmm2,16(%rsi) + vmovdqu %xmm3,32(%rsi) + vmovdqu %xmm4,48(%rsi) + vmovdqu %xmm5,64(%rsi) + vmovdqu %xmm6,80(%rsi) + vmovdqu %xmm7,96(%rsi) + vmovdqu %xmm8,112(%rsi) + + jne .L128_enc_msg_x8_loop1 + + addq $128,%rsi + addq $128,%rdi + +.L128_enc_msg_x8_check_remainder: + cmpq $0,%r10 + je .L128_enc_msg_x8_out + +.L128_enc_msg_x8_loop2: + + + vmovdqa %xmm0,%xmm1 + vpaddd one(%rip),%xmm0,%xmm0 + + vpxor (%rcx),%xmm1,%xmm1 + vaesenc 16(%rcx),%xmm1,%xmm1 + vaesenc 32(%rcx),%xmm1,%xmm1 + vaesenc 48(%rcx),%xmm1,%xmm1 + vaesenc 64(%rcx),%xmm1,%xmm1 + vaesenc 80(%rcx),%xmm1,%xmm1 + vaesenc 96(%rcx),%xmm1,%xmm1 + vaesenc 112(%rcx),%xmm1,%xmm1 + vaesenc 128(%rcx),%xmm1,%xmm1 + vaesenc 144(%rcx),%xmm1,%xmm1 + vaesenclast 160(%rcx),%xmm1,%xmm1 + + + vpxor (%rdi),%xmm1,%xmm1 + + vmovdqu %xmm1,(%rsi) + + addq $16,%rdi + addq $16,%rsi + + decq %r10 + jne .L128_enc_msg_x8_loop2 + +.L128_enc_msg_x8_out: + movq %rbp,%rsp +.cfi_def_cfa_register %rsp + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp + popq %r13 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r13 + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + .byte 0xf3,0xc3 +.cfi_endproc +.size aes128gcmsiv_enc_msg_x8,.-aes128gcmsiv_enc_msg_x8 +.globl aes128gcmsiv_dec +.hidden aes128gcmsiv_dec +.type aes128gcmsiv_dec,@function +.align 16 +aes128gcmsiv_dec: +.cfi_startproc + testq $~15,%r9 + jnz .L128_dec_start + .byte 0xf3,0xc3 + +.L128_dec_start: + vzeroupper + vmovdqa (%rdx),%xmm0 + movq %rdx,%rax + + leaq 32(%rax),%rax + leaq 32(%rcx),%rcx + + + vmovdqu (%rdi,%r9,1),%xmm15 + vpor OR_MASK(%rip),%xmm15,%xmm15 + andq $~15,%r9 + + + cmpq $96,%r9 + jb .L128_dec_loop2 + + + subq $96,%r9 
+ vmovdqa %xmm15,%xmm7 + vpaddd one(%rip),%xmm7,%xmm8 + vpaddd two(%rip),%xmm7,%xmm9 + vpaddd one(%rip),%xmm9,%xmm10 + vpaddd two(%rip),%xmm9,%xmm11 + vpaddd one(%rip),%xmm11,%xmm12 + vpaddd two(%rip),%xmm11,%xmm15 + + vpxor (%r8),%xmm7,%xmm7 + vpxor (%r8),%xmm8,%xmm8 + vpxor (%r8),%xmm9,%xmm9 + vpxor (%r8),%xmm10,%xmm10 + vpxor (%r8),%xmm11,%xmm11 + vpxor (%r8),%xmm12,%xmm12 + + vmovdqu 16(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 32(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 48(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 64(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 80(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 96(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 112(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 128(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 144(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 160(%r8),%xmm4 + vaesenclast %xmm4,%xmm7,%xmm7 + vaesenclast %xmm4,%xmm8,%xmm8 + vaesenclast %xmm4,%xmm9,%xmm9 + vaesenclast %xmm4,%xmm10,%xmm10 + vaesenclast %xmm4,%xmm11,%xmm11 + vaesenclast %xmm4,%xmm12,%xmm12 + + + vpxor 0(%rdi),%xmm7,%xmm7 + vpxor 16(%rdi),%xmm8,%xmm8 + vpxor 32(%rdi),%xmm9,%xmm9 + vpxor 48(%rdi),%xmm10,%xmm10 + vpxor 64(%rdi),%xmm11,%xmm11 + vpxor 80(%rdi),%xmm12,%xmm12 + + vmovdqu %xmm7,0(%rsi) + vmovdqu %xmm8,16(%rsi) + vmovdqu %xmm9,32(%rsi) + vmovdqu %xmm10,48(%rsi) + vmovdqu %xmm11,64(%rsi) + vmovdqu %xmm12,80(%rsi) + + addq $96,%rdi + addq $96,%rsi + jmp .L128_dec_loop1 + + +.align 64 +.L128_dec_loop1: + cmpq $96,%r9 + jb .L128_dec_finish_96 + subq $96,%r9 + + vmovdqa %xmm12,%xmm6 + vmovdqa %xmm11,16-32(%rax) + vmovdqa %xmm10,32-32(%rax) + vmovdqa %xmm9,48-32(%rax) + vmovdqa %xmm8,64-32(%rax) + vmovdqa %xmm7,80-32(%rax) + + vmovdqa %xmm15,%xmm7 + vpaddd one(%rip),%xmm7,%xmm8 + vpaddd two(%rip),%xmm7,%xmm9 + vpaddd one(%rip),%xmm9,%xmm10 + vpaddd two(%rip),%xmm9,%xmm11 + vpaddd one(%rip),%xmm11,%xmm12 + vpaddd two(%rip),%xmm11,%xmm15 + + vmovdqa (%r8),%xmm4 + vpxor %xmm4,%xmm7,%xmm7 + vpxor %xmm4,%xmm8,%xmm8 + vpxor %xmm4,%xmm9,%xmm9 + vpxor %xmm4,%xmm10,%xmm10 + vpxor %xmm4,%xmm11,%xmm11 + vpxor %xmm4,%xmm12,%xmm12 + + vmovdqu 0-32(%rcx),%xmm4 + vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2 + vpclmulqdq 
$0x00,%xmm4,%xmm6,%xmm3 + vpclmulqdq $0x01,%xmm4,%xmm6,%xmm1 + vpclmulqdq $0x10,%xmm4,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 16(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu -16(%rax),%xmm6 + vmovdqu -16(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 32(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 0(%rax),%xmm6 + vmovdqu 0(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 48(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 16(%rax),%xmm6 + vmovdqu 16(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 64(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 32(%rax),%xmm6 + vmovdqu 32(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 80(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 96(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 112(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + + vmovdqa 80-32(%rax),%xmm6 + vpxor %xmm0,%xmm6,%xmm6 + vmovdqu 80-32(%rcx),%xmm5 + + vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 128(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + + vpsrldq $8,%xmm1,%xmm4 + vpxor %xmm4,%xmm2,%xmm5 + vpslldq $8,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm0 + + vmovdqa poly(%rip),%xmm3 + + vmovdqu 144(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc 
%xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 160(%r8),%xmm6 + vpalignr $8,%xmm0,%xmm0,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm2,%xmm0 + + vpxor 0(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm7,%xmm7 + vpxor 16(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm8,%xmm8 + vpxor 32(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm9,%xmm9 + vpxor 48(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm10,%xmm10 + vpxor 64(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm11,%xmm11 + vpxor 80(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm12,%xmm12 + + vpalignr $8,%xmm0,%xmm0,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm2,%xmm0 + + vmovdqu %xmm7,0(%rsi) + vmovdqu %xmm8,16(%rsi) + vmovdqu %xmm9,32(%rsi) + vmovdqu %xmm10,48(%rsi) + vmovdqu %xmm11,64(%rsi) + vmovdqu %xmm12,80(%rsi) + + vpxor %xmm5,%xmm0,%xmm0 + + leaq 96(%rdi),%rdi + leaq 96(%rsi),%rsi + jmp .L128_dec_loop1 + +.L128_dec_finish_96: + vmovdqa %xmm12,%xmm6 + vmovdqa %xmm11,16-32(%rax) + vmovdqa %xmm10,32-32(%rax) + vmovdqa %xmm9,48-32(%rax) + vmovdqa %xmm8,64-32(%rax) + vmovdqa %xmm7,80-32(%rax) + + vmovdqu 0-32(%rcx),%xmm4 + vpclmulqdq $0x10,%xmm4,%xmm6,%xmm1 + vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2 + vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3 + vpclmulqdq $0x01,%xmm4,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu -16(%rax),%xmm6 + vmovdqu -16(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 0(%rax),%xmm6 + vmovdqu 0(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 16(%rax),%xmm6 + vmovdqu 16(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 32(%rax),%xmm6 + vmovdqu 32(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 80-32(%rax),%xmm6 + vpxor %xmm0,%xmm6,%xmm6 + vmovdqu 80-32(%rcx),%xmm5 + vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vpsrldq $8,%xmm1,%xmm4 + vpxor %xmm4,%xmm2,%xmm5 + vpslldq $8,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm0 + + vmovdqa poly(%rip),%xmm3 + + vpalignr $8,%xmm0,%xmm0,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm2,%xmm0 + + vpalignr $8,%xmm0,%xmm0,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm2,%xmm0 + + vpxor %xmm5,%xmm0,%xmm0 + +.L128_dec_loop2: + + + + cmpq $16,%r9 + jb .L128_dec_out + subq $16,%r9 + + vmovdqa %xmm15,%xmm2 + vpaddd one(%rip),%xmm15,%xmm15 + + vpxor 0(%r8),%xmm2,%xmm2 + vaesenc 16(%r8),%xmm2,%xmm2 + vaesenc 32(%r8),%xmm2,%xmm2 + vaesenc 48(%r8),%xmm2,%xmm2 + vaesenc 
64(%r8),%xmm2,%xmm2 + vaesenc 80(%r8),%xmm2,%xmm2 + vaesenc 96(%r8),%xmm2,%xmm2 + vaesenc 112(%r8),%xmm2,%xmm2 + vaesenc 128(%r8),%xmm2,%xmm2 + vaesenc 144(%r8),%xmm2,%xmm2 + vaesenclast 160(%r8),%xmm2,%xmm2 + vpxor (%rdi),%xmm2,%xmm2 + vmovdqu %xmm2,(%rsi) + addq $16,%rdi + addq $16,%rsi + + vpxor %xmm2,%xmm0,%xmm0 + vmovdqa -32(%rcx),%xmm1 + call GFMUL + + jmp .L128_dec_loop2 + +.L128_dec_out: + vmovdqu %xmm0,(%rdx) + .byte 0xf3,0xc3 +.cfi_endproc +.size aes128gcmsiv_dec, .-aes128gcmsiv_dec +.globl aes128gcmsiv_ecb_enc_block +.hidden aes128gcmsiv_ecb_enc_block +.type aes128gcmsiv_ecb_enc_block,@function +.align 16 +aes128gcmsiv_ecb_enc_block: +.cfi_startproc + vmovdqa (%rdi),%xmm1 + + vpxor (%rdx),%xmm1,%xmm1 + vaesenc 16(%rdx),%xmm1,%xmm1 + vaesenc 32(%rdx),%xmm1,%xmm1 + vaesenc 48(%rdx),%xmm1,%xmm1 + vaesenc 64(%rdx),%xmm1,%xmm1 + vaesenc 80(%rdx),%xmm1,%xmm1 + vaesenc 96(%rdx),%xmm1,%xmm1 + vaesenc 112(%rdx),%xmm1,%xmm1 + vaesenc 128(%rdx),%xmm1,%xmm1 + vaesenc 144(%rdx),%xmm1,%xmm1 + vaesenclast 160(%rdx),%xmm1,%xmm1 + + vmovdqa %xmm1,(%rsi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size aes128gcmsiv_ecb_enc_block,.-aes128gcmsiv_ecb_enc_block +.globl aes256gcmsiv_aes_ks_enc_x1 +.hidden aes256gcmsiv_aes_ks_enc_x1 +.type aes256gcmsiv_aes_ks_enc_x1,@function +.align 16 +aes256gcmsiv_aes_ks_enc_x1: +.cfi_startproc + vmovdqa con1(%rip),%xmm0 + vmovdqa mask(%rip),%xmm15 + vmovdqa (%rdi),%xmm8 + vmovdqa (%rcx),%xmm1 + vmovdqa 16(%rcx),%xmm3 + vpxor %xmm1,%xmm8,%xmm8 + vaesenc %xmm3,%xmm8,%xmm8 + vmovdqu %xmm1,(%rdx) + vmovdqu %xmm3,16(%rdx) + vpxor %xmm14,%xmm14,%xmm14 + + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpslldq $4,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vaesenc %xmm1,%xmm8,%xmm8 + vmovdqu %xmm1,32(%rdx) + + vpshufd $0xff,%xmm1,%xmm2 + vaesenclast %xmm14,%xmm2,%xmm2 + vpslldq $4,%xmm3,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpxor %xmm2,%xmm3,%xmm3 + vaesenc %xmm3,%xmm8,%xmm8 + vmovdqu %xmm3,48(%rdx) + + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpslldq $4,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vaesenc %xmm1,%xmm8,%xmm8 + vmovdqu %xmm1,64(%rdx) + + vpshufd $0xff,%xmm1,%xmm2 + vaesenclast %xmm14,%xmm2,%xmm2 + vpslldq $4,%xmm3,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpxor %xmm2,%xmm3,%xmm3 + vaesenc %xmm3,%xmm8,%xmm8 + vmovdqu %xmm3,80(%rdx) + + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpslldq $4,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vaesenc %xmm1,%xmm8,%xmm8 + vmovdqu %xmm1,96(%rdx) + + vpshufd $0xff,%xmm1,%xmm2 + vaesenclast %xmm14,%xmm2,%xmm2 + vpslldq $4,%xmm3,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpxor %xmm2,%xmm3,%xmm3 + vaesenc %xmm3,%xmm8,%xmm8 + vmovdqu %xmm3,112(%rdx) + + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpslldq $4,%xmm1,%xmm4 + vpxor 
%xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vaesenc %xmm1,%xmm8,%xmm8 + vmovdqu %xmm1,128(%rdx) + + vpshufd $0xff,%xmm1,%xmm2 + vaesenclast %xmm14,%xmm2,%xmm2 + vpslldq $4,%xmm3,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpxor %xmm2,%xmm3,%xmm3 + vaesenc %xmm3,%xmm8,%xmm8 + vmovdqu %xmm3,144(%rdx) + + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpslldq $4,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vaesenc %xmm1,%xmm8,%xmm8 + vmovdqu %xmm1,160(%rdx) + + vpshufd $0xff,%xmm1,%xmm2 + vaesenclast %xmm14,%xmm2,%xmm2 + vpslldq $4,%xmm3,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpxor %xmm2,%xmm3,%xmm3 + vaesenc %xmm3,%xmm8,%xmm8 + vmovdqu %xmm3,176(%rdx) + + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpslldq $4,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vaesenc %xmm1,%xmm8,%xmm8 + vmovdqu %xmm1,192(%rdx) + + vpshufd $0xff,%xmm1,%xmm2 + vaesenclast %xmm14,%xmm2,%xmm2 + vpslldq $4,%xmm3,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpxor %xmm2,%xmm3,%xmm3 + vaesenc %xmm3,%xmm8,%xmm8 + vmovdqu %xmm3,208(%rdx) + + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslldq $4,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vaesenclast %xmm1,%xmm8,%xmm8 + vmovdqu %xmm1,224(%rdx) + + vmovdqa %xmm8,(%rsi) + .byte 0xf3,0xc3 +.cfi_endproc +.size aes256gcmsiv_aes_ks_enc_x1,.-aes256gcmsiv_aes_ks_enc_x1 +.globl aes256gcmsiv_ecb_enc_block +.hidden aes256gcmsiv_ecb_enc_block +.type aes256gcmsiv_ecb_enc_block,@function +.align 16 +aes256gcmsiv_ecb_enc_block: +.cfi_startproc + vmovdqa (%rdi),%xmm1 + vpxor (%rdx),%xmm1,%xmm1 + vaesenc 16(%rdx),%xmm1,%xmm1 + vaesenc 32(%rdx),%xmm1,%xmm1 + vaesenc 48(%rdx),%xmm1,%xmm1 + vaesenc 64(%rdx),%xmm1,%xmm1 + vaesenc 80(%rdx),%xmm1,%xmm1 + vaesenc 96(%rdx),%xmm1,%xmm1 + vaesenc 112(%rdx),%xmm1,%xmm1 + vaesenc 128(%rdx),%xmm1,%xmm1 + vaesenc 144(%rdx),%xmm1,%xmm1 + vaesenc 160(%rdx),%xmm1,%xmm1 + vaesenc 176(%rdx),%xmm1,%xmm1 + vaesenc 192(%rdx),%xmm1,%xmm1 + vaesenc 208(%rdx),%xmm1,%xmm1 + vaesenclast 224(%rdx),%xmm1,%xmm1 + vmovdqa %xmm1,(%rsi) + .byte 0xf3,0xc3 +.cfi_endproc +.size aes256gcmsiv_ecb_enc_block,.-aes256gcmsiv_ecb_enc_block +.globl aes256gcmsiv_enc_msg_x4 +.hidden aes256gcmsiv_enc_msg_x4 +.type aes256gcmsiv_enc_msg_x4,@function +.align 16 +aes256gcmsiv_enc_msg_x4: +.cfi_startproc + testq %r8,%r8 + jnz .L256_enc_msg_x4_start + .byte 0xf3,0xc3 + +.L256_enc_msg_x4_start: + movq %r8,%r10 + shrq $4,%r8 + shlq $60,%r10 + jz .L256_enc_msg_x4_start2 + addq $1,%r8 + +.L256_enc_msg_x4_start2: + movq %r8,%r10 + shlq $62,%r10 + shrq $62,%r10 + + + vmovdqa (%rdx),%xmm15 + vpor OR_MASK(%rip),%xmm15,%xmm15 + + vmovdqa four(%rip),%xmm4 + vmovdqa %xmm15,%xmm0 + vpaddd one(%rip),%xmm15,%xmm1 + vpaddd two(%rip),%xmm15,%xmm2 + vpaddd 
three(%rip),%xmm15,%xmm3 + + shrq $2,%r8 + je .L256_enc_msg_x4_check_remainder + + subq $64,%rsi + subq $64,%rdi + +.L256_enc_msg_x4_loop1: + addq $64,%rsi + addq $64,%rdi + + vmovdqa %xmm0,%xmm5 + vmovdqa %xmm1,%xmm6 + vmovdqa %xmm2,%xmm7 + vmovdqa %xmm3,%xmm8 + + vpxor (%rcx),%xmm5,%xmm5 + vpxor (%rcx),%xmm6,%xmm6 + vpxor (%rcx),%xmm7,%xmm7 + vpxor (%rcx),%xmm8,%xmm8 + + vmovdqu 16(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vpaddd %xmm4,%xmm0,%xmm0 + vmovdqu 32(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vpaddd %xmm4,%xmm1,%xmm1 + vmovdqu 48(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vpaddd %xmm4,%xmm2,%xmm2 + vmovdqu 64(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vpaddd %xmm4,%xmm3,%xmm3 + + vmovdqu 80(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 96(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 112(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 128(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 144(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 160(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 176(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 192(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 208(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 224(%rcx),%xmm12 + vaesenclast %xmm12,%xmm5,%xmm5 + vaesenclast %xmm12,%xmm6,%xmm6 + vaesenclast %xmm12,%xmm7,%xmm7 + vaesenclast %xmm12,%xmm8,%xmm8 + + + + vpxor 0(%rdi),%xmm5,%xmm5 + vpxor 16(%rdi),%xmm6,%xmm6 + vpxor 32(%rdi),%xmm7,%xmm7 + vpxor 48(%rdi),%xmm8,%xmm8 + + subq $1,%r8 + + vmovdqu %xmm5,0(%rsi) + vmovdqu %xmm6,16(%rsi) + vmovdqu %xmm7,32(%rsi) + vmovdqu %xmm8,48(%rsi) + + jne .L256_enc_msg_x4_loop1 + + addq $64,%rsi + addq $64,%rdi + +.L256_enc_msg_x4_check_remainder: + cmpq $0,%r10 + je .L256_enc_msg_x4_out + +.L256_enc_msg_x4_loop2: + + + + vmovdqa %xmm0,%xmm5 + vpaddd one(%rip),%xmm0,%xmm0 + vpxor (%rcx),%xmm5,%xmm5 + vaesenc 16(%rcx),%xmm5,%xmm5 + vaesenc 32(%rcx),%xmm5,%xmm5 + vaesenc 48(%rcx),%xmm5,%xmm5 + vaesenc 64(%rcx),%xmm5,%xmm5 + vaesenc 80(%rcx),%xmm5,%xmm5 + vaesenc 96(%rcx),%xmm5,%xmm5 + vaesenc 112(%rcx),%xmm5,%xmm5 + vaesenc 128(%rcx),%xmm5,%xmm5 + vaesenc 144(%rcx),%xmm5,%xmm5 + vaesenc 160(%rcx),%xmm5,%xmm5 + vaesenc 176(%rcx),%xmm5,%xmm5 + vaesenc 192(%rcx),%xmm5,%xmm5 + vaesenc 208(%rcx),%xmm5,%xmm5 + vaesenclast 224(%rcx),%xmm5,%xmm5 + + + vpxor (%rdi),%xmm5,%xmm5 + + vmovdqu %xmm5,(%rsi) + + addq $16,%rdi + addq $16,%rsi + 
+ subq $1,%r10 + jne .L256_enc_msg_x4_loop2 + +.L256_enc_msg_x4_out: + .byte 0xf3,0xc3 +.cfi_endproc +.size aes256gcmsiv_enc_msg_x4,.-aes256gcmsiv_enc_msg_x4 +.globl aes256gcmsiv_enc_msg_x8 +.hidden aes256gcmsiv_enc_msg_x8 +.type aes256gcmsiv_enc_msg_x8,@function +.align 16 +aes256gcmsiv_enc_msg_x8: +.cfi_startproc + testq %r8,%r8 + jnz .L256_enc_msg_x8_start + .byte 0xf3,0xc3 + +.L256_enc_msg_x8_start: + + movq %rsp,%r11 + subq $16,%r11 + andq $-64,%r11 + + movq %r8,%r10 + shrq $4,%r8 + shlq $60,%r10 + jz .L256_enc_msg_x8_start2 + addq $1,%r8 + +.L256_enc_msg_x8_start2: + movq %r8,%r10 + shlq $61,%r10 + shrq $61,%r10 + + + vmovdqa (%rdx),%xmm1 + vpor OR_MASK(%rip),%xmm1,%xmm1 + + + vpaddd seven(%rip),%xmm1,%xmm0 + vmovdqa %xmm0,(%r11) + vpaddd one(%rip),%xmm1,%xmm9 + vpaddd two(%rip),%xmm1,%xmm10 + vpaddd three(%rip),%xmm1,%xmm11 + vpaddd four(%rip),%xmm1,%xmm12 + vpaddd five(%rip),%xmm1,%xmm13 + vpaddd six(%rip),%xmm1,%xmm14 + vmovdqa %xmm1,%xmm0 + + shrq $3,%r8 + jz .L256_enc_msg_x8_check_remainder + + subq $128,%rsi + subq $128,%rdi + +.L256_enc_msg_x8_loop1: + addq $128,%rsi + addq $128,%rdi + + vmovdqa %xmm0,%xmm1 + vmovdqa %xmm9,%xmm2 + vmovdqa %xmm10,%xmm3 + vmovdqa %xmm11,%xmm4 + vmovdqa %xmm12,%xmm5 + vmovdqa %xmm13,%xmm6 + vmovdqa %xmm14,%xmm7 + + vmovdqa (%r11),%xmm8 + + vpxor (%rcx),%xmm1,%xmm1 + vpxor (%rcx),%xmm2,%xmm2 + vpxor (%rcx),%xmm3,%xmm3 + vpxor (%rcx),%xmm4,%xmm4 + vpxor (%rcx),%xmm5,%xmm5 + vpxor (%rcx),%xmm6,%xmm6 + vpxor (%rcx),%xmm7,%xmm7 + vpxor (%rcx),%xmm8,%xmm8 + + vmovdqu 16(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vmovdqa (%r11),%xmm14 + vpaddd eight(%rip),%xmm14,%xmm14 + vmovdqa %xmm14,(%r11) + vmovdqu 32(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpsubd one(%rip),%xmm14,%xmm14 + vmovdqu 48(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm0,%xmm0 + vmovdqu 64(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm9,%xmm9 + vmovdqu 80(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm10,%xmm10 + vmovdqu 96(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm11,%xmm11 + vmovdqu 112(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc 
%xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm12,%xmm12 + vmovdqu 128(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm13,%xmm13 + vmovdqu 144(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vmovdqu 160(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vmovdqu 176(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vmovdqu 192(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vmovdqu 208(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vmovdqu 224(%rcx),%xmm15 + vaesenclast %xmm15,%xmm1,%xmm1 + vaesenclast %xmm15,%xmm2,%xmm2 + vaesenclast %xmm15,%xmm3,%xmm3 + vaesenclast %xmm15,%xmm4,%xmm4 + vaesenclast %xmm15,%xmm5,%xmm5 + vaesenclast %xmm15,%xmm6,%xmm6 + vaesenclast %xmm15,%xmm7,%xmm7 + vaesenclast %xmm15,%xmm8,%xmm8 + + + + vpxor 0(%rdi),%xmm1,%xmm1 + vpxor 16(%rdi),%xmm2,%xmm2 + vpxor 32(%rdi),%xmm3,%xmm3 + vpxor 48(%rdi),%xmm4,%xmm4 + vpxor 64(%rdi),%xmm5,%xmm5 + vpxor 80(%rdi),%xmm6,%xmm6 + vpxor 96(%rdi),%xmm7,%xmm7 + vpxor 112(%rdi),%xmm8,%xmm8 + + subq $1,%r8 + + vmovdqu %xmm1,0(%rsi) + vmovdqu %xmm2,16(%rsi) + vmovdqu %xmm3,32(%rsi) + vmovdqu %xmm4,48(%rsi) + vmovdqu %xmm5,64(%rsi) + vmovdqu %xmm6,80(%rsi) + vmovdqu %xmm7,96(%rsi) + vmovdqu %xmm8,112(%rsi) + + jne .L256_enc_msg_x8_loop1 + + addq $128,%rsi + addq $128,%rdi + +.L256_enc_msg_x8_check_remainder: + cmpq $0,%r10 + je .L256_enc_msg_x8_out + +.L256_enc_msg_x8_loop2: + + + vmovdqa %xmm0,%xmm1 + vpaddd one(%rip),%xmm0,%xmm0 + + vpxor (%rcx),%xmm1,%xmm1 + vaesenc 16(%rcx),%xmm1,%xmm1 + vaesenc 32(%rcx),%xmm1,%xmm1 + vaesenc 48(%rcx),%xmm1,%xmm1 + vaesenc 64(%rcx),%xmm1,%xmm1 + vaesenc 80(%rcx),%xmm1,%xmm1 + vaesenc 96(%rcx),%xmm1,%xmm1 + vaesenc 112(%rcx),%xmm1,%xmm1 + vaesenc 128(%rcx),%xmm1,%xmm1 + vaesenc 144(%rcx),%xmm1,%xmm1 + vaesenc 160(%rcx),%xmm1,%xmm1 + vaesenc 176(%rcx),%xmm1,%xmm1 + vaesenc 192(%rcx),%xmm1,%xmm1 + vaesenc 208(%rcx),%xmm1,%xmm1 + vaesenclast 224(%rcx),%xmm1,%xmm1 + + + vpxor (%rdi),%xmm1,%xmm1 + + vmovdqu %xmm1,(%rsi) + + addq $16,%rdi + addq $16,%rsi + subq $1,%r10 + jnz .L256_enc_msg_x8_loop2 + +.L256_enc_msg_x8_out: + .byte 0xf3,0xc3 + +.cfi_endproc +.size aes256gcmsiv_enc_msg_x8,.-aes256gcmsiv_enc_msg_x8 +.globl aes256gcmsiv_dec +.hidden aes256gcmsiv_dec +.type aes256gcmsiv_dec,@function +.align 16 +aes256gcmsiv_dec: +.cfi_startproc + testq $~15,%r9 + jnz 
.L256_dec_start + .byte 0xf3,0xc3 + +.L256_dec_start: + vzeroupper + vmovdqa (%rdx),%xmm0 + movq %rdx,%rax + + leaq 32(%rax),%rax + leaq 32(%rcx),%rcx + + + vmovdqu (%rdi,%r9,1),%xmm15 + vpor OR_MASK(%rip),%xmm15,%xmm15 + andq $~15,%r9 + + + cmpq $96,%r9 + jb .L256_dec_loop2 + + + subq $96,%r9 + vmovdqa %xmm15,%xmm7 + vpaddd one(%rip),%xmm7,%xmm8 + vpaddd two(%rip),%xmm7,%xmm9 + vpaddd one(%rip),%xmm9,%xmm10 + vpaddd two(%rip),%xmm9,%xmm11 + vpaddd one(%rip),%xmm11,%xmm12 + vpaddd two(%rip),%xmm11,%xmm15 + + vpxor (%r8),%xmm7,%xmm7 + vpxor (%r8),%xmm8,%xmm8 + vpxor (%r8),%xmm9,%xmm9 + vpxor (%r8),%xmm10,%xmm10 + vpxor (%r8),%xmm11,%xmm11 + vpxor (%r8),%xmm12,%xmm12 + + vmovdqu 16(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 32(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 48(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 64(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 80(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 96(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 112(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 128(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 144(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 160(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 176(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 192(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 208(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 224(%r8),%xmm4 + vaesenclast %xmm4,%xmm7,%xmm7 + vaesenclast %xmm4,%xmm8,%xmm8 + vaesenclast %xmm4,%xmm9,%xmm9 + vaesenclast %xmm4,%xmm10,%xmm10 + vaesenclast %xmm4,%xmm11,%xmm11 + vaesenclast %xmm4,%xmm12,%xmm12 + + + vpxor 0(%rdi),%xmm7,%xmm7 + vpxor 
16(%rdi),%xmm8,%xmm8 + vpxor 32(%rdi),%xmm9,%xmm9 + vpxor 48(%rdi),%xmm10,%xmm10 + vpxor 64(%rdi),%xmm11,%xmm11 + vpxor 80(%rdi),%xmm12,%xmm12 + + vmovdqu %xmm7,0(%rsi) + vmovdqu %xmm8,16(%rsi) + vmovdqu %xmm9,32(%rsi) + vmovdqu %xmm10,48(%rsi) + vmovdqu %xmm11,64(%rsi) + vmovdqu %xmm12,80(%rsi) + + addq $96,%rdi + addq $96,%rsi + jmp .L256_dec_loop1 + + +.align 64 +.L256_dec_loop1: + cmpq $96,%r9 + jb .L256_dec_finish_96 + subq $96,%r9 + + vmovdqa %xmm12,%xmm6 + vmovdqa %xmm11,16-32(%rax) + vmovdqa %xmm10,32-32(%rax) + vmovdqa %xmm9,48-32(%rax) + vmovdqa %xmm8,64-32(%rax) + vmovdqa %xmm7,80-32(%rax) + + vmovdqa %xmm15,%xmm7 + vpaddd one(%rip),%xmm7,%xmm8 + vpaddd two(%rip),%xmm7,%xmm9 + vpaddd one(%rip),%xmm9,%xmm10 + vpaddd two(%rip),%xmm9,%xmm11 + vpaddd one(%rip),%xmm11,%xmm12 + vpaddd two(%rip),%xmm11,%xmm15 + + vmovdqa (%r8),%xmm4 + vpxor %xmm4,%xmm7,%xmm7 + vpxor %xmm4,%xmm8,%xmm8 + vpxor %xmm4,%xmm9,%xmm9 + vpxor %xmm4,%xmm10,%xmm10 + vpxor %xmm4,%xmm11,%xmm11 + vpxor %xmm4,%xmm12,%xmm12 + + vmovdqu 0-32(%rcx),%xmm4 + vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2 + vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3 + vpclmulqdq $0x01,%xmm4,%xmm6,%xmm1 + vpclmulqdq $0x10,%xmm4,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 16(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu -16(%rax),%xmm6 + vmovdqu -16(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 32(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 0(%rax),%xmm6 + vmovdqu 0(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 48(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 16(%rax),%xmm6 + vmovdqu 16(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 64(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 32(%rax),%xmm6 + vmovdqu 32(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 80(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 96(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + 
vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 112(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + + vmovdqa 80-32(%rax),%xmm6 + vpxor %xmm0,%xmm6,%xmm6 + vmovdqu 80-32(%rcx),%xmm5 + + vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 128(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + + vpsrldq $8,%xmm1,%xmm4 + vpxor %xmm4,%xmm2,%xmm5 + vpslldq $8,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm0 + + vmovdqa poly(%rip),%xmm3 + + vmovdqu 144(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 160(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 176(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 192(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 208(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 224(%r8),%xmm6 + vpalignr $8,%xmm0,%xmm0,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm2,%xmm0 + + vpxor 0(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm7,%xmm7 + vpxor 16(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm8,%xmm8 + vpxor 32(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm9,%xmm9 + vpxor 48(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm10,%xmm10 + vpxor 64(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm11,%xmm11 + vpxor 80(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm12,%xmm12 + + vpalignr $8,%xmm0,%xmm0,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm2,%xmm0 + + vmovdqu %xmm7,0(%rsi) + vmovdqu %xmm8,16(%rsi) + vmovdqu %xmm9,32(%rsi) + vmovdqu %xmm10,48(%rsi) + vmovdqu %xmm11,64(%rsi) + vmovdqu %xmm12,80(%rsi) + + vpxor %xmm5,%xmm0,%xmm0 + + leaq 96(%rdi),%rdi + leaq 96(%rsi),%rsi + jmp .L256_dec_loop1 + +.L256_dec_finish_96: + vmovdqa %xmm12,%xmm6 + vmovdqa %xmm11,16-32(%rax) + vmovdqa %xmm10,32-32(%rax) + vmovdqa %xmm9,48-32(%rax) + vmovdqa %xmm8,64-32(%rax) + vmovdqa %xmm7,80-32(%rax) + + vmovdqu 0-32(%rcx),%xmm4 + vpclmulqdq $0x10,%xmm4,%xmm6,%xmm1 + vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2 + vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3 + vpclmulqdq $0x01,%xmm4,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu -16(%rax),%xmm6 + vmovdqu -16(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 
0(%rax),%xmm6 + vmovdqu 0(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 16(%rax),%xmm6 + vmovdqu 16(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 32(%rax),%xmm6 + vmovdqu 32(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 80-32(%rax),%xmm6 + vpxor %xmm0,%xmm6,%xmm6 + vmovdqu 80-32(%rcx),%xmm5 + vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vpsrldq $8,%xmm1,%xmm4 + vpxor %xmm4,%xmm2,%xmm5 + vpslldq $8,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm0 + + vmovdqa poly(%rip),%xmm3 + + vpalignr $8,%xmm0,%xmm0,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm2,%xmm0 + + vpalignr $8,%xmm0,%xmm0,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm2,%xmm0 + + vpxor %xmm5,%xmm0,%xmm0 + +.L256_dec_loop2: + + + + cmpq $16,%r9 + jb .L256_dec_out + subq $16,%r9 + + vmovdqa %xmm15,%xmm2 + vpaddd one(%rip),%xmm15,%xmm15 + + vpxor 0(%r8),%xmm2,%xmm2 + vaesenc 16(%r8),%xmm2,%xmm2 + vaesenc 32(%r8),%xmm2,%xmm2 + vaesenc 48(%r8),%xmm2,%xmm2 + vaesenc 64(%r8),%xmm2,%xmm2 + vaesenc 80(%r8),%xmm2,%xmm2 + vaesenc 96(%r8),%xmm2,%xmm2 + vaesenc 112(%r8),%xmm2,%xmm2 + vaesenc 128(%r8),%xmm2,%xmm2 + vaesenc 144(%r8),%xmm2,%xmm2 + vaesenc 160(%r8),%xmm2,%xmm2 + vaesenc 176(%r8),%xmm2,%xmm2 + vaesenc 192(%r8),%xmm2,%xmm2 + vaesenc 208(%r8),%xmm2,%xmm2 + vaesenclast 224(%r8),%xmm2,%xmm2 + vpxor (%rdi),%xmm2,%xmm2 + vmovdqu %xmm2,(%rsi) + addq $16,%rdi + addq $16,%rsi + + vpxor %xmm2,%xmm0,%xmm0 + vmovdqa -32(%rcx),%xmm1 + call GFMUL + + jmp .L256_dec_loop2 + +.L256_dec_out: + vmovdqu %xmm0,(%rdx) + .byte 0xf3,0xc3 +.cfi_endproc +.size aes256gcmsiv_dec, .-aes256gcmsiv_dec +.globl aes256gcmsiv_kdf +.hidden aes256gcmsiv_kdf +.type aes256gcmsiv_kdf,@function +.align 16 +aes256gcmsiv_kdf: +.cfi_startproc + + + + + vmovdqa (%rdx),%xmm1 + vmovdqa 0(%rdi),%xmm4 + vmovdqa and_mask(%rip),%xmm11 + vmovdqa one(%rip),%xmm8 + vpshufd $0x90,%xmm4,%xmm4 + vpand %xmm11,%xmm4,%xmm4 + vpaddd %xmm8,%xmm4,%xmm6 + vpaddd %xmm8,%xmm6,%xmm7 + vpaddd %xmm8,%xmm7,%xmm11 + vpaddd %xmm8,%xmm11,%xmm12 + vpaddd %xmm8,%xmm12,%xmm13 + + vpxor %xmm1,%xmm4,%xmm4 + vpxor %xmm1,%xmm6,%xmm6 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm1,%xmm11,%xmm11 + vpxor %xmm1,%xmm12,%xmm12 + vpxor %xmm1,%xmm13,%xmm13 + + vmovdqa 16(%rdx),%xmm1 + vaesenc %xmm1,%xmm4,%xmm4 + vaesenc %xmm1,%xmm6,%xmm6 + vaesenc %xmm1,%xmm7,%xmm7 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + + vmovdqa 32(%rdx),%xmm2 + vaesenc %xmm2,%xmm4,%xmm4 + vaesenc %xmm2,%xmm6,%xmm6 + vaesenc %xmm2,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + vaesenc %xmm2,%xmm13,%xmm13 + + vmovdqa 48(%rdx),%xmm1 + vaesenc 
%xmm1,%xmm4,%xmm4 + vaesenc %xmm1,%xmm6,%xmm6 + vaesenc %xmm1,%xmm7,%xmm7 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + + vmovdqa 64(%rdx),%xmm2 + vaesenc %xmm2,%xmm4,%xmm4 + vaesenc %xmm2,%xmm6,%xmm6 + vaesenc %xmm2,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + vaesenc %xmm2,%xmm13,%xmm13 + + vmovdqa 80(%rdx),%xmm1 + vaesenc %xmm1,%xmm4,%xmm4 + vaesenc %xmm1,%xmm6,%xmm6 + vaesenc %xmm1,%xmm7,%xmm7 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + + vmovdqa 96(%rdx),%xmm2 + vaesenc %xmm2,%xmm4,%xmm4 + vaesenc %xmm2,%xmm6,%xmm6 + vaesenc %xmm2,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + vaesenc %xmm2,%xmm13,%xmm13 + + vmovdqa 112(%rdx),%xmm1 + vaesenc %xmm1,%xmm4,%xmm4 + vaesenc %xmm1,%xmm6,%xmm6 + vaesenc %xmm1,%xmm7,%xmm7 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + + vmovdqa 128(%rdx),%xmm2 + vaesenc %xmm2,%xmm4,%xmm4 + vaesenc %xmm2,%xmm6,%xmm6 + vaesenc %xmm2,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + vaesenc %xmm2,%xmm13,%xmm13 + + vmovdqa 144(%rdx),%xmm1 + vaesenc %xmm1,%xmm4,%xmm4 + vaesenc %xmm1,%xmm6,%xmm6 + vaesenc %xmm1,%xmm7,%xmm7 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + + vmovdqa 160(%rdx),%xmm2 + vaesenc %xmm2,%xmm4,%xmm4 + vaesenc %xmm2,%xmm6,%xmm6 + vaesenc %xmm2,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + vaesenc %xmm2,%xmm13,%xmm13 + + vmovdqa 176(%rdx),%xmm1 + vaesenc %xmm1,%xmm4,%xmm4 + vaesenc %xmm1,%xmm6,%xmm6 + vaesenc %xmm1,%xmm7,%xmm7 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + + vmovdqa 192(%rdx),%xmm2 + vaesenc %xmm2,%xmm4,%xmm4 + vaesenc %xmm2,%xmm6,%xmm6 + vaesenc %xmm2,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + vaesenc %xmm2,%xmm13,%xmm13 + + vmovdqa 208(%rdx),%xmm1 + vaesenc %xmm1,%xmm4,%xmm4 + vaesenc %xmm1,%xmm6,%xmm6 + vaesenc %xmm1,%xmm7,%xmm7 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + + vmovdqa 224(%rdx),%xmm2 + vaesenclast %xmm2,%xmm4,%xmm4 + vaesenclast %xmm2,%xmm6,%xmm6 + vaesenclast %xmm2,%xmm7,%xmm7 + vaesenclast %xmm2,%xmm11,%xmm11 + vaesenclast %xmm2,%xmm12,%xmm12 + vaesenclast %xmm2,%xmm13,%xmm13 + + + vmovdqa %xmm4,0(%rsi) + vmovdqa %xmm6,16(%rsi) + vmovdqa %xmm7,32(%rsi) + vmovdqa %xmm11,48(%rsi) + vmovdqa %xmm12,64(%rsi) + vmovdqa %xmm13,80(%rsi) + .byte 0xf3,0xc3 +.cfi_endproc +.size aes256gcmsiv_kdf, .-aes256gcmsiv_kdf +#endif diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S new file mode 100644 index 0000000000..a6f5e07d9c --- /dev/null +++ b/packager/third_party/boringssl/linux-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S @@ -0,0 +1,8974 @@ +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +.text +.extern OPENSSL_ia32cap_P +.hidden OPENSSL_ia32cap_P + +chacha20_poly1305_constants: + +.align 64 +.chacha20_consts: +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +.rol8: +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +.rol16: +.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 +.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 
+.avx2_init: +.long 0,0,0,0 +.sse_inc: +.long 1,0,0,0 +.avx2_inc: +.long 2,0,0,0,2,0,0,0 +.clamp: +.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC +.quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF +.align 16 +.and_masks: +.byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + +.type poly_hash_ad_internal,@function +.align 64 +poly_hash_ad_internal: +.cfi_startproc + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r12,%r12 + cmpq $13,%r8 + jne hash_ad_loop +poly_fast_tls_ad: + + movq (%rcx),%r10 + movq 5(%rcx),%r11 + shrq $24,%r11 + movq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + .byte 0xf3,0xc3 +hash_ad_loop: + + cmpq $16,%r8 + jb hash_ad_tail + addq 0(%rcx),%r10 + adcq 8+0(%rcx),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rcx),%rcx + subq $16,%r8 + jmp hash_ad_loop +hash_ad_tail: + cmpq $0,%r8 + je 1f + + xorq %r13,%r13 + xorq %r14,%r14 + xorq %r15,%r15 + addq %r8,%rcx +hash_ad_tail_loop: + shldq $8,%r13,%r14 + shlq $8,%r13 + movzbq -1(%rcx),%r15 + xorq %r15,%r13 + decq %rcx + decq %r8 + jne 
hash_ad_tail_loop + + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + +1: + .byte 0xf3,0xc3 +.cfi_endproc +.size poly_hash_ad_internal, .-poly_hash_ad_internal + +.globl chacha20_poly1305_open +.hidden chacha20_poly1305_open +.type chacha20_poly1305_open,@function +.align 64 +chacha20_poly1305_open: +.cfi_startproc + pushq %rbp +.cfi_adjust_cfa_offset 8 + pushq %rbx +.cfi_adjust_cfa_offset 8 + pushq %r12 +.cfi_adjust_cfa_offset 8 + pushq %r13 +.cfi_adjust_cfa_offset 8 + pushq %r14 +.cfi_adjust_cfa_offset 8 + pushq %r15 +.cfi_adjust_cfa_offset 8 + + + pushq %r9 +.cfi_adjust_cfa_offset 8 + subq $288 + 32,%rsp +.cfi_adjust_cfa_offset 288 + 32 +.cfi_offset rbp, -16 +.cfi_offset rbx, -24 +.cfi_offset r12, -32 +.cfi_offset r13, -40 +.cfi_offset r14, -48 +.cfi_offset r15, -56 + leaq 32(%rsp),%rbp + andq $-32,%rbp + movq %rdx,8+32(%rbp) + movq %r8,0+32(%rbp) + movq %rdx,%rbx + + movl OPENSSL_ia32cap_P+8(%rip),%eax + andl $288,%eax + xorl $288,%eax + jz chacha20_poly1305_open_avx2 + +1: + cmpq $128,%rbx + jbe open_sse_128 + + movdqa .chacha20_consts(%rip),%xmm0 + movdqu 0(%r9),%xmm4 + movdqu 16(%r9),%xmm8 + movdqu 32(%r9),%xmm12 + movdqa %xmm12,%xmm7 + + movdqa %xmm4,48(%rbp) + movdqa %xmm8,64(%rbp) + movdqa %xmm12,96(%rbp) + movq $10,%r10 +1: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + + decq %r10 + jne 1b + + paddd .chacha20_consts(%rip),%xmm0 + paddd 48(%rbp),%xmm4 + + pand .clamp(%rip),%xmm0 + movdqa %xmm0,0(%rbp) + movdqa %xmm4,16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal +open_sse_main_loop: + cmpq $256,%rbx + jb 2f + + movdqa .chacha20_consts(%rip),%xmm0 + movdqa 48(%rbp),%xmm4 + movdqa 64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa %xmm0,%xmm3 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa 96(%rbp),%xmm15 + paddd .sse_inc(%rip),%xmm15 + movdqa %xmm15,%xmm14 + paddd .sse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd .sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .sse_inc(%rip),%xmm12 + movdqa 
%xmm12,96(%rbp) + movdqa %xmm13,112(%rbp) + movdqa %xmm14,128(%rbp) + movdqa %xmm15,144(%rbp) + + + + movq $4,%rcx + movq %rsi,%r8 +1: + movdqa %xmm8,80(%rbp) + movdqa .rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + addq 0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + + leaq 16(%r8),%r8 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movdqa .rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 80(%rbp),%xmm8 + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 +.byte 102,15,58,15,255,4 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,12 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + movdqa %xmm8,80(%rbp) + movdqa .rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor 
%xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa .rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 80(%rbp),%xmm8 +.byte 102,15,58,15,255,12 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,4 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + + decq %rcx + jge 1b + addq 0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + cmpq $-6,%rcx + jg 1b + paddd .chacha20_consts(%rip),%xmm3 + paddd 48(%rbp),%xmm7 + paddd 64(%rbp),%xmm11 + paddd 144(%rbp),%xmm15 + paddd .chacha20_consts(%rip),%xmm2 + paddd 48(%rbp),%xmm6 + paddd 64(%rbp),%xmm10 + paddd 128(%rbp),%xmm14 + paddd .chacha20_consts(%rip),%xmm1 + paddd 48(%rbp),%xmm5 + paddd 64(%rbp),%xmm9 + paddd 112(%rbp),%xmm13 + paddd .chacha20_consts(%rip),%xmm0 + paddd 48(%rbp),%xmm4 + paddd 64(%rbp),%xmm8 + paddd 96(%rbp),%xmm12 + movdqa %xmm12,80(%rbp) + movdqu 0 + 0(%rsi),%xmm12 + pxor %xmm3,%xmm12 + movdqu %xmm12,0 + 0(%rdi) + movdqu 16 + 0(%rsi),%xmm12 + pxor %xmm7,%xmm12 + movdqu %xmm12,16 + 0(%rdi) + movdqu 32 + 0(%rsi),%xmm12 + pxor %xmm11,%xmm12 + movdqu %xmm12,32 + 0(%rdi) + movdqu 48 + 0(%rsi),%xmm12 + pxor %xmm15,%xmm12 + movdqu %xmm12,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 64(%rdi) + movdqu %xmm6,16 + 64(%rdi) + movdqu %xmm10,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 128(%rdi) + movdqu %xmm5,16 + 128(%rdi) + movdqu %xmm9,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + movdqu 0 + 192(%rsi),%xmm3 + movdqu 16 + 
192(%rsi),%xmm7 + movdqu 32 + 192(%rsi),%xmm11 + movdqu 48 + 192(%rsi),%xmm15 + pxor %xmm3,%xmm0 + pxor %xmm7,%xmm4 + pxor %xmm11,%xmm8 + pxor 80(%rbp),%xmm15 + movdqu %xmm0,0 + 192(%rdi) + movdqu %xmm4,16 + 192(%rdi) + movdqu %xmm8,32 + 192(%rdi) + movdqu %xmm15,48 + 192(%rdi) + + leaq 256(%rsi),%rsi + leaq 256(%rdi),%rdi + subq $256,%rbx + jmp open_sse_main_loop +2: + + testq %rbx,%rbx + jz open_sse_finalize + cmpq $64,%rbx + ja 3f + movdqa .chacha20_consts(%rip),%xmm0 + movdqa 48(%rbp),%xmm4 + movdqa 64(%rbp),%xmm8 + movdqa 96(%rbp),%xmm12 + paddd .sse_inc(%rip),%xmm12 + movdqa %xmm12,96(%rbp) + + xorq %r8,%r8 + movq %rbx,%rcx + cmpq $16,%rcx + jb 2f +1: + addq 0(%rsi,%r8), %r10 + adcq 8+0(%rsi,%r8), %r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + subq $16,%rcx +2: + addq $16,%r8 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + + cmpq $16,%rcx + jae 1b + cmpq $160,%r8 + jne 2b + paddd .chacha20_consts(%rip),%xmm0 + paddd 48(%rbp),%xmm4 + paddd 64(%rbp),%xmm8 + paddd 96(%rbp),%xmm12 + + jmp open_sse_tail_64_dec_loop +3: + cmpq $128,%rbx + ja 3f + movdqa .chacha20_consts(%rip),%xmm0 + movdqa 48(%rbp),%xmm4 + movdqa 64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa 96(%rbp),%xmm13 + paddd .sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .sse_inc(%rip),%xmm12 + movdqa %xmm12,96(%rbp) + movdqa %xmm13,112(%rbp) + + movq %rbx,%rcx + andq $-16,%rcx + xorq %r8,%r8 +1: + addq 0(%rsi,%r8), %r10 + adcq 8+0(%rsi,%r8), %r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq 
%r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +2: + addq $16,%r8 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + + cmpq %rcx,%r8 + jb 1b + cmpq $160,%r8 + jne 2b + paddd .chacha20_consts(%rip),%xmm1 + paddd 48(%rbp),%xmm5 + paddd 64(%rbp),%xmm9 + paddd 112(%rbp),%xmm13 + paddd .chacha20_consts(%rip),%xmm0 + paddd 48(%rbp),%xmm4 + paddd 64(%rbp),%xmm8 + paddd 96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 0(%rdi) + movdqu %xmm5,16 + 0(%rdi) + movdqu %xmm9,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + + subq $64,%rbx + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + jmp open_sse_tail_64_dec_loop +3: + cmpq $192,%rbx + ja 3f + movdqa .chacha20_consts(%rip),%xmm0 + movdqa 48(%rbp),%xmm4 + movdqa 64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa 96(%rbp),%xmm14 + paddd .sse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd .sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .sse_inc(%rip),%xmm12 + movdqa %xmm12,96(%rbp) + movdqa %xmm13,112(%rbp) + movdqa %xmm14,128(%rbp) + + movq %rbx,%rcx + movq $160,%r8 + cmpq $160,%rcx + cmovgq %r8,%rcx + andq $-16,%rcx + xorq %r8,%r8 +1: + addq 0(%rsi,%r8), %r10 + adcq 8+0(%rsi,%r8), %r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq 
%r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +2: + addq $16,%r8 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + + cmpq %rcx,%r8 + jb 1b + cmpq $160,%r8 + jne 2b + cmpq $176,%rbx + jb 1f + addq 160(%rsi),%r10 + adcq 8+160(%rsi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq 
%r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + cmpq $192,%rbx + jb 1f + addq 176(%rsi),%r10 + adcq 8+176(%rsi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +1: + paddd .chacha20_consts(%rip),%xmm2 + paddd 48(%rbp),%xmm6 + paddd 64(%rbp),%xmm10 + paddd 128(%rbp),%xmm14 + paddd .chacha20_consts(%rip),%xmm1 + paddd 48(%rbp),%xmm5 + paddd 64(%rbp),%xmm9 + paddd 112(%rbp),%xmm13 + paddd .chacha20_consts(%rip),%xmm0 + paddd 48(%rbp),%xmm4 + paddd 64(%rbp),%xmm8 + paddd 96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 0(%rdi) + movdqu %xmm6,16 + 0(%rdi) + movdqu %xmm10,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 64(%rdi) + movdqu %xmm5,16 + 64(%rdi) + movdqu %xmm9,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + + subq $128,%rbx + leaq 128(%rsi),%rsi + leaq 128(%rdi),%rdi + jmp open_sse_tail_64_dec_loop +3: + + movdqa .chacha20_consts(%rip),%xmm0 + movdqa 48(%rbp),%xmm4 + movdqa 64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa %xmm0,%xmm3 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa 96(%rbp),%xmm15 + paddd .sse_inc(%rip),%xmm15 + movdqa %xmm15,%xmm14 + paddd .sse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd .sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .sse_inc(%rip),%xmm12 + movdqa %xmm12,96(%rbp) + movdqa %xmm13,112(%rbp) + movdqa %xmm14,128(%rbp) + movdqa %xmm15,144(%rbp) + + xorq %r8,%r8 +1: + addq 0(%rsi,%r8), %r10 + adcq 8+0(%rsi,%r8), %r11 + adcq $1,%r12 + movdqa %xmm11,80(%rbp) + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm4 + pxor %xmm11,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm4 + pxor %xmm11,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm5 + pxor %xmm11,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm5 + pxor %xmm11,%xmm5 +.byte 
102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm6 + pxor %xmm11,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm6 + pxor %xmm11,%xmm6 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + movdqa 80(%rbp),%xmm11 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movdqa %xmm9,80(%rbp) + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb .rol16(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $12,%xmm9 + psrld $20,%xmm7 + pxor %xmm9,%xmm7 + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb .rol8(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $7,%xmm9 + psrld $25,%xmm7 + pxor %xmm9,%xmm7 +.byte 102,15,58,15,255,4 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,12 + movdqa 80(%rbp),%xmm9 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + movdqa %xmm11,80(%rbp) + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm4 + pxor %xmm11,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm4 + pxor %xmm11,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm5 + pxor %xmm11,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm5 + pxor %xmm11,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm6 + pxor %xmm11,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm6 + pxor %xmm11,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + movdqa 80(%rbp),%xmm11 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + movdqa %xmm9,80(%rbp) + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb .rol16(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $12,%xmm9 + psrld $20,%xmm7 + pxor %xmm9,%xmm7 + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb .rol8(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $7,%xmm9 + psrld $25,%xmm7 + pxor %xmm9,%xmm7 +.byte 
102,15,58,15,255,12 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,4 + movdqa 80(%rbp),%xmm9 + + addq $16,%r8 + cmpq $160,%r8 + jb 1b + movq %rbx,%rcx + andq $-16,%rcx +1: + addq 0(%rsi,%r8), %r10 + adcq 8+0(%rsi,%r8), %r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + addq $16,%r8 + cmpq %rcx,%r8 + jb 1b + paddd .chacha20_consts(%rip),%xmm3 + paddd 48(%rbp),%xmm7 + paddd 64(%rbp),%xmm11 + paddd 144(%rbp),%xmm15 + paddd .chacha20_consts(%rip),%xmm2 + paddd 48(%rbp),%xmm6 + paddd 64(%rbp),%xmm10 + paddd 128(%rbp),%xmm14 + paddd .chacha20_consts(%rip),%xmm1 + paddd 48(%rbp),%xmm5 + paddd 64(%rbp),%xmm9 + paddd 112(%rbp),%xmm13 + paddd .chacha20_consts(%rip),%xmm0 + paddd 48(%rbp),%xmm4 + paddd 64(%rbp),%xmm8 + paddd 96(%rbp),%xmm12 + movdqa %xmm12,80(%rbp) + movdqu 0 + 0(%rsi),%xmm12 + pxor %xmm3,%xmm12 + movdqu %xmm12,0 + 0(%rdi) + movdqu 16 + 0(%rsi),%xmm12 + pxor %xmm7,%xmm12 + movdqu %xmm12,16 + 0(%rdi) + movdqu 32 + 0(%rsi),%xmm12 + pxor %xmm11,%xmm12 + movdqu %xmm12,32 + 0(%rdi) + movdqu 48 + 0(%rsi),%xmm12 + pxor %xmm15,%xmm12 + movdqu %xmm12,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 64(%rdi) + movdqu %xmm6,16 + 64(%rdi) + movdqu %xmm10,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 128(%rdi) + movdqu %xmm5,16 + 128(%rdi) + movdqu %xmm9,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + + movdqa 80(%rbp),%xmm12 + subq $192,%rbx + leaq 192(%rsi),%rsi + leaq 192(%rdi),%rdi + + +open_sse_tail_64_dec_loop: + cmpq $16,%rbx + jb 1f + subq $16,%rbx + movdqu (%rsi),%xmm3 + pxor %xmm3,%xmm0 + movdqu %xmm0,(%rdi) + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + movdqa %xmm4,%xmm0 + movdqa %xmm8,%xmm4 + movdqa %xmm12,%xmm8 + jmp open_sse_tail_64_dec_loop +1: + movdqa %xmm0,%xmm1 + + +open_sse_tail_16: + testq %rbx,%rbx + jz open_sse_finalize + + + + pxor %xmm3,%xmm3 + leaq -1(%rsi,%rbx), %rsi + movq %rbx,%r8 +2: + pslldq $1,%xmm3 + pinsrb $0,(%rsi),%xmm3 + subq $1,%rsi + subq $1,%r8 + jnz 2b + +3: +.byte 102,73,15,126,221 + pextrq $1,%xmm3,%r14 + + pxor %xmm1,%xmm3 + + +2: + pextrb $0,%xmm3,(%rdi) + psrldq $1,%xmm3 + addq $1,%rdi + subq $1,%rbx + jne 2b + + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq 
%r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + +open_sse_finalize: + addq 32(%rbp),%r10 + adcq 8+32(%rbp),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movq %r10,%r13 + movq %r11,%r14 + movq %r12,%r15 + subq $-5,%r10 + sbbq $-1,%r11 + sbbq $3,%r12 + cmovcq %r13,%r10 + cmovcq %r14,%r11 + cmovcq %r15,%r12 + + addq 0+16(%rbp),%r10 + adcq 8+16(%rbp),%r11 + + addq $288 + 32,%rsp +.cfi_adjust_cfa_offset -(288 + 32) + popq %r9 +.cfi_adjust_cfa_offset -8 + movq %r10,(%r9) + movq %r11,8(%r9) + + popq %r15 +.cfi_adjust_cfa_offset -8 + popq %r14 +.cfi_adjust_cfa_offset -8 + popq %r13 +.cfi_adjust_cfa_offset -8 + popq %r12 +.cfi_adjust_cfa_offset -8 + popq %rbx +.cfi_adjust_cfa_offset -8 + popq %rbp +.cfi_adjust_cfa_offset -8 + .byte 0xf3,0xc3 +.cfi_adjust_cfa_offset (8 * 6) + 288 + 32 + +open_sse_128: + movdqu .chacha20_consts(%rip),%xmm0 + movdqa %xmm0,%xmm1 + movdqa %xmm0,%xmm2 + movdqu 0(%r9),%xmm4 + movdqa %xmm4,%xmm5 + movdqa %xmm4,%xmm6 + movdqu 16(%r9),%xmm8 + movdqa %xmm8,%xmm9 + movdqa %xmm8,%xmm10 + movdqu 32(%r9),%xmm12 + movdqa %xmm12,%xmm13 + paddd .sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm14 + paddd .sse_inc(%rip),%xmm14 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa %xmm13,%xmm15 + movq $10,%r10 +1: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor 
%xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + + decq %r10 + jnz 1b + paddd .chacha20_consts(%rip),%xmm0 + paddd .chacha20_consts(%rip),%xmm1 + paddd .chacha20_consts(%rip),%xmm2 + paddd %xmm7,%xmm4 + paddd %xmm7,%xmm5 + paddd %xmm7,%xmm6 + paddd %xmm11,%xmm9 + paddd %xmm11,%xmm10 + paddd %xmm15,%xmm13 + paddd .sse_inc(%rip),%xmm15 + paddd %xmm15,%xmm14 + + pand .clamp(%rip),%xmm0 + movdqa %xmm0,0(%rbp) + movdqa %xmm4,16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal +1: + cmpq $16,%rbx + jb open_sse_tail_16 + subq $16,%rbx + addq 0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq $1,%r12 + + + movdqu 0(%rsi),%xmm3 + pxor %xmm3,%xmm1 + movdqu %xmm1,0(%rdi) + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movdqa %xmm5,%xmm1 + movdqa %xmm9,%xmm5 + movdqa %xmm13,%xmm9 + movdqa %xmm2,%xmm13 + movdqa %xmm6,%xmm2 + movdqa %xmm10,%xmm6 + movdqa %xmm14,%xmm10 + jmp 1b + jmp open_sse_tail_16 +.size chacha20_poly1305_open, .-chacha20_poly1305_open +.cfi_endproc + + + + +.globl chacha20_poly1305_seal +.hidden chacha20_poly1305_seal +.type chacha20_poly1305_seal,@function +.align 64 +chacha20_poly1305_seal: +.cfi_startproc + pushq %rbp +.cfi_adjust_cfa_offset 8 + pushq %rbx +.cfi_adjust_cfa_offset 8 + pushq %r12 +.cfi_adjust_cfa_offset 8 + pushq %r13 +.cfi_adjust_cfa_offset 8 + pushq %r14 +.cfi_adjust_cfa_offset 8 + pushq %r15 +.cfi_adjust_cfa_offset 8 + + + pushq %r9 +.cfi_adjust_cfa_offset 8 + subq $288 + 32,%rsp +.cfi_adjust_cfa_offset 288 + 32 +.cfi_offset rbp, -16 +.cfi_offset rbx, -24 +.cfi_offset r12, -32 +.cfi_offset r13, -40 +.cfi_offset r14, -48 +.cfi_offset r15, -56 + leaq 32(%rsp),%rbp + andq $-32,%rbp + movq 56(%r9),%rbx + addq %rdx,%rbx + movq %rbx,8+32(%rbp) + movq %r8,0+32(%rbp) + movq %rdx,%rbx + + movl OPENSSL_ia32cap_P+8(%rip),%eax + andl 
$288,%eax + xorl $288,%eax + jz chacha20_poly1305_seal_avx2 + + cmpq $128,%rbx + jbe seal_sse_128 + + movdqa .chacha20_consts(%rip),%xmm0 + movdqu 0(%r9),%xmm4 + movdqu 16(%r9),%xmm8 + movdqu 32(%r9),%xmm12 + movdqa %xmm0,%xmm1 + movdqa %xmm0,%xmm2 + movdqa %xmm0,%xmm3 + movdqa %xmm4,%xmm5 + movdqa %xmm4,%xmm6 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm9 + movdqa %xmm8,%xmm10 + movdqa %xmm8,%xmm11 + movdqa %xmm12,%xmm15 + paddd .sse_inc(%rip),%xmm12 + movdqa %xmm12,%xmm14 + paddd .sse_inc(%rip),%xmm12 + movdqa %xmm12,%xmm13 + paddd .sse_inc(%rip),%xmm12 + + movdqa %xmm4,48(%rbp) + movdqa %xmm8,64(%rbp) + movdqa %xmm12,96(%rbp) + movdqa %xmm13,112(%rbp) + movdqa %xmm14,128(%rbp) + movdqa %xmm15,144(%rbp) + movq $10,%r10 +1: + movdqa %xmm8,80(%rbp) + movdqa .rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa .rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 80(%rbp),%xmm8 +.byte 102,15,58,15,255,4 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,12 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + movdqa %xmm8,80(%rbp) + movdqa .rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld 
$20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa .rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 80(%rbp),%xmm8 +.byte 102,15,58,15,255,12 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,4 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + + decq %r10 + jnz 1b + paddd .chacha20_consts(%rip),%xmm3 + paddd 48(%rbp),%xmm7 + paddd 64(%rbp),%xmm11 + paddd 144(%rbp),%xmm15 + paddd .chacha20_consts(%rip),%xmm2 + paddd 48(%rbp),%xmm6 + paddd 64(%rbp),%xmm10 + paddd 128(%rbp),%xmm14 + paddd .chacha20_consts(%rip),%xmm1 + paddd 48(%rbp),%xmm5 + paddd 64(%rbp),%xmm9 + paddd 112(%rbp),%xmm13 + paddd .chacha20_consts(%rip),%xmm0 + paddd 48(%rbp),%xmm4 + paddd 64(%rbp),%xmm8 + paddd 96(%rbp),%xmm12 + + + pand .clamp(%rip),%xmm3 + movdqa %xmm3,0(%rbp) + movdqa %xmm7,16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 0(%rdi) + movdqu %xmm6,16 + 0(%rdi) + movdqu %xmm10,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 64(%rdi) + movdqu %xmm5,16 + 64(%rdi) + movdqu %xmm9,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + + cmpq $192,%rbx + ja 1f + movq $128,%rcx + subq $128,%rbx + leaq 128(%rsi),%rsi + jmp seal_sse_128_seal_hash +1: + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm0 + pxor %xmm7,%xmm4 + pxor %xmm11,%xmm8 + pxor %xmm12,%xmm15 + movdqu %xmm0,0 + 128(%rdi) + movdqu %xmm4,16 + 128(%rdi) + movdqu %xmm8,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + + movq $192,%rcx + subq $192,%rbx + leaq 192(%rsi),%rsi + movq $2,%rcx + movq $8,%r8 + cmpq $64,%rbx + jbe seal_sse_tail_64 + cmpq $128,%rbx + jbe seal_sse_tail_128 + cmpq $192,%rbx + jbe seal_sse_tail_192 + +1: + movdqa .chacha20_consts(%rip),%xmm0 + movdqa 48(%rbp),%xmm4 + movdqa 64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa %xmm0,%xmm3 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa 96(%rbp),%xmm15 + paddd .sse_inc(%rip),%xmm15 + movdqa %xmm15,%xmm14 + paddd .sse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd .sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd 
.sse_inc(%rip),%xmm12 + movdqa %xmm12,96(%rbp) + movdqa %xmm13,112(%rbp) + movdqa %xmm14,128(%rbp) + movdqa %xmm15,144(%rbp) + +2: + movdqa %xmm8,80(%rbp) + movdqa .rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movdqa .rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 80(%rbp),%xmm8 + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 +.byte 102,15,58,15,255,4 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,12 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + movdqa %xmm8,80(%rbp) + movdqa .rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 
+ psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa .rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 80(%rbp),%xmm8 +.byte 102,15,58,15,255,12 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,4 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + + leaq 16(%rdi),%rdi + decq %r8 + jge 2b + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi + decq %rcx + jg 2b + paddd .chacha20_consts(%rip),%xmm3 + paddd 48(%rbp),%xmm7 + paddd 64(%rbp),%xmm11 + paddd 144(%rbp),%xmm15 + paddd .chacha20_consts(%rip),%xmm2 + paddd 48(%rbp),%xmm6 + paddd 64(%rbp),%xmm10 + paddd 128(%rbp),%xmm14 + paddd .chacha20_consts(%rip),%xmm1 + paddd 48(%rbp),%xmm5 + paddd 64(%rbp),%xmm9 + paddd 112(%rbp),%xmm13 + paddd .chacha20_consts(%rip),%xmm0 + paddd 48(%rbp),%xmm4 + paddd 64(%rbp),%xmm8 + paddd 96(%rbp),%xmm12 + + movdqa %xmm14,80(%rbp) + movdqa %xmm14,80(%rbp) + movdqu 0 + 0(%rsi),%xmm14 + pxor %xmm3,%xmm14 + movdqu %xmm14,0 + 0(%rdi) + movdqu 16 + 0(%rsi),%xmm14 + pxor %xmm7,%xmm14 + movdqu %xmm14,16 + 0(%rdi) + movdqu 32 + 0(%rsi),%xmm14 + pxor %xmm11,%xmm14 + movdqu %xmm14,32 + 0(%rdi) + movdqu 48 + 0(%rsi),%xmm14 + pxor %xmm15,%xmm14 + movdqu %xmm14,48 + 0(%rdi) + + movdqa 80(%rbp),%xmm14 + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 64(%rdi) + movdqu %xmm6,16 + 64(%rdi) + movdqu %xmm10,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 128(%rdi) + movdqu %xmm5,16 + 128(%rdi) + movdqu %xmm9,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + 
+ cmpq $256,%rbx + ja 3f + + movq $192,%rcx + subq $192,%rbx + leaq 192(%rsi),%rsi + jmp seal_sse_128_seal_hash +3: + movdqu 0 + 192(%rsi),%xmm3 + movdqu 16 + 192(%rsi),%xmm7 + movdqu 32 + 192(%rsi),%xmm11 + movdqu 48 + 192(%rsi),%xmm15 + pxor %xmm3,%xmm0 + pxor %xmm7,%xmm4 + pxor %xmm11,%xmm8 + pxor %xmm12,%xmm15 + movdqu %xmm0,0 + 192(%rdi) + movdqu %xmm4,16 + 192(%rdi) + movdqu %xmm8,32 + 192(%rdi) + movdqu %xmm15,48 + 192(%rdi) + + leaq 256(%rsi),%rsi + subq $256,%rbx + movq $6,%rcx + movq $4,%r8 + cmpq $192,%rbx + jg 1b + movq %rbx,%rcx + testq %rbx,%rbx + je seal_sse_128_seal_hash + movq $6,%rcx + cmpq $64,%rbx + jg 3f + +seal_sse_tail_64: + movdqa .chacha20_consts(%rip),%xmm0 + movdqa 48(%rbp),%xmm4 + movdqa 64(%rbp),%xmm8 + movdqa 96(%rbp),%xmm12 + paddd .sse_inc(%rip),%xmm12 + movdqa %xmm12,96(%rbp) + +1: + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +2: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi + decq %rcx + jg 1b + decq %r8 + jge 2b + paddd .chacha20_consts(%rip),%xmm0 + paddd 48(%rbp),%xmm4 + paddd 64(%rbp),%xmm8 + paddd 96(%rbp),%xmm12 + + jmp seal_sse_128_seal +3: + cmpq $128,%rbx + jg 3f + +seal_sse_tail_128: + movdqa .chacha20_consts(%rip),%xmm0 + movdqa 48(%rbp),%xmm4 + movdqa 64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa 
96(%rbp),%xmm13 + paddd .sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .sse_inc(%rip),%xmm12 + movdqa %xmm12,96(%rbp) + movdqa %xmm13,112(%rbp) + +1: + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +2: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + + leaq 16(%rdi),%rdi + decq %rcx + jg 1b + decq %r8 + jge 2b + paddd .chacha20_consts(%rip),%xmm1 + paddd 48(%rbp),%xmm5 + paddd 64(%rbp),%xmm9 + paddd 112(%rbp),%xmm13 + paddd 
.chacha20_consts(%rip),%xmm0 + paddd 48(%rbp),%xmm4 + paddd 64(%rbp),%xmm8 + paddd 96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 0(%rdi) + movdqu %xmm5,16 + 0(%rdi) + movdqu %xmm9,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + + movq $64,%rcx + subq $64,%rbx + leaq 64(%rsi),%rsi + jmp seal_sse_128_seal_hash +3: + +seal_sse_tail_192: + movdqa .chacha20_consts(%rip),%xmm0 + movdqa 48(%rbp),%xmm4 + movdqa 64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa 96(%rbp),%xmm14 + paddd .sse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd .sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .sse_inc(%rip),%xmm12 + movdqa %xmm12,96(%rbp) + movdqa %xmm13,112(%rbp) + movdqa %xmm14,128(%rbp) + +1: + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +2: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq 
$3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + + leaq 16(%rdi),%rdi + decq %rcx + jg 1b + decq %r8 + jge 2b + paddd .chacha20_consts(%rip),%xmm2 + paddd 48(%rbp),%xmm6 + paddd 64(%rbp),%xmm10 + paddd 128(%rbp),%xmm14 + paddd .chacha20_consts(%rip),%xmm1 + paddd 48(%rbp),%xmm5 + paddd 64(%rbp),%xmm9 + paddd 112(%rbp),%xmm13 + paddd .chacha20_consts(%rip),%xmm0 + paddd 48(%rbp),%xmm4 + paddd 64(%rbp),%xmm8 + paddd 96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 0(%rdi) + movdqu %xmm6,16 + 0(%rdi) + movdqu %xmm10,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 64(%rdi) + movdqu %xmm5,16 + 64(%rdi) + movdqu %xmm9,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + + movq $128,%rcx + subq $128,%rbx + leaq 128(%rsi),%rsi + +seal_sse_128_seal_hash: + cmpq $16,%rcx + jb seal_sse_128_seal + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + subq $16,%rcx + leaq 16(%rdi),%rdi + jmp seal_sse_128_seal_hash + +seal_sse_128_seal: + cmpq $16,%rbx + jb seal_sse_tail_16 + subq $16,%rbx + + movdqu 0(%rsi),%xmm3 + pxor 
%xmm3,%xmm0 + movdqu %xmm0,0(%rdi) + + addq 0(%rdi),%r10 + adcq 8(%rdi),%r11 + adcq $1,%r12 + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movdqa %xmm4,%xmm0 + movdqa %xmm8,%xmm4 + movdqa %xmm12,%xmm8 + movdqa %xmm1,%xmm12 + movdqa %xmm5,%xmm1 + movdqa %xmm9,%xmm5 + movdqa %xmm13,%xmm9 + jmp seal_sse_128_seal + +seal_sse_tail_16: + testq %rbx,%rbx + jz process_blocks_of_extra_in + + movq %rbx,%r8 + movq %rbx,%rcx + leaq -1(%rsi,%rbx), %rsi + pxor %xmm15,%xmm15 +1: + pslldq $1,%xmm15 + pinsrb $0,(%rsi),%xmm15 + leaq -1(%rsi),%rsi + decq %rcx + jne 1b + + + pxor %xmm0,%xmm15 + + + movq %rbx,%rcx + movdqu %xmm15,%xmm0 +2: + pextrb $0,%xmm0,(%rdi) + psrldq $1,%xmm0 + addq $1,%rdi + subq $1,%rcx + jnz 2b + + + + + + + + + movq 288+32(%rsp),%r9 + movq 56(%r9),%r14 + movq 48(%r9),%r13 + testq %r14,%r14 + jz process_partial_block + + movq $16,%r15 + subq %rbx,%r15 + cmpq %r15,%r14 + + jge load_extra_in + movq %r14,%r15 + +load_extra_in: + + + leaq -1(%r13,%r15), %rsi + + + addq %r15,%r13 + subq %r15,%r14 + movq %r13,48(%r9) + movq %r14,56(%r9) + + + + addq %r15,%r8 + + + pxor %xmm11,%xmm11 +3: + pslldq $1,%xmm11 + pinsrb $0,(%rsi),%xmm11 + leaq -1(%rsi),%rsi + subq $1,%r15 + jnz 3b + + + + + movq %rbx,%r15 + +4: + pslldq $1,%xmm11 + subq $1,%r15 + jnz 4b + + + + + leaq .and_masks(%rip),%r15 + shlq $4,%rbx + pand -16(%r15,%rbx), %xmm15 + + + por %xmm11,%xmm15 + + + +.byte 102,77,15,126,253 + pextrq $1,%xmm15,%r14 + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + +process_blocks_of_extra_in: + + movq 288+32(%rsp),%r9 + movq 48(%r9),%rsi + movq 56(%r9),%r8 + movq %r8,%rcx + shrq $4,%r8 + +5: + jz process_extra_in_trailer + addq 0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq 
%r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rsi),%rsi + subq $1,%r8 + jmp 5b + +process_extra_in_trailer: + andq $15,%rcx + movq %rcx,%rbx + jz do_length_block + leaq -1(%rsi,%rcx), %rsi + +6: + pslldq $1,%xmm15 + pinsrb $0,(%rsi),%xmm15 + leaq -1(%rsi),%rsi + subq $1,%rcx + jnz 6b + +process_partial_block: + + leaq .and_masks(%rip),%r15 + shlq $4,%rbx + pand -16(%r15,%rbx), %xmm15 +.byte 102,77,15,126,253 + pextrq $1,%xmm15,%r14 + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + +do_length_block: + addq 32(%rbp),%r10 + adcq 8+32(%rbp),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movq %r10,%r13 + movq %r11,%r14 + movq %r12,%r15 + subq $-5,%r10 + sbbq $-1,%r11 + sbbq $3,%r12 + cmovcq %r13,%r10 + cmovcq %r14,%r11 + cmovcq %r15,%r12 + + addq 0+16(%rbp),%r10 + adcq 8+16(%rbp),%r11 + + addq $288 + 32,%rsp +.cfi_adjust_cfa_offset -(288 + 32) + popq %r9 +.cfi_adjust_cfa_offset -8 + movq %r10,0(%r9) + movq %r11,8(%r9) + + popq %r15 +.cfi_adjust_cfa_offset -8 + popq %r14 +.cfi_adjust_cfa_offset -8 + popq %r13 +.cfi_adjust_cfa_offset -8 + popq %r12 +.cfi_adjust_cfa_offset -8 + popq %rbx +.cfi_adjust_cfa_offset -8 + popq %rbp +.cfi_adjust_cfa_offset -8 + .byte 0xf3,0xc3 +.cfi_adjust_cfa_offset (8 * 6) + 288 + 32 + +seal_sse_128: + movdqu .chacha20_consts(%rip),%xmm0 + movdqa %xmm0,%xmm1 + movdqa %xmm0,%xmm2 + movdqu 0(%r9),%xmm4 + movdqa %xmm4,%xmm5 + movdqa %xmm4,%xmm6 + movdqu 16(%r9),%xmm8 + movdqa %xmm8,%xmm9 + movdqa %xmm8,%xmm10 + movdqu 32(%r9),%xmm14 + movdqa %xmm14,%xmm12 + paddd .sse_inc(%rip),%xmm12 + movdqa %xmm12,%xmm13 + paddd .sse_inc(%rip),%xmm13 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa %xmm12,%xmm15 + movq $10,%r10 +1: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor 
%xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + + decq %r10 + jnz 1b + paddd .chacha20_consts(%rip),%xmm0 + paddd .chacha20_consts(%rip),%xmm1 + paddd .chacha20_consts(%rip),%xmm2 + paddd %xmm7,%xmm4 + paddd %xmm7,%xmm5 + paddd %xmm7,%xmm6 + paddd %xmm11,%xmm8 + paddd %xmm11,%xmm9 + paddd %xmm15,%xmm12 + paddd .sse_inc(%rip),%xmm15 + paddd %xmm15,%xmm13 + + pand .clamp(%rip),%xmm2 + movdqa %xmm2,0(%rbp) + movdqa %xmm6,16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal + jmp seal_sse_128_seal +.size chacha20_poly1305_seal, .-chacha20_poly1305_seal + + +.type chacha20_poly1305_open_avx2,@function +.align 64 +chacha20_poly1305_open_avx2: + vzeroupper + vmovdqa .chacha20_consts(%rip),%ymm0 + vbroadcasti128 0(%r9),%ymm4 + vbroadcasti128 16(%r9),%ymm8 + vbroadcasti128 32(%r9),%ymm12 + vpaddd .avx2_init(%rip),%ymm12,%ymm12 + cmpq $192,%rbx + jbe open_avx2_192 + cmpq $320,%rbx + jbe open_avx2_320 + + vmovdqa %ymm4,64(%rbp) + vmovdqa %ymm8,96(%rbp) + vmovdqa %ymm12,160(%rbp) + movq $10,%r10 +1: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr 
$8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + + decq %r10 + jne 1b + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 160(%rbp),%ymm12,%ymm12 + + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand .clamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + + movq %r8,%r8 + call poly_hash_ad_internal + xorq %rcx,%rcx + +1: + addq 0(%rsi,%rcx), %r10 + adcq 8+0(%rsi,%rcx), %r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + addq $16,%rcx + cmpq $64,%rcx + jne 1b + + vpxor 0(%rsi),%ymm0,%ymm0 + vpxor 32(%rsi),%ymm4,%ymm4 + vmovdqu %ymm0,0(%rdi) + vmovdqu %ymm4,32(%rdi) + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + subq $64,%rbx +1: + + cmpq $512,%rbx + jb 3f + vmovdqa .chacha20_consts(%rip),%ymm0 + vmovdqa 64(%rbp),%ymm4 + vmovdqa 96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa .avx2_inc(%rip),%ymm12 + vpaddd 160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,256(%rbp) + vmovdqa %ymm14,224(%rbp) + vmovdqa %ymm13,192(%rbp) + vmovdqa %ymm12,160(%rbp) + + xorq %rcx,%rcx +2: + addq 0*8(%rsi,%rcx), %r10 + adcq 8+0*8(%rsi,%rcx), %r11 + adcq $1,%r12 + vmovdqa %ymm8,128(%rbp) + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + 
vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + addq %rax,%r15 + adcq %rdx,%r9 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + addq 2*8(%rsi,%rcx), %r10 + adcq 8+2*8(%rsi,%rcx), %r11 + adcq $1,%r12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vmovdqa %ymm8,128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,128(%rbp) + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + addq %rax,%r15 + adcq %rdx,%r9 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + addq 4*8(%rsi,%rcx), %r10 + adcq 8+4*8(%rsi,%rcx), %r11 + adcq $1,%r12 + + leaq 48(%rcx),%rcx + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld 
$32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + addq %rax,%r15 + adcq %rdx,%r9 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + cmpq $60*8,%rcx + jne 2b + vpaddd .chacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 64(%rbp),%ymm7,%ymm7 + vpaddd 96(%rbp),%ymm11,%ymm11 + vpaddd 256(%rbp),%ymm15,%ymm15 + vpaddd .chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 64(%rbp),%ymm6,%ymm6 + vpaddd 96(%rbp),%ymm10,%ymm10 + vpaddd 224(%rbp),%ymm14,%ymm14 + vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 64(%rbp),%ymm5,%ymm5 + vpaddd 96(%rbp),%ymm9,%ymm9 + vpaddd 192(%rbp),%ymm13,%ymm13 + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,128(%rbp) + addq 60*8(%rsi),%r10 + adcq 8+60*8(%rsi),%r11 + adcq $1,%r12 + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 128(%rbp),%ymm0 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + 
movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu %ymm10,96+128(%rdi) + addq 60*8+16(%rsi),%r10 + adcq 8+60*8+16(%rsi),%r11 + adcq $1,%r12 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm4 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm8 + vpxor 0+384(%rsi),%ymm3,%ymm3 + vpxor 32+384(%rsi),%ymm0,%ymm0 + vpxor 64+384(%rsi),%ymm4,%ymm4 + vpxor 96+384(%rsi),%ymm8,%ymm8 + vmovdqu %ymm3,0+384(%rdi) + vmovdqu %ymm0,32+384(%rdi) + vmovdqu %ymm4,64+384(%rdi) + vmovdqu %ymm8,96+384(%rdi) + + leaq 512(%rsi),%rsi + leaq 512(%rdi),%rdi + subq $512,%rbx + jmp 1b +3: + testq %rbx,%rbx + vzeroupper + je open_sse_finalize +3: + cmpq $128,%rbx + ja 3f + vmovdqa .chacha20_consts(%rip),%ymm0 + vmovdqa 64(%rbp),%ymm4 + vmovdqa 96(%rbp),%ymm8 + vmovdqa .avx2_inc(%rip),%ymm12 + vpaddd 160(%rbp),%ymm12,%ymm12 + vmovdqa %ymm12,160(%rbp) + + xorq %r8,%r8 + movq %rbx,%rcx + andq $-16,%rcx + testq %rcx,%rcx + je 2f +1: + addq 0*8(%rsi,%r8), %r10 + adcq 8+0*8(%rsi,%r8), %r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +2: + addq $16,%r8 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb 
.rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + + cmpq %rcx,%r8 + jb 1b + cmpq $160,%r8 + jne 2b + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + jmp open_avx2_tail_loop +3: + cmpq $256,%rbx + ja 3f + vmovdqa .chacha20_consts(%rip),%ymm0 + vmovdqa 64(%rbp),%ymm4 + vmovdqa 96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa .avx2_inc(%rip),%ymm12 + vpaddd 160(%rbp),%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,160(%rbp) + vmovdqa %ymm13,192(%rbp) + + movq %rbx,128(%rbp) + movq %rbx,%rcx + subq $128,%rcx + shrq $4,%rcx + movq $10,%r8 + cmpq $10,%rcx + cmovgq %r8,%rcx + movq %rsi,%rbx + xorq %r8,%r8 +1: + addq 0(%rbx),%r10 + adcq 8+0(%rbx),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rbx),%rbx +2: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + + incq %r8 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 
+ vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + cmpq %rcx,%r8 + jb 1b + cmpq $10,%r8 + jne 2b + movq %rbx,%r8 + subq %rsi,%rbx + movq %rbx,%rcx + movq 128(%rbp),%rbx +1: + addq $16,%rcx + cmpq %rbx,%rcx + jg 1f + addq 0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + jmp 1b +1: + vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 64(%rbp),%ymm5,%ymm5 + vpaddd 96(%rbp),%ymm9,%ymm9 + vpaddd 192(%rbp),%ymm13,%ymm13 + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm1,%ymm1 + vpxor 64+0(%rsi),%ymm5,%ymm5 + vpxor 96+0(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm1,32+0(%rdi) + vmovdqu %ymm5,64+0(%rdi) + vmovdqu %ymm9,96+0(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + leaq 128(%rsi),%rsi + leaq 128(%rdi),%rdi + subq $128,%rbx + jmp open_avx2_tail_loop +3: + cmpq $384,%rbx + ja 3f + vmovdqa .chacha20_consts(%rip),%ymm0 + vmovdqa 64(%rbp),%ymm4 + vmovdqa 96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa .avx2_inc(%rip),%ymm12 + vpaddd 160(%rbp),%ymm12,%ymm14 + vpaddd 
%ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,160(%rbp) + vmovdqa %ymm13,192(%rbp) + vmovdqa %ymm14,224(%rbp) + + movq %rbx,128(%rbp) + movq %rbx,%rcx + subq $256,%rcx + shrq $4,%rcx + addq $6,%rcx + movq $10,%r8 + cmpq $10,%rcx + cmovgq %r8,%rcx + movq %rsi,%rbx + xorq %r8,%r8 +1: + addq 0(%rbx),%r10 + adcq 8+0(%rbx),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rbx),%rbx +2: + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + addq 0(%rbx),%r10 + adcq 8+0(%rbx),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rbx),%rbx + incq %r8 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + 
vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + + cmpq %rcx,%r8 + jb 1b + cmpq $10,%r8 + jne 2b + movq %rbx,%r8 + subq %rsi,%rbx + movq %rbx,%rcx + movq 128(%rbp),%rbx +1: + addq $16,%rcx + cmpq %rbx,%rcx + jg 1f + addq 0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + jmp 1b +1: + vpaddd .chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 64(%rbp),%ymm6,%ymm6 + vpaddd 96(%rbp),%ymm10,%ymm10 + vpaddd 224(%rbp),%ymm14,%ymm14 + vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 64(%rbp),%ymm5,%ymm5 + vpaddd 96(%rbp),%ymm9,%ymm9 + vpaddd 192(%rbp),%ymm13,%ymm13 + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm2,%ymm2 + vpxor 64+0(%rsi),%ymm6,%ymm6 + vpxor 96+0(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm2,32+0(%rdi) + vmovdqu %ymm6,64+0(%rdi) + vmovdqu %ymm10,96+0(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm1,%ymm1 + vpxor 64+128(%rsi),%ymm5,%ymm5 + vpxor 96+128(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm1,32+128(%rdi) + vmovdqu %ymm5,64+128(%rdi) + vmovdqu %ymm9,96+128(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 
+ + leaq 256(%rsi),%rsi + leaq 256(%rdi),%rdi + subq $256,%rbx + jmp open_avx2_tail_loop +3: + vmovdqa .chacha20_consts(%rip),%ymm0 + vmovdqa 64(%rbp),%ymm4 + vmovdqa 96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa .avx2_inc(%rip),%ymm12 + vpaddd 160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,256(%rbp) + vmovdqa %ymm14,224(%rbp) + vmovdqa %ymm13,192(%rbp) + vmovdqa %ymm12,160(%rbp) + + xorq %rcx,%rcx + movq %rsi,%r8 +1: + addq 0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 +2: + vmovdqa %ymm8,128(%rbp) + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + addq 0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + 
vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,128(%rbp) + addq 16(%r8),%r10 + adcq 8+16(%r8),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%r8),%r8 + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr 
$12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + incq %rcx + cmpq $4,%rcx + jl 1b + cmpq $10,%rcx + jne 2b + movq %rbx,%rcx + subq $384,%rcx + andq $-16,%rcx +1: + testq %rcx,%rcx + je 1f + addq 0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + subq $16,%rcx + jmp 1b +1: + vpaddd .chacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 64(%rbp),%ymm7,%ymm7 + vpaddd 96(%rbp),%ymm11,%ymm11 + vpaddd 256(%rbp),%ymm15,%ymm15 + vpaddd .chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 64(%rbp),%ymm6,%ymm6 + vpaddd 96(%rbp),%ymm10,%ymm10 + vpaddd 224(%rbp),%ymm14,%ymm14 + vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 64(%rbp),%ymm5,%ymm5 + vpaddd 96(%rbp),%ymm9,%ymm9 + vpaddd 192(%rbp),%ymm13,%ymm13 + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,128(%rbp) + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 128(%rbp),%ymm0 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu %ymm10,96+128(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + leaq 384(%rsi),%rsi + leaq 384(%rdi),%rdi + subq $384,%rbx +open_avx2_tail_loop: + cmpq $32,%rbx + jb open_avx2_tail + subq $32,%rbx + vpxor (%rsi),%ymm0,%ymm0 + vmovdqu %ymm0,(%rdi) + leaq 32(%rsi),%rsi + leaq 32(%rdi),%rdi + vmovdqa %ymm4,%ymm0 + vmovdqa %ymm8,%ymm4 + vmovdqa %ymm12,%ymm8 + jmp open_avx2_tail_loop +open_avx2_tail: + cmpq $16,%rbx + vmovdqa %xmm0,%xmm1 + jb 1f + subq $16,%rbx + + vpxor (%rsi),%xmm0,%xmm1 + vmovdqu %xmm1,(%rdi) + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + 
vperm2i128 $0x11,%ymm0,%ymm0,%ymm0 + vmovdqa %xmm0,%xmm1 +1: + vzeroupper + jmp open_sse_tail_16 + +open_avx2_192: + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vpaddd .avx2_inc(%rip),%ymm12,%ymm13 + vmovdqa %ymm12,%ymm11 + vmovdqa %ymm13,%ymm15 + movq $10,%r10 +1: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + + decq %r10 + jne 1b + vpaddd %ymm2,%ymm0,%ymm0 + vpaddd %ymm2,%ymm1,%ymm1 + vpaddd %ymm6,%ymm4,%ymm4 + vpaddd %ymm6,%ymm5,%ymm5 + vpaddd %ymm10,%ymm8,%ymm8 + vpaddd %ymm10,%ymm9,%ymm9 + vpaddd %ymm11,%ymm12,%ymm12 + vpaddd %ymm15,%ymm13,%ymm13 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand .clamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 +open_avx2_short: + movq %r8,%r8 + call poly_hash_ad_internal +open_avx2_hash_and_xor_loop: + cmpq $32,%rbx + jb open_avx2_short_tail_32 + subq $32,%rbx + addq 0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq 
$0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + addq 16(%rsi),%r10 + adcq 8+16(%rsi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + vpxor (%rsi),%ymm0,%ymm0 + vmovdqu %ymm0,(%rdi) + leaq 32(%rsi),%rsi + leaq 32(%rdi),%rdi + + vmovdqa %ymm4,%ymm0 + vmovdqa %ymm8,%ymm4 + vmovdqa %ymm12,%ymm8 + vmovdqa %ymm1,%ymm12 + vmovdqa %ymm5,%ymm1 + vmovdqa %ymm9,%ymm5 + vmovdqa %ymm13,%ymm9 + vmovdqa %ymm2,%ymm13 + vmovdqa %ymm6,%ymm2 + jmp open_avx2_hash_and_xor_loop +open_avx2_short_tail_32: + cmpq $16,%rbx + vmovdqa %xmm0,%xmm1 + jb 1f + subq $16,%rbx + addq 0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + vpxor (%rsi),%xmm0,%xmm3 + vmovdqu %xmm3,(%rdi) + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + vextracti128 $1,%ymm0,%xmm1 +1: + vzeroupper + jmp open_sse_tail_16 + +open_avx2_320: + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vpaddd .avx2_inc(%rip),%ymm12,%ymm13 + vpaddd .avx2_inc(%rip),%ymm13,%ymm14 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa %ymm12,160(%rbp) + vmovdqa %ymm13,192(%rbp) + vmovdqa %ymm14,224(%rbp) + movq $10,%r10 +1: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + 
vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + decq %r10 + jne 1b + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd .chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd %ymm7,%ymm4,%ymm4 + vpaddd %ymm7,%ymm5,%ymm5 + vpaddd %ymm7,%ymm6,%ymm6 + vpaddd %ymm11,%ymm8,%ymm8 + vpaddd %ymm11,%ymm9,%ymm9 + vpaddd %ymm11,%ymm10,%ymm10 + vpaddd 160(%rbp),%ymm12,%ymm12 + vpaddd 192(%rbp),%ymm13,%ymm13 + vpaddd 224(%rbp),%ymm14,%ymm14 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand .clamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm9 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm13 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm6 + jmp open_avx2_short +.size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2 + + +.type chacha20_poly1305_seal_avx2,@function +.align 64 +chacha20_poly1305_seal_avx2: + vzeroupper + vmovdqa 
.chacha20_consts(%rip),%ymm0 + vbroadcasti128 0(%r9),%ymm4 + vbroadcasti128 16(%r9),%ymm8 + vbroadcasti128 32(%r9),%ymm12 + vpaddd .avx2_init(%rip),%ymm12,%ymm12 + cmpq $192,%rbx + jbe seal_avx2_192 + cmpq $320,%rbx + jbe seal_avx2_320 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm4,64(%rbp) + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm8,%ymm11 + vmovdqa %ymm8,96(%rbp) + vmovdqa %ymm12,%ymm15 + vpaddd .avx2_inc(%rip),%ymm15,%ymm14 + vpaddd .avx2_inc(%rip),%ymm14,%ymm13 + vpaddd .avx2_inc(%rip),%ymm13,%ymm12 + vmovdqa %ymm12,160(%rbp) + vmovdqa %ymm13,192(%rbp) + vmovdqa %ymm14,224(%rbp) + vmovdqa %ymm15,256(%rbp) + movq $10,%r10 +1: + vmovdqa %ymm8,128(%rbp) + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,128(%rbp) + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + 
vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + decq %r10 + jnz 1b + vpaddd .chacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 64(%rbp),%ymm7,%ymm7 + vpaddd 96(%rbp),%ymm11,%ymm11 + vpaddd 256(%rbp),%ymm15,%ymm15 + vpaddd .chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 64(%rbp),%ymm6,%ymm6 + vpaddd 96(%rbp),%ymm10,%ymm10 + vpaddd 224(%rbp),%ymm14,%ymm14 + vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 64(%rbp),%ymm5,%ymm5 + vpaddd 96(%rbp),%ymm9,%ymm9 + vpaddd 192(%rbp),%ymm13,%ymm13 + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 160(%rbp),%ymm12,%ymm12 + + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vperm2i128 $0x02,%ymm3,%ymm7,%ymm15 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm3 + vpand .clamp(%rip),%ymm15,%ymm15 + vmovdqa %ymm15,0(%rbp) + movq %r8,%r8 + call poly_hash_ad_internal + + vpxor 0(%rsi),%ymm3,%ymm3 + vpxor 32(%rsi),%ymm11,%ymm11 + vmovdqu %ymm3,0(%rdi) + vmovdqu %ymm11,32(%rdi) + vperm2i128 $0x02,%ymm2,%ymm6,%ymm15 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+64(%rsi),%ymm15,%ymm15 + vpxor 32+64(%rsi),%ymm2,%ymm2 + vpxor 64+64(%rsi),%ymm6,%ymm6 + vpxor 96+64(%rsi),%ymm10,%ymm10 + vmovdqu %ymm15,0+64(%rdi) + vmovdqu %ymm2,32+64(%rdi) + vmovdqu %ymm6,64+64(%rdi) + vmovdqu %ymm10,96+64(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm15 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+192(%rsi),%ymm15,%ymm15 + vpxor 32+192(%rsi),%ymm1,%ymm1 + vpxor 64+192(%rsi),%ymm5,%ymm5 + vpxor 96+192(%rsi),%ymm9,%ymm9 + vmovdqu 
%ymm15,0+192(%rdi) + vmovdqu %ymm1,32+192(%rdi) + vmovdqu %ymm5,64+192(%rdi) + vmovdqu %ymm9,96+192(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm15 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm15,%ymm8 + + leaq 320(%rsi),%rsi + subq $320,%rbx + movq $320,%rcx + cmpq $128,%rbx + jbe seal_avx2_hash + vpxor 0(%rsi),%ymm0,%ymm0 + vpxor 32(%rsi),%ymm4,%ymm4 + vpxor 64(%rsi),%ymm8,%ymm8 + vpxor 96(%rsi),%ymm12,%ymm12 + vmovdqu %ymm0,320(%rdi) + vmovdqu %ymm4,352(%rdi) + vmovdqu %ymm8,384(%rdi) + vmovdqu %ymm12,416(%rdi) + leaq 128(%rsi),%rsi + subq $128,%rbx + movq $8,%rcx + movq $2,%r8 + cmpq $128,%rbx + jbe seal_avx2_tail_128 + cmpq $256,%rbx + jbe seal_avx2_tail_256 + cmpq $384,%rbx + jbe seal_avx2_tail_384 + cmpq $512,%rbx + jbe seal_avx2_tail_512 + vmovdqa .chacha20_consts(%rip),%ymm0 + vmovdqa 64(%rbp),%ymm4 + vmovdqa 96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa .avx2_inc(%rip),%ymm12 + vpaddd 160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,256(%rbp) + vmovdqa %ymm14,224(%rbp) + vmovdqa %ymm13,192(%rbp) + vmovdqa %ymm12,160(%rbp) + vmovdqa %ymm8,128(%rbp) + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr 
$12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,128(%rbp) + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,128(%rbp) + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld 
$32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + + subq $16,%rdi + movq $9,%rcx + jmp 4f +1: + vmovdqa .chacha20_consts(%rip),%ymm0 + vmovdqa 64(%rbp),%ymm4 + vmovdqa 96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa .avx2_inc(%rip),%ymm12 + vpaddd 160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,256(%rbp) + vmovdqa %ymm14,224(%rbp) + vmovdqa %ymm13,192(%rbp) + vmovdqa %ymm12,160(%rbp) + + movq $10,%rcx +2: + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + vmovdqa %ymm8,128(%rbp) + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + addq %rax,%r15 + adcq %rdx,%r9 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +4: + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + addq 16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vmovdqa %ymm8,128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + 
vmovdqa 128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,128(%rbp) + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + addq %rax,%r15 + adcq %rdx,%r9 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + addq 32(%rdi),%r10 + adcq 8+32(%rdi),%r11 + adcq $1,%r12 + + leaq 48(%rdi),%rdi + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + addq %rax,%r15 + adcq %rdx,%r9 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + 
shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + decq %rcx + jne 2b + vpaddd .chacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 64(%rbp),%ymm7,%ymm7 + vpaddd 96(%rbp),%ymm11,%ymm11 + vpaddd 256(%rbp),%ymm15,%ymm15 + vpaddd .chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 64(%rbp),%ymm6,%ymm6 + vpaddd 96(%rbp),%ymm10,%ymm10 + vpaddd 224(%rbp),%ymm14,%ymm14 + vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 64(%rbp),%ymm5,%ymm5 + vpaddd 96(%rbp),%ymm9,%ymm9 + vpaddd 192(%rbp),%ymm13,%ymm13 + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 160(%rbp),%ymm12,%ymm12 + + leaq 32(%rdi),%rdi + vmovdqa %ymm0,128(%rbp) + addq -32(%rdi),%r10 + adcq 8+-32(%rdi),%r11 + adcq $1,%r12 + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 128(%rbp),%ymm0 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu %ymm10,96+128(%rdi) + addq -16(%rdi),%r10 + adcq 8+-16(%rdi),%r11 + adcq $1,%r12 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 
+ vperm2i128 $0x13,%ymm0,%ymm4,%ymm4 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm8 + vpxor 0+384(%rsi),%ymm3,%ymm3 + vpxor 32+384(%rsi),%ymm0,%ymm0 + vpxor 64+384(%rsi),%ymm4,%ymm4 + vpxor 96+384(%rsi),%ymm8,%ymm8 + vmovdqu %ymm3,0+384(%rdi) + vmovdqu %ymm0,32+384(%rdi) + vmovdqu %ymm4,64+384(%rdi) + vmovdqu %ymm8,96+384(%rdi) + + leaq 512(%rsi),%rsi + subq $512,%rbx + cmpq $512,%rbx + jg 1b + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + addq 16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + movq $10,%rcx + xorq %r8,%r8 + cmpq $128,%rbx + ja 3f + +seal_avx2_tail_128: + vmovdqa .chacha20_consts(%rip),%ymm0 + vmovdqa 64(%rbp),%ymm4 + vmovdqa 96(%rbp),%ymm8 + vmovdqa .avx2_inc(%rip),%ymm12 + vpaddd 160(%rbp),%ymm12,%ymm12 + vmovdqa %ymm12,160(%rbp) + +1: + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +2: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq 
%rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + addq 16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + decq %rcx + jg 1b + decq %r8 + jge 2b + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + jmp seal_avx2_short_loop +3: + cmpq $256,%rbx + ja 3f + +seal_avx2_tail_256: + vmovdqa .chacha20_consts(%rip),%ymm0 + vmovdqa 64(%rbp),%ymm4 + vmovdqa 96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa .avx2_inc(%rip),%ymm12 + vpaddd 160(%rbp),%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,160(%rbp) + vmovdqa %ymm13,192(%rbp) + +1: + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +2: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor 
%ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + addq 16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + decq %rcx + jg 1b + decq %r8 + jge 2b + vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 64(%rbp),%ymm5,%ymm5 + vpaddd 96(%rbp),%ymm9,%ymm9 + vpaddd 192(%rbp),%ymm13,%ymm13 + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 160(%rbp),%ymm12,%ymm12 + vperm2i128 
$0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm1,%ymm1 + vpxor 64+0(%rsi),%ymm5,%ymm5 + vpxor 96+0(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm1,32+0(%rdi) + vmovdqu %ymm5,64+0(%rdi) + vmovdqu %ymm9,96+0(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + movq $128,%rcx + leaq 128(%rsi),%rsi + subq $128,%rbx + jmp seal_avx2_hash +3: + cmpq $384,%rbx + ja seal_avx2_tail_512 + +seal_avx2_tail_384: + vmovdqa .chacha20_consts(%rip),%ymm0 + vmovdqa 64(%rbp),%ymm4 + vmovdqa 96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa .avx2_inc(%rip),%ymm12 + vpaddd 160(%rbp),%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,160(%rbp) + vmovdqa %ymm13,192(%rbp) + vmovdqa %ymm14,224(%rbp) + +1: + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +2: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq 
$0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + addq 16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + leaq 32(%rdi),%rdi + decq %rcx + jg 1b + decq %r8 + jge 2b + vpaddd .chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 64(%rbp),%ymm6,%ymm6 + vpaddd 96(%rbp),%ymm10,%ymm10 + vpaddd 224(%rbp),%ymm14,%ymm14 + vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 64(%rbp),%ymm5,%ymm5 + vpaddd 96(%rbp),%ymm9,%ymm9 + vpaddd 192(%rbp),%ymm13,%ymm13 + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 
$0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm2,%ymm2 + vpxor 64+0(%rsi),%ymm6,%ymm6 + vpxor 96+0(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm2,32+0(%rdi) + vmovdqu %ymm6,64+0(%rdi) + vmovdqu %ymm10,96+0(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm1,%ymm1 + vpxor 64+128(%rsi),%ymm5,%ymm5 + vpxor 96+128(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm1,32+128(%rdi) + vmovdqu %ymm5,64+128(%rdi) + vmovdqu %ymm9,96+128(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + movq $256,%rcx + leaq 256(%rsi),%rsi + subq $256,%rbx + jmp seal_avx2_hash + +seal_avx2_tail_512: + vmovdqa .chacha20_consts(%rip),%ymm0 + vmovdqa 64(%rbp),%ymm4 + vmovdqa 96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa .avx2_inc(%rip),%ymm12 + vpaddd 160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,256(%rbp) + vmovdqa %ymm14,224(%rbp) + vmovdqa %ymm13,192(%rbp) + vmovdqa %ymm12,160(%rbp) + +1: + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +2: + vmovdqa %ymm8,128(%rbp) + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + 
vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $25,%ymm7,%ymm8 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + addq %rax,%r15 + adcq %rdx,%r9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,128(%rbp) + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + addq 16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + vmovdqa .rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld 
$25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + + + + + + + + + + + + addq %rax,%r15 + adcq %rdx,%r9 + + + + + + + + + + + + + + + + + + + + + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + decq %rcx + jg 1b + decq %r8 + jge 2b + vpaddd .chacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 64(%rbp),%ymm7,%ymm7 + vpaddd 96(%rbp),%ymm11,%ymm11 + vpaddd 256(%rbp),%ymm15,%ymm15 + vpaddd .chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 64(%rbp),%ymm6,%ymm6 + vpaddd 96(%rbp),%ymm10,%ymm10 + vpaddd 224(%rbp),%ymm14,%ymm14 + vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 64(%rbp),%ymm5,%ymm5 + vpaddd 96(%rbp),%ymm9,%ymm9 + vpaddd 192(%rbp),%ymm13,%ymm13 + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,128(%rbp) + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 128(%rbp),%ymm0 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu %ymm10,96+128(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + movq $384,%rcx + leaq 384(%rsi),%rsi + subq $384,%rbx + jmp seal_avx2_hash + +seal_avx2_320: + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vpaddd .avx2_inc(%rip),%ymm12,%ymm13 + vpaddd .avx2_inc(%rip),%ymm13,%ymm14 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa %ymm12,160(%rbp) + vmovdqa %ymm13,192(%rbp) + vmovdqa %ymm14,224(%rbp) + movq $10,%r10 +1: + vpaddd 
%ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + decq %r10 + jne 1b + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd .chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd %ymm7,%ymm4,%ymm4 + vpaddd %ymm7,%ymm5,%ymm5 + vpaddd %ymm7,%ymm6,%ymm6 + vpaddd %ymm11,%ymm8,%ymm8 + vpaddd %ymm11,%ymm9,%ymm9 + 
vpaddd %ymm11,%ymm10,%ymm10 + vpaddd 160(%rbp),%ymm12,%ymm12 + vpaddd 192(%rbp),%ymm13,%ymm13 + vpaddd 224(%rbp),%ymm14,%ymm14 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand .clamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm9 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm13 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm6 + jmp seal_avx2_short + +seal_avx2_192: + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vpaddd .avx2_inc(%rip),%ymm12,%ymm13 + vmovdqa %ymm12,%ymm11 + vmovdqa %ymm13,%ymm15 + movq $10,%r10 +1: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + + decq %r10 + jne 1b + vpaddd %ymm2,%ymm0,%ymm0 + vpaddd %ymm2,%ymm1,%ymm1 + vpaddd %ymm6,%ymm4,%ymm4 + vpaddd %ymm6,%ymm5,%ymm5 + vpaddd %ymm10,%ymm8,%ymm8 + vpaddd %ymm10,%ymm9,%ymm9 + vpaddd %ymm11,%ymm12,%ymm12 + vpaddd %ymm15,%ymm13,%ymm13 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand .clamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 + 
vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 +seal_avx2_short: + movq %r8,%r8 + call poly_hash_ad_internal + xorq %rcx,%rcx +seal_avx2_hash: + cmpq $16,%rcx + jb seal_avx2_short_loop + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + subq $16,%rcx + addq $16,%rdi + jmp seal_avx2_hash +seal_avx2_short_loop: + cmpq $32,%rbx + jb seal_avx2_short_tail + subq $32,%rbx + + vpxor (%rsi),%ymm0,%ymm0 + vmovdqu %ymm0,(%rdi) + leaq 32(%rsi),%rsi + + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + addq 16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + + vmovdqa %ymm4,%ymm0 + vmovdqa %ymm8,%ymm4 + vmovdqa %ymm12,%ymm8 + vmovdqa %ymm1,%ymm12 + vmovdqa %ymm5,%ymm1 + vmovdqa %ymm9,%ymm5 + vmovdqa %ymm13,%ymm9 + vmovdqa %ymm2,%ymm13 + vmovdqa %ymm6,%ymm2 + jmp seal_avx2_short_loop +seal_avx2_short_tail: + cmpq $16,%rbx + jb 1f + subq $16,%rbx + vpxor (%rsi),%xmm0,%xmm3 + vmovdqu %xmm3,(%rdi) + leaq 16(%rsi),%rsi + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + 
adcq $0,%r12 + + leaq 16(%rdi),%rdi + vextracti128 $1,%ymm0,%xmm0 +1: + vzeroupper + jmp seal_sse_tail_16 +.cfi_endproc +#endif diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/aes/aes-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aes-x86_64.S similarity index 99% rename from packager/third_party/boringssl/linux-x86_64/crypto/aes/aes-x86_64.S rename to packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aes-x86_64.S index 361e84c77f..ff87f9824e 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/aes/aes-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aes-x86_64.S @@ -1,4 +1,4 @@ -#if defined(__x86_64__) +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) .text .type _x86_64_AES_encrypt,@function .align 16 @@ -332,6 +332,7 @@ _x86_64_AES_encrypt_compact: .type asm_AES_encrypt,@function .hidden asm_AES_encrypt asm_AES_encrypt: + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 @@ -340,7 +341,6 @@ asm_AES_encrypt: pushq %r15 - movq %rsp,%r10 leaq -63(%rdx),%rcx andq $-64,%rsp subq %rsp,%rcx @@ -350,7 +350,7 @@ asm_AES_encrypt: subq $32,%rsp movq %rsi,16(%rsp) - movq %r10,24(%rsp) + movq %rax,24(%rsp) .Lenc_prologue: movq %rdx,%r15 @@ -382,13 +382,13 @@ asm_AES_encrypt: movl %ecx,8(%r9) movl %edx,12(%r9) - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp .Lenc_epilogue: .byte 0xf3,0xc3 .size asm_AES_encrypt,.-asm_AES_encrypt @@ -778,6 +778,7 @@ _x86_64_AES_decrypt_compact: .type asm_AES_decrypt,@function .hidden asm_AES_decrypt asm_AES_decrypt: + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 @@ -786,7 +787,6 @@ asm_AES_decrypt: pushq %r15 - movq %rsp,%r10 leaq -63(%rdx),%rcx andq $-64,%rsp subq %rsp,%rcx @@ -796,7 +796,7 @@ asm_AES_decrypt: subq $32,%rsp movq %rsi,16(%rsp) - movq %r10,24(%rsp) + movq %rax,24(%rsp) .Ldec_prologue: movq %rdx,%r15 @@ -830,13 +830,13 @@ asm_AES_decrypt: movl %ecx,8(%r9) movl %edx,12(%r9) - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp .Ldec_epilogue: .byte 0xf3,0xc3 .size asm_AES_decrypt,.-asm_AES_decrypt @@ -1313,12 +1313,12 @@ asm_AES_cbc_encrypt: movl %r9d,%r9d leaq .LAES_Te(%rip),%r14 + leaq .LAES_Td(%rip),%r10 cmpq $0,%r9 - jne .Lcbc_picked_te - leaq .LAES_Td(%rip),%r14 -.Lcbc_picked_te: + cmoveq %r10,%r14 - movl OPENSSL_ia32cap_P(%rip),%r10d + leaq OPENSSL_ia32cap_P(%rip),%r10 + movl (%r10),%r10d cmpq $512,%rdx jb .Lcbc_slow_prologue testq $15,%rdx diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S new file mode 100644 index 0000000000..e7b4c48bef --- /dev/null +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S @@ -0,0 +1,834 @@ +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +.text + +.type _aesni_ctr32_ghash_6x,@function +.align 32 +_aesni_ctr32_ghash_6x: +.cfi_startproc + vmovdqu 32(%r11),%xmm2 + subq $6,%rdx + vpxor %xmm4,%xmm4,%xmm4 + vmovdqu 0-128(%rcx),%xmm15 + vpaddb %xmm2,%xmm1,%xmm10 + vpaddb %xmm2,%xmm10,%xmm11 
+ vpaddb %xmm2,%xmm11,%xmm12 + vpaddb %xmm2,%xmm12,%xmm13 + vpaddb %xmm2,%xmm13,%xmm14 + vpxor %xmm15,%xmm1,%xmm9 + vmovdqu %xmm4,16+8(%rsp) + jmp .Loop6x + +.align 32 +.Loop6x: + addl $100663296,%ebx + jc .Lhandle_ctr32 + vmovdqu 0-32(%r9),%xmm3 + vpaddb %xmm2,%xmm14,%xmm1 + vpxor %xmm15,%xmm10,%xmm10 + vpxor %xmm15,%xmm11,%xmm11 + +.Lresume_ctr32: + vmovdqu %xmm1,(%r8) + vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5 + vpxor %xmm15,%xmm12,%xmm12 + vmovups 16-128(%rcx),%xmm2 + vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 + + + + + + + + + + + + + + + + + + xorq %r12,%r12 + cmpq %r14,%r15 + + vaesenc %xmm2,%xmm9,%xmm9 + vmovdqu 48+8(%rsp),%xmm0 + vpxor %xmm15,%xmm13,%xmm13 + vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1 + vaesenc %xmm2,%xmm10,%xmm10 + vpxor %xmm15,%xmm14,%xmm14 + setnc %r12b + vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vmovdqu 16-32(%r9),%xmm3 + negq %r12 + vaesenc %xmm2,%xmm12,%xmm12 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5 + vpxor %xmm4,%xmm8,%xmm8 + vaesenc %xmm2,%xmm13,%xmm13 + vpxor %xmm5,%xmm1,%xmm4 + andq $0x60,%r12 + vmovups 32-128(%rcx),%xmm15 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1 + vaesenc %xmm2,%xmm14,%xmm14 + + vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2 + leaq (%r14,%r12,1),%r14 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor 16+8(%rsp),%xmm8,%xmm8 + vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3 + vmovdqu 64+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 88(%r14),%r13 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 80(%r14),%r12 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,32+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,40+8(%rsp) + vmovdqu 48-32(%r9),%xmm5 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 48-128(%rcx),%xmm15 + vpxor %xmm1,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm2,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor %xmm3,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3 + vaesenc %xmm15,%xmm11,%xmm11 + vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5 + vmovdqu 80+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqu 64-32(%r9),%xmm1 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 64-128(%rcx),%xmm15 + vpxor %xmm2,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm3,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 72(%r14),%r13 + vpxor %xmm5,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 64(%r14),%r12 + vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1 + vmovdqu 96+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,48+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,56+8(%rsp) + vpxor %xmm2,%xmm4,%xmm4 + vmovdqu 96-32(%r9),%xmm2 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 80-128(%rcx),%xmm15 + vpxor %xmm3,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 56(%r14),%r13 + vpxor %xmm1,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1 + vpxor 112+8(%rsp),%xmm8,%xmm8 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 48(%r14),%r12 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,64+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,72+8(%rsp) + vpxor %xmm3,%xmm4,%xmm4 + vmovdqu 112-32(%r9),%xmm3 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 96-128(%rcx),%xmm15 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm1,%xmm6,%xmm6 + vpclmulqdq 
$0x01,%xmm3,%xmm8,%xmm1 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 40(%r14),%r13 + vpxor %xmm2,%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 32(%r14),%r12 + vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,80+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,88+8(%rsp) + vpxor %xmm5,%xmm6,%xmm6 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor %xmm1,%xmm6,%xmm6 + + vmovups 112-128(%rcx),%xmm15 + vpslldq $8,%xmm6,%xmm5 + vpxor %xmm2,%xmm4,%xmm4 + vmovdqu 16(%r11),%xmm3 + + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm8,%xmm7,%xmm7 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor %xmm5,%xmm4,%xmm4 + movbeq 24(%r14),%r13 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 16(%r14),%r12 + vpalignr $8,%xmm4,%xmm4,%xmm0 + vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 + movq %r13,96+8(%rsp) + vaesenc %xmm15,%xmm12,%xmm12 + movq %r12,104+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + vmovups 128-128(%rcx),%xmm1 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vmovups 144-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm10,%xmm10 + vpsrldq $8,%xmm6,%xmm6 + vaesenc %xmm1,%xmm11,%xmm11 + vpxor %xmm6,%xmm7,%xmm7 + vaesenc %xmm1,%xmm12,%xmm12 + vpxor %xmm0,%xmm4,%xmm4 + movbeq 8(%r14),%r13 + vaesenc %xmm1,%xmm13,%xmm13 + movbeq 0(%r14),%r12 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 160-128(%rcx),%xmm1 + cmpl $11,%ebp + jb .Lenc_tail + + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + vmovups 176-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 192-128(%rcx),%xmm1 + je .Lenc_tail + + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + vmovups 208-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 224-128(%rcx),%xmm1 + jmp .Lenc_tail + +.align 32 +.Lhandle_ctr32: + vmovdqu (%r11),%xmm0 + vpshufb %xmm0,%xmm1,%xmm6 + vmovdqu 48(%r11),%xmm5 + vpaddd 64(%r11),%xmm6,%xmm10 + vpaddd %xmm5,%xmm6,%xmm11 + vmovdqu 0-32(%r9),%xmm3 + vpaddd %xmm5,%xmm10,%xmm12 + vpshufb %xmm0,%xmm10,%xmm10 + vpaddd %xmm5,%xmm11,%xmm13 + vpshufb %xmm0,%xmm11,%xmm11 + vpxor %xmm15,%xmm10,%xmm10 + vpaddd %xmm5,%xmm12,%xmm14 + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm15,%xmm11,%xmm11 + vpaddd %xmm5,%xmm13,%xmm1 + vpshufb %xmm0,%xmm13,%xmm13 + vpshufb %xmm0,%xmm14,%xmm14 + vpshufb %xmm0,%xmm1,%xmm1 + jmp .Lresume_ctr32 + +.align 32 +.Lenc_tail: + vaesenc %xmm15,%xmm9,%xmm9 + vmovdqu %xmm7,16+8(%rsp) + vpalignr $8,%xmm4,%xmm4,%xmm8 + vaesenc %xmm15,%xmm10,%xmm10 + vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 + vpxor 0(%rdi),%xmm1,%xmm2 + vaesenc %xmm15,%xmm11,%xmm11 + vpxor 16(%rdi),%xmm1,%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + vpxor 32(%rdi),%xmm1,%xmm5 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor 48(%rdi),%xmm1,%xmm6 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor 64(%rdi),%xmm1,%xmm7 + vpxor 80(%rdi),%xmm1,%xmm3 + vmovdqu (%r8),%xmm1 + + vaesenclast %xmm2,%xmm9,%xmm9 + vmovdqu 32(%r11),%xmm2 + vaesenclast %xmm0,%xmm10,%xmm10 + vpaddb %xmm2,%xmm1,%xmm0 + movq %r13,112+8(%rsp) + leaq 96(%rdi),%rdi + vaesenclast %xmm5,%xmm11,%xmm11 + vpaddb %xmm2,%xmm0,%xmm5 + movq 
%r12,120+8(%rsp) + leaq 96(%rsi),%rsi + vmovdqu 0-128(%rcx),%xmm15 + vaesenclast %xmm6,%xmm12,%xmm12 + vpaddb %xmm2,%xmm5,%xmm6 + vaesenclast %xmm7,%xmm13,%xmm13 + vpaddb %xmm2,%xmm6,%xmm7 + vaesenclast %xmm3,%xmm14,%xmm14 + vpaddb %xmm2,%xmm7,%xmm3 + + addq $0x60,%r10 + subq $0x6,%rdx + jc .L6x_done + + vmovups %xmm9,-96(%rsi) + vpxor %xmm15,%xmm1,%xmm9 + vmovups %xmm10,-80(%rsi) + vmovdqa %xmm0,%xmm10 + vmovups %xmm11,-64(%rsi) + vmovdqa %xmm5,%xmm11 + vmovups %xmm12,-48(%rsi) + vmovdqa %xmm6,%xmm12 + vmovups %xmm13,-32(%rsi) + vmovdqa %xmm7,%xmm13 + vmovups %xmm14,-16(%rsi) + vmovdqa %xmm3,%xmm14 + vmovdqu 32+8(%rsp),%xmm7 + jmp .Loop6x + +.L6x_done: + vpxor 16+8(%rsp),%xmm8,%xmm8 + vpxor %xmm4,%xmm8,%xmm8 + + .byte 0xf3,0xc3 +.cfi_endproc +.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x +.globl aesni_gcm_decrypt +.hidden aesni_gcm_decrypt +.type aesni_gcm_decrypt,@function +.align 32 +aesni_gcm_decrypt: +.cfi_startproc + xorq %r10,%r10 + + + + cmpq $0x60,%rdx + jb .Lgcm_dec_abort + + leaq (%rsp),%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + vzeroupper + + vmovdqu (%r8),%xmm1 + addq $-128,%rsp + movl 12(%r8),%ebx + leaq .Lbswap_mask(%rip),%r11 + leaq -128(%rcx),%r14 + movq $0xf80,%r15 + vmovdqu (%r9),%xmm8 + andq $-128,%rsp + vmovdqu (%r11),%xmm0 + leaq 128(%rcx),%rcx + leaq 32+32(%r9),%r9 + movl 240-128(%rcx),%ebp + vpshufb %xmm0,%xmm8,%xmm8 + + andq %r15,%r14 + andq %rsp,%r15 + subq %r14,%r15 + jc .Ldec_no_key_aliasing + cmpq $768,%r15 + jnc .Ldec_no_key_aliasing + subq %r15,%rsp +.Ldec_no_key_aliasing: + + vmovdqu 80(%rdi),%xmm7 + leaq (%rdi),%r14 + vmovdqu 64(%rdi),%xmm4 + + + + + + + + leaq -192(%rdi,%rdx,1),%r15 + + vmovdqu 48(%rdi),%xmm5 + shrq $4,%rdx + xorq %r10,%r10 + vmovdqu 32(%rdi),%xmm6 + vpshufb %xmm0,%xmm7,%xmm7 + vmovdqu 16(%rdi),%xmm2 + vpshufb %xmm0,%xmm4,%xmm4 + vmovdqu (%rdi),%xmm3 + vpshufb %xmm0,%xmm5,%xmm5 + vmovdqu %xmm4,48(%rsp) + vpshufb %xmm0,%xmm6,%xmm6 + vmovdqu %xmm5,64(%rsp) + vpshufb %xmm0,%xmm2,%xmm2 + vmovdqu %xmm6,80(%rsp) + vpshufb %xmm0,%xmm3,%xmm3 + vmovdqu %xmm2,96(%rsp) + vmovdqu %xmm3,112(%rsp) + + call _aesni_ctr32_ghash_6x + + vmovups %xmm9,-96(%rsi) + vmovups %xmm10,-80(%rsi) + vmovups %xmm11,-64(%rsi) + vmovups %xmm12,-48(%rsi) + vmovups %xmm13,-32(%rsi) + vmovups %xmm14,-16(%rsi) + + vpshufb (%r11),%xmm8,%xmm8 + vmovdqu %xmm8,-64(%r9) + + vzeroupper + movq -48(%rax),%r15 +.cfi_restore %r15 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Lgcm_dec_abort: + movq %r10,%rax + .byte 0xf3,0xc3 +.cfi_endproc +.size aesni_gcm_decrypt,.-aesni_gcm_decrypt +.type _aesni_ctr32_6x,@function +.align 32 +_aesni_ctr32_6x: +.cfi_startproc + vmovdqu 0-128(%rcx),%xmm4 + vmovdqu 32(%r11),%xmm2 + leaq -1(%rbp),%r13 + vmovups 16-128(%rcx),%xmm15 + leaq 32-128(%rcx),%r12 + vpxor %xmm4,%xmm1,%xmm9 + addl $100663296,%ebx + jc .Lhandle_ctr32_2 + vpaddb %xmm2,%xmm1,%xmm10 + vpaddb %xmm2,%xmm10,%xmm11 + vpxor %xmm4,%xmm10,%xmm10 + vpaddb %xmm2,%xmm11,%xmm12 + vpxor %xmm4,%xmm11,%xmm11 + vpaddb %xmm2,%xmm12,%xmm13 + vpxor %xmm4,%xmm12,%xmm12 + vpaddb %xmm2,%xmm13,%xmm14 + vpxor %xmm4,%xmm13,%xmm13 + vpaddb %xmm2,%xmm14,%xmm1 + vpxor %xmm4,%xmm14,%xmm14 + jmp .Loop_ctr32 
+ +.align 16 +.Loop_ctr32: + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + vmovups (%r12),%xmm15 + leaq 16(%r12),%r12 + decl %r13d + jnz .Loop_ctr32 + + vmovdqu (%r12),%xmm3 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor 0(%rdi),%xmm3,%xmm4 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor 16(%rdi),%xmm3,%xmm5 + vaesenc %xmm15,%xmm11,%xmm11 + vpxor 32(%rdi),%xmm3,%xmm6 + vaesenc %xmm15,%xmm12,%xmm12 + vpxor 48(%rdi),%xmm3,%xmm8 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor 64(%rdi),%xmm3,%xmm2 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor 80(%rdi),%xmm3,%xmm3 + leaq 96(%rdi),%rdi + + vaesenclast %xmm4,%xmm9,%xmm9 + vaesenclast %xmm5,%xmm10,%xmm10 + vaesenclast %xmm6,%xmm11,%xmm11 + vaesenclast %xmm8,%xmm12,%xmm12 + vaesenclast %xmm2,%xmm13,%xmm13 + vaesenclast %xmm3,%xmm14,%xmm14 + vmovups %xmm9,0(%rsi) + vmovups %xmm10,16(%rsi) + vmovups %xmm11,32(%rsi) + vmovups %xmm12,48(%rsi) + vmovups %xmm13,64(%rsi) + vmovups %xmm14,80(%rsi) + leaq 96(%rsi),%rsi + + .byte 0xf3,0xc3 +.align 32 +.Lhandle_ctr32_2: + vpshufb %xmm0,%xmm1,%xmm6 + vmovdqu 48(%r11),%xmm5 + vpaddd 64(%r11),%xmm6,%xmm10 + vpaddd %xmm5,%xmm6,%xmm11 + vpaddd %xmm5,%xmm10,%xmm12 + vpshufb %xmm0,%xmm10,%xmm10 + vpaddd %xmm5,%xmm11,%xmm13 + vpshufb %xmm0,%xmm11,%xmm11 + vpxor %xmm4,%xmm10,%xmm10 + vpaddd %xmm5,%xmm12,%xmm14 + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm4,%xmm11,%xmm11 + vpaddd %xmm5,%xmm13,%xmm1 + vpshufb %xmm0,%xmm13,%xmm13 + vpxor %xmm4,%xmm12,%xmm12 + vpshufb %xmm0,%xmm14,%xmm14 + vpxor %xmm4,%xmm13,%xmm13 + vpshufb %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm14,%xmm14 + jmp .Loop_ctr32 +.cfi_endproc +.size _aesni_ctr32_6x,.-_aesni_ctr32_6x + +.globl aesni_gcm_encrypt +.hidden aesni_gcm_encrypt +.type aesni_gcm_encrypt,@function +.align 32 +aesni_gcm_encrypt: +.cfi_startproc + xorq %r10,%r10 + + + + + cmpq $288,%rdx + jb .Lgcm_enc_abort + + leaq (%rsp),%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + vzeroupper + + vmovdqu (%r8),%xmm1 + addq $-128,%rsp + movl 12(%r8),%ebx + leaq .Lbswap_mask(%rip),%r11 + leaq -128(%rcx),%r14 + movq $0xf80,%r15 + leaq 128(%rcx),%rcx + vmovdqu (%r11),%xmm0 + andq $-128,%rsp + movl 240-128(%rcx),%ebp + + andq %r15,%r14 + andq %rsp,%r15 + subq %r14,%r15 + jc .Lenc_no_key_aliasing + cmpq $768,%r15 + jnc .Lenc_no_key_aliasing + subq %r15,%rsp +.Lenc_no_key_aliasing: + + leaq (%rsi),%r14 + + + + + + + + + leaq -192(%rsi,%rdx,1),%r15 + + shrq $4,%rdx + + call _aesni_ctr32_6x + vpshufb %xmm0,%xmm9,%xmm8 + vpshufb %xmm0,%xmm10,%xmm2 + vmovdqu %xmm8,112(%rsp) + vpshufb %xmm0,%xmm11,%xmm4 + vmovdqu %xmm2,96(%rsp) + vpshufb %xmm0,%xmm12,%xmm5 + vmovdqu %xmm4,80(%rsp) + vpshufb %xmm0,%xmm13,%xmm6 + vmovdqu %xmm5,64(%rsp) + vpshufb %xmm0,%xmm14,%xmm7 + vmovdqu %xmm6,48(%rsp) + + call _aesni_ctr32_6x + + vmovdqu (%r9),%xmm8 + leaq 32+32(%r9),%r9 + subq $12,%rdx + movq $192,%r10 + vpshufb %xmm0,%xmm8,%xmm8 + + call _aesni_ctr32_ghash_6x + vmovdqu 32(%rsp),%xmm7 + vmovdqu (%r11),%xmm0 + vmovdqu 0-32(%r9),%xmm3 + vpunpckhqdq %xmm7,%xmm7,%xmm1 + vmovdqu 32-32(%r9),%xmm15 + vmovups %xmm9,-96(%rsi) + vpshufb %xmm0,%xmm9,%xmm9 + vpxor %xmm7,%xmm1,%xmm1 + vmovups %xmm10,-80(%rsi) + vpshufb %xmm0,%xmm10,%xmm10 + vmovups %xmm11,-64(%rsi) + vpshufb %xmm0,%xmm11,%xmm11 + vmovups %xmm12,-48(%rsi) + vpshufb %xmm0,%xmm12,%xmm12 
+ vmovups %xmm13,-32(%rsi) + vpshufb %xmm0,%xmm13,%xmm13 + vmovups %xmm14,-16(%rsi) + vpshufb %xmm0,%xmm14,%xmm14 + vmovdqu %xmm9,16(%rsp) + vmovdqu 48(%rsp),%xmm6 + vmovdqu 16-32(%r9),%xmm0 + vpunpckhqdq %xmm6,%xmm6,%xmm2 + vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5 + vpxor %xmm6,%xmm2,%xmm2 + vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 + + vmovdqu 64(%rsp),%xmm9 + vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4 + vmovdqu 48-32(%r9),%xmm3 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm9,%xmm9,%xmm5 + vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6 + vpxor %xmm9,%xmm5,%xmm5 + vpxor %xmm7,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 + vmovdqu 80-32(%r9),%xmm15 + vpxor %xmm1,%xmm2,%xmm2 + + vmovdqu 80(%rsp),%xmm1 + vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7 + vmovdqu 64-32(%r9),%xmm0 + vpxor %xmm4,%xmm7,%xmm7 + vpunpckhqdq %xmm1,%xmm1,%xmm4 + vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpxor %xmm6,%xmm9,%xmm9 + vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 96(%rsp),%xmm2 + vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6 + vmovdqu 96-32(%r9),%xmm3 + vpxor %xmm7,%xmm6,%xmm6 + vpunpckhqdq %xmm2,%xmm2,%xmm7 + vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpxor %xmm9,%xmm1,%xmm1 + vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4 + vmovdqu 128-32(%r9),%xmm15 + vpxor %xmm5,%xmm4,%xmm4 + + vpxor 112(%rsp),%xmm8,%xmm8 + vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5 + vmovdqu 112-32(%r9),%xmm0 + vpunpckhqdq %xmm8,%xmm8,%xmm9 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2 + vpxor %xmm8,%xmm9,%xmm9 + vpxor %xmm1,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7 + vpxor %xmm4,%xmm7,%xmm4 + + vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6 + vmovdqu 0-32(%r9),%xmm3 + vpunpckhqdq %xmm14,%xmm14,%xmm1 + vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8 + vpxor %xmm14,%xmm1,%xmm1 + vpxor %xmm5,%xmm6,%xmm5 + vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9 + vmovdqu 32-32(%r9),%xmm15 + vpxor %xmm2,%xmm8,%xmm7 + vpxor %xmm4,%xmm9,%xmm6 + + vmovdqu 16-32(%r9),%xmm0 + vpxor %xmm5,%xmm7,%xmm9 + vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4 + vpxor %xmm9,%xmm6,%xmm6 + vpunpckhqdq %xmm13,%xmm13,%xmm2 + vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14 + vpxor %xmm13,%xmm2,%xmm2 + vpslldq $8,%xmm6,%xmm9 + vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 + vpxor %xmm9,%xmm5,%xmm8 + vpsrldq $8,%xmm6,%xmm6 + vpxor %xmm6,%xmm7,%xmm7 + + vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5 + vmovdqu 48-32(%r9),%xmm3 + vpxor %xmm4,%xmm5,%xmm5 + vpunpckhqdq %xmm12,%xmm12,%xmm9 + vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13 + vpxor %xmm12,%xmm9,%xmm9 + vpxor %xmm14,%xmm13,%xmm13 + vpalignr $8,%xmm8,%xmm8,%xmm14 + vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 + vmovdqu 80-32(%r9),%xmm15 + vpxor %xmm1,%xmm2,%xmm2 + + vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4 + vmovdqu 64-32(%r9),%xmm0 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm11,%xmm11,%xmm1 + vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12 + vpxor %xmm11,%xmm1,%xmm1 + vpxor %xmm13,%xmm12,%xmm12 + vxorps 16(%rsp),%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9 + vpxor %xmm2,%xmm9,%xmm9 + + vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 + vxorps %xmm14,%xmm8,%xmm8 + + vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5 + vmovdqu 96-32(%r9),%xmm3 + vpxor %xmm4,%xmm5,%xmm5 + vpunpckhqdq %xmm10,%xmm10,%xmm2 + vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11 + vpxor %xmm10,%xmm2,%xmm2 + vpalignr $8,%xmm8,%xmm8,%xmm14 + vpxor %xmm12,%xmm11,%xmm11 + vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1 + vmovdqu 128-32(%r9),%xmm15 + vpxor %xmm9,%xmm1,%xmm1 + + vxorps %xmm7,%xmm14,%xmm14 + vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 + vxorps %xmm14,%xmm8,%xmm8 + + vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4 + vmovdqu 112-32(%r9),%xmm0 + vpxor 
%xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm8,%xmm8,%xmm9 + vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10 + vpxor %xmm8,%xmm9,%xmm9 + vpxor %xmm11,%xmm10,%xmm10 + vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2 + vpxor %xmm1,%xmm2,%xmm2 + + vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5 + vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7 + vpxor %xmm4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6 + vpxor %xmm10,%xmm7,%xmm7 + vpxor %xmm2,%xmm6,%xmm6 + + vpxor %xmm5,%xmm7,%xmm4 + vpxor %xmm4,%xmm6,%xmm6 + vpslldq $8,%xmm6,%xmm1 + vmovdqu 16(%r11),%xmm3 + vpsrldq $8,%xmm6,%xmm6 + vpxor %xmm1,%xmm5,%xmm8 + vpxor %xmm6,%xmm7,%xmm7 + + vpalignr $8,%xmm8,%xmm8,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 + vpxor %xmm2,%xmm8,%xmm8 + + vpalignr $8,%xmm8,%xmm8,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 + vpxor %xmm7,%xmm2,%xmm2 + vpxor %xmm2,%xmm8,%xmm8 + vpshufb (%r11),%xmm8,%xmm8 + vmovdqu %xmm8,-64(%r9) + + vzeroupper + movq -48(%rax),%r15 +.cfi_restore %r15 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Lgcm_enc_abort: + movq %r10,%rax + .byte 0xf3,0xc3 +.cfi_endproc +.size aesni_gcm_encrypt,.-aesni_gcm_encrypt +.align 64 +.Lbswap_mask: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.Lpoly: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +.Lone_msb: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +.Ltwo_lsb: +.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.Lone_lsb: +.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 64 +#endif diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/aes/aesni-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-x86_64.S similarity index 81% rename from packager/third_party/boringssl/linux-x86_64/crypto/aes/aesni-x86_64.S rename to packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-x86_64.S index 5709a2d024..0c980a304b 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/aes/aesni-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-x86_64.S @@ -1,4 +1,4 @@ -#if defined(__x86_64__) +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) .text .extern OPENSSL_ia32cap_P .hidden OPENSSL_ia32cap_P @@ -1032,11 +1032,10 @@ aesni_ctr32_encrypt_blocks: .align 16 .Lctr32_bulk: - leaq (%rsp),%rax + leaq (%rsp),%r11 pushq %rbp subq $128,%rsp andq $-16,%rsp - leaq -8(%rax),%rbp @@ -1045,7 +1044,7 @@ aesni_ctr32_encrypt_blocks: movdqu (%rcx),%xmm0 movl 12(%r8),%r8d pxor %xmm0,%xmm2 - movl 12(%rcx),%r11d + movl 12(%rcx),%ebp movdqa %xmm2,0(%rsp) bswapl %r8d movdqa %xmm2,%xmm3 @@ -1061,8 +1060,8 @@ aesni_ctr32_encrypt_blocks: leaq 2(%r8),%rdx bswapl %eax bswapl %edx - xorl %r11d,%eax - xorl %r11d,%edx + xorl %ebp,%eax + xorl %ebp,%edx .byte 102,15,58,34,216,3 leaq 3(%r8),%rax movdqa %xmm3,16(%rsp) @@ -1071,25 +1070,26 @@ aesni_ctr32_encrypt_blocks: movq %r10,%rdx leaq 4(%r8),%r10 movdqa %xmm4,32(%rsp) - xorl %r11d,%eax + xorl %ebp,%eax bswapl %r10d .byte 102,15,58,34,232,3 - xorl %r11d,%r10d + xorl %ebp,%r10d movdqa %xmm5,48(%rsp) leaq 5(%r8),%r9 movl %r10d,64+12(%rsp) bswapl %r9d leaq 6(%r8),%r10 movl 240(%rcx),%eax - xorl %r11d,%r9d + xorl %ebp,%r9d bswapl %r10d movl %r9d,80+12(%rsp) - xorl %r11d,%r10d + xorl %ebp,%r10d leaq 7(%r8),%r9 movl 
%r10d,96+12(%rsp) bswapl %r9d - movl OPENSSL_ia32cap_P+4(%rip),%r10d - xorl %r11d,%r9d + leaq OPENSSL_ia32cap_P(%rip),%r10 + movl 4(%r10),%r10d + xorl %ebp,%r9d andl $71303168,%r10d movl %r9d,112+12(%rsp) @@ -1113,7 +1113,7 @@ aesni_ctr32_encrypt_blocks: .Lctr32_6x: shll $4,%eax movl $48,%r10d - bswapl %r11d + bswapl %ebp leaq 32(%rcx,%rax,1),%rcx subq %rax,%r10 jmp .Lctr32_loop6 @@ -1124,32 +1124,32 @@ aesni_ctr32_encrypt_blocks: movups -48(%rcx,%r10,1),%xmm0 .byte 102,15,56,220,209 movl %r8d,%eax - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,217 .byte 0x0f,0x38,0xf1,0x44,0x24,12 leal 1(%r8),%eax .byte 102,15,56,220,225 - xorl %r11d,%eax + xorl %ebp,%eax .byte 0x0f,0x38,0xf1,0x44,0x24,28 .byte 102,15,56,220,233 leal 2(%r8),%eax - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,241 .byte 0x0f,0x38,0xf1,0x44,0x24,44 leal 3(%r8),%eax .byte 102,15,56,220,249 movups -32(%rcx,%r10,1),%xmm1 - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,208 .byte 0x0f,0x38,0xf1,0x44,0x24,60 leal 4(%r8),%eax .byte 102,15,56,220,216 - xorl %r11d,%eax + xorl %ebp,%eax .byte 0x0f,0x38,0xf1,0x44,0x24,76 .byte 102,15,56,220,224 leal 5(%r8),%eax - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,232 .byte 0x0f,0x38,0xf1,0x44,0x24,92 movq %r10,%rax @@ -1210,7 +1210,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d movups 32-128(%rcx),%xmm0 .byte 102,15,56,220,225 - xorl %r11d,%r9d + xorl %ebp,%r9d nop .byte 102,15,56,220,233 movl %r9d,0+12(%rsp) @@ -1223,7 +1223,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 @@ -1237,7 +1237,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 @@ -1251,7 +1251,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 @@ -1265,7 +1265,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 @@ -1279,7 +1279,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 @@ -1293,7 +1293,7 @@ aesni_ctr32_encrypt_blocks: bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 @@ -1308,7 +1308,7 @@ aesni_ctr32_encrypt_blocks: .byte 102,15,56,220,208 .byte 102,15,56,220,216 .byte 102,15,56,220,224 - xorl %r11d,%r9d + xorl %ebp,%r9d movdqu 0(%rdi),%xmm10 .byte 102,15,56,220,232 movl %r9d,112+12(%rsp) @@ -1543,7 +1543,7 @@ aesni_ctr32_encrypt_blocks: .Lctr32_done: xorps %xmm0,%xmm0 - xorl %r11d,%r11d + xorl %ebp,%ebp pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 pxor %xmm3,%xmm3 @@ -1567,8 +1567,8 @@ aesni_ctr32_encrypt_blocks: pxor %xmm14,%xmm14 movaps %xmm0,112(%rsp) pxor %xmm15,%xmm15 - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp + leaq (%r11),%rsp .Lctr32_epilogue: .byte 0xf3,0xc3 .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks @@ -1577,11 +1577,10 @@ aesni_ctr32_encrypt_blocks: .type aesni_xts_encrypt,@function .align 16 aesni_xts_encrypt: - leaq (%rsp),%rax + leaq (%rsp),%r11 pushq %rbp 
subq $112,%rsp andq $-16,%rsp - leaq -8(%rax),%rbp movups (%r9),%xmm2 movl 240(%r8),%eax movl 240(%rcx),%r10d @@ -1597,7 +1596,7 @@ aesni_xts_encrypt: jnz .Loop_enc1_8 .byte 102,15,56,221,209 movups (%rcx),%xmm0 - movq %rcx,%r11 + movq %rcx,%rbp movl %r10d,%eax shll $4,%r10d movq %rdx,%r9 @@ -1653,9 +1652,9 @@ aesni_xts_encrypt: jc .Lxts_enc_short movl $16+96,%eax - leaq 32(%r11,%r10,1),%rcx + leaq 32(%rbp,%r10,1),%rcx subq %r10,%rax - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 movq %rax,%r10 leaq .Lxts_magic(%rip),%r8 jmp .Lxts_enc_grandloop @@ -1680,7 +1679,7 @@ aesni_xts_encrypt: movdqa 96(%rsp),%xmm9 pxor %xmm14,%xmm6 .byte 102,15,56,220,233 - movups 32(%r11),%xmm0 + movups 32(%rbp),%xmm0 leaq 96(%rdi),%rdi pxor %xmm8,%xmm7 @@ -1689,7 +1688,7 @@ aesni_xts_encrypt: pxor %xmm9,%xmm11 movdqa %xmm10,0(%rsp) .byte 102,15,56,220,249 - movups 48(%r11),%xmm1 + movups 48(%rbp),%xmm1 pxor %xmm9,%xmm12 .byte 102,15,56,220,208 @@ -1704,7 +1703,7 @@ aesni_xts_encrypt: movdqa %xmm14,64(%rsp) .byte 102,15,56,220,240 .byte 102,15,56,220,248 - movups 64(%r11),%xmm0 + movups 64(%rbp),%xmm0 movdqa %xmm8,80(%rsp) pshufd $0x5f,%xmm15,%xmm9 jmp .Lxts_enc_loop6 @@ -1736,7 +1735,7 @@ aesni_xts_encrypt: psrad $31,%xmm14 .byte 102,15,56,220,217 pand %xmm8,%xmm14 - movups (%r11),%xmm10 + movups (%rbp),%xmm10 .byte 102,15,56,220,225 .byte 102,15,56,220,233 .byte 102,15,56,220,241 @@ -1804,10 +1803,10 @@ aesni_xts_encrypt: .byte 102,15,56,220,225 .byte 102,15,56,220,233 pxor %xmm0,%xmm15 - movups (%r11),%xmm0 + movups (%rbp),%xmm0 .byte 102,15,56,220,241 .byte 102,15,56,220,249 - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 pxor %xmm15,%xmm14 .byte 102,15,56,221,84,36,0 @@ -1834,7 +1833,7 @@ aesni_xts_encrypt: movl $16+96,%eax subl %r10d,%eax - movq %r11,%rcx + movq %rbp,%rcx shrl $4,%eax .Lxts_enc_short: @@ -1990,7 +1989,7 @@ aesni_xts_encrypt: jnz .Lxts_enc_steal subq %r9,%rsi - movq %r11,%rcx + movq %rbp,%rcx movl %r10d,%eax movups -16(%rsi),%xmm2 @@ -2033,8 +2032,8 @@ aesni_xts_encrypt: movaps %xmm0,96(%rsp) pxor %xmm14,%xmm14 pxor %xmm15,%xmm15 - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp + leaq (%r11),%rsp .Lxts_enc_epilogue: .byte 0xf3,0xc3 .size aesni_xts_encrypt,.-aesni_xts_encrypt @@ -2043,11 +2042,10 @@ aesni_xts_encrypt: .type aesni_xts_decrypt,@function .align 16 aesni_xts_decrypt: - leaq (%rsp),%rax + leaq (%rsp),%r11 pushq %rbp subq $112,%rsp andq $-16,%rsp - leaq -8(%rax),%rbp movups (%r9),%xmm2 movl 240(%r8),%eax movl 240(%rcx),%r10d @@ -2069,7 +2067,7 @@ aesni_xts_decrypt: subq %rax,%rdx movups (%rcx),%xmm0 - movq %rcx,%r11 + movq %rcx,%rbp movl %r10d,%eax shll $4,%r10d movq %rdx,%r9 @@ -2125,9 +2123,9 @@ aesni_xts_decrypt: jc .Lxts_dec_short movl $16+96,%eax - leaq 32(%r11,%r10,1),%rcx + leaq 32(%rbp,%r10,1),%rcx subq %r10,%rax - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 movq %rax,%r10 leaq .Lxts_magic(%rip),%r8 jmp .Lxts_dec_grandloop @@ -2152,7 +2150,7 @@ aesni_xts_decrypt: movdqa 96(%rsp),%xmm9 pxor %xmm14,%xmm6 .byte 102,15,56,222,233 - movups 32(%r11),%xmm0 + movups 32(%rbp),%xmm0 leaq 96(%rdi),%rdi pxor %xmm8,%xmm7 @@ -2161,7 +2159,7 @@ aesni_xts_decrypt: pxor %xmm9,%xmm11 movdqa %xmm10,0(%rsp) .byte 102,15,56,222,249 - movups 48(%r11),%xmm1 + movups 48(%rbp),%xmm1 pxor %xmm9,%xmm12 .byte 102,15,56,222,208 @@ -2176,7 +2174,7 @@ aesni_xts_decrypt: movdqa %xmm14,64(%rsp) .byte 102,15,56,222,240 .byte 102,15,56,222,248 - movups 64(%r11),%xmm0 + movups 64(%rbp),%xmm0 movdqa %xmm8,80(%rsp) pshufd $0x5f,%xmm15,%xmm9 jmp .Lxts_dec_loop6 @@ -2208,7 +2206,7 @@ aesni_xts_decrypt: 
psrad $31,%xmm14 .byte 102,15,56,222,217 pand %xmm8,%xmm14 - movups (%r11),%xmm10 + movups (%rbp),%xmm10 .byte 102,15,56,222,225 .byte 102,15,56,222,233 .byte 102,15,56,222,241 @@ -2276,10 +2274,10 @@ aesni_xts_decrypt: .byte 102,15,56,222,225 .byte 102,15,56,222,233 pxor %xmm0,%xmm15 - movups (%r11),%xmm0 + movups (%rbp),%xmm0 .byte 102,15,56,222,241 .byte 102,15,56,222,249 - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 pxor %xmm15,%xmm14 .byte 102,15,56,223,84,36,0 @@ -2306,7 +2304,7 @@ aesni_xts_decrypt: movl $16+96,%eax subl %r10d,%eax - movq %r11,%rcx + movq %rbp,%rcx shrl $4,%eax .Lxts_dec_short: @@ -2463,7 +2461,7 @@ aesni_xts_decrypt: jz .Lxts_dec_ret .Lxts_dec_done2: movq %r9,%rdx - movq %r11,%rcx + movq %rbp,%rcx movl %r10d,%eax movups (%rdi),%xmm2 @@ -2493,7 +2491,7 @@ aesni_xts_decrypt: jnz .Lxts_dec_steal subq %r9,%rsi - movq %r11,%rcx + movq %rbp,%rcx movl %r10d,%eax movups (%rsi),%xmm2 @@ -2536,11 +2534,827 @@ aesni_xts_decrypt: movaps %xmm0,96(%rsp) pxor %xmm14,%xmm14 pxor %xmm15,%xmm15 - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp + leaq (%r11),%rsp .Lxts_dec_epilogue: .byte 0xf3,0xc3 .size aesni_xts_decrypt,.-aesni_xts_decrypt +.globl aesni_ocb_encrypt +.hidden aesni_ocb_encrypt +.type aesni_ocb_encrypt,@function +.align 32 +aesni_ocb_encrypt: + leaq (%rsp),%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + movq 8(%rax),%rbx + movq 8+8(%rax),%rbp + + movl 240(%rcx),%r10d + movq %rcx,%r11 + shll $4,%r10d + movups (%rcx),%xmm9 + movups 16(%rcx,%r10,1),%xmm1 + + movdqu (%r9),%xmm15 + pxor %xmm1,%xmm9 + pxor %xmm1,%xmm15 + + movl $16+32,%eax + leaq 32(%r11,%r10,1),%rcx + movups 16(%r11),%xmm1 + subq %r10,%rax + movq %rax,%r10 + + movdqu (%rbx),%xmm10 + movdqu (%rbp),%xmm8 + + testq $1,%r8 + jnz .Locb_enc_odd + + bsfq %r8,%r12 + addq $1,%r8 + shlq $4,%r12 + movdqu (%rbx,%r12,1),%xmm7 + movdqu (%rdi),%xmm2 + leaq 16(%rdi),%rdi + + call __ocb_encrypt1 + + movdqa %xmm7,%xmm15 + movups %xmm2,(%rsi) + leaq 16(%rsi),%rsi + subq $1,%rdx + jz .Locb_enc_done + +.Locb_enc_odd: + leaq 1(%r8),%r12 + leaq 3(%r8),%r13 + leaq 5(%r8),%r14 + leaq 6(%r8),%r8 + bsfq %r12,%r12 + bsfq %r13,%r13 + bsfq %r14,%r14 + shlq $4,%r12 + shlq $4,%r13 + shlq $4,%r14 + + subq $6,%rdx + jc .Locb_enc_short + jmp .Locb_enc_grandloop + +.align 32 +.Locb_enc_grandloop: + movdqu 0(%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqu 32(%rdi),%xmm4 + movdqu 48(%rdi),%xmm5 + movdqu 64(%rdi),%xmm6 + movdqu 80(%rdi),%xmm7 + leaq 96(%rdi),%rdi + + call __ocb_encrypt6 + + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + movups %xmm6,64(%rsi) + movups %xmm7,80(%rsi) + leaq 96(%rsi),%rsi + subq $6,%rdx + jnc .Locb_enc_grandloop + +.Locb_enc_short: + addq $6,%rdx + jz .Locb_enc_done + + movdqu 0(%rdi),%xmm2 + cmpq $2,%rdx + jb .Locb_enc_one + movdqu 16(%rdi),%xmm3 + je .Locb_enc_two + + movdqu 32(%rdi),%xmm4 + cmpq $4,%rdx + jb .Locb_enc_three + movdqu 48(%rdi),%xmm5 + je .Locb_enc_four + + movdqu 64(%rdi),%xmm6 + pxor %xmm7,%xmm7 + + call __ocb_encrypt6 + + movdqa %xmm14,%xmm15 + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + movups %xmm6,64(%rsi) + + jmp .Locb_enc_done + +.align 16 +.Locb_enc_one: + movdqa %xmm10,%xmm7 + + call __ocb_encrypt1 + + movdqa %xmm7,%xmm15 + movups %xmm2,0(%rsi) + jmp .Locb_enc_done + +.align 16 +.Locb_enc_two: + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + + call __ocb_encrypt4 + + movdqa %xmm11,%xmm15 + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) + + jmp .Locb_enc_done + +.align 16 
+.Locb_enc_three: + pxor %xmm5,%xmm5 + + call __ocb_encrypt4 + + movdqa %xmm12,%xmm15 + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + + jmp .Locb_enc_done + +.align 16 +.Locb_enc_four: + call __ocb_encrypt4 + + movdqa %xmm13,%xmm15 + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + +.Locb_enc_done: + pxor %xmm0,%xmm15 + movdqu %xmm8,(%rbp) + movdqu %xmm15,(%r9) + + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 + pxor %xmm10,%xmm10 + pxor %xmm11,%xmm11 + pxor %xmm12,%xmm12 + pxor %xmm13,%xmm13 + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 + leaq 40(%rsp),%rax + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp +.Locb_enc_epilogue: + .byte 0xf3,0xc3 +.size aesni_ocb_encrypt,.-aesni_ocb_encrypt + +.type __ocb_encrypt6,@function +.align 32 +__ocb_encrypt6: + pxor %xmm9,%xmm15 + movdqu (%rbx,%r12,1),%xmm11 + movdqa %xmm10,%xmm12 + movdqu (%rbx,%r13,1),%xmm13 + movdqa %xmm10,%xmm14 + pxor %xmm15,%xmm10 + movdqu (%rbx,%r14,1),%xmm15 + pxor %xmm10,%xmm11 + pxor %xmm2,%xmm8 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm12 + pxor %xmm3,%xmm8 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm13 + pxor %xmm4,%xmm8 + pxor %xmm12,%xmm4 + pxor %xmm13,%xmm14 + pxor %xmm5,%xmm8 + pxor %xmm13,%xmm5 + pxor %xmm14,%xmm15 + pxor %xmm6,%xmm8 + pxor %xmm14,%xmm6 + pxor %xmm7,%xmm8 + pxor %xmm15,%xmm7 + movups 32(%r11),%xmm0 + + leaq 1(%r8),%r12 + leaq 3(%r8),%r13 + leaq 5(%r8),%r14 + addq $6,%r8 + pxor %xmm9,%xmm10 + bsfq %r12,%r12 + bsfq %r13,%r13 + bsfq %r14,%r14 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + pxor %xmm9,%xmm11 + pxor %xmm9,%xmm12 +.byte 102,15,56,220,241 + pxor %xmm9,%xmm13 + pxor %xmm9,%xmm14 +.byte 102,15,56,220,249 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm15 + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 + movups 64(%r11),%xmm0 + shlq $4,%r12 + shlq $4,%r13 + jmp .Locb_enc_loop6 + +.align 32 +.Locb_enc_loop6: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Locb_enc_loop6 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + movups 16(%r11),%xmm1 + shlq $4,%r14 + +.byte 102,65,15,56,221,210 + movdqu (%rbx),%xmm10 + movq %r10,%rax +.byte 102,65,15,56,221,219 +.byte 102,65,15,56,221,228 +.byte 102,65,15,56,221,237 +.byte 102,65,15,56,221,246 +.byte 102,65,15,56,221,255 + .byte 0xf3,0xc3 +.size __ocb_encrypt6,.-__ocb_encrypt6 + +.type __ocb_encrypt4,@function +.align 32 +__ocb_encrypt4: + pxor %xmm9,%xmm15 + movdqu (%rbx,%r12,1),%xmm11 + movdqa %xmm10,%xmm12 + movdqu (%rbx,%r13,1),%xmm13 + pxor %xmm15,%xmm10 + pxor %xmm10,%xmm11 + pxor %xmm2,%xmm8 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm12 + pxor %xmm3,%xmm8 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm13 + pxor %xmm4,%xmm8 + pxor %xmm12,%xmm4 + pxor %xmm5,%xmm8 + pxor %xmm13,%xmm5 + movups 
32(%r11),%xmm0 + + pxor %xmm9,%xmm10 + pxor %xmm9,%xmm11 + pxor %xmm9,%xmm12 + pxor %xmm9,%xmm13 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups 48(%r11),%xmm1 + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movups 64(%r11),%xmm0 + jmp .Locb_enc_loop4 + +.align 32 +.Locb_enc_loop4: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Locb_enc_loop4 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups 16(%r11),%xmm1 + movq %r10,%rax + +.byte 102,65,15,56,221,210 +.byte 102,65,15,56,221,219 +.byte 102,65,15,56,221,228 +.byte 102,65,15,56,221,237 + .byte 0xf3,0xc3 +.size __ocb_encrypt4,.-__ocb_encrypt4 + +.type __ocb_encrypt1,@function +.align 32 +__ocb_encrypt1: + pxor %xmm15,%xmm7 + pxor %xmm9,%xmm7 + pxor %xmm2,%xmm8 + pxor %xmm7,%xmm2 + movups 32(%r11),%xmm0 + +.byte 102,15,56,220,209 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm7 + +.byte 102,15,56,220,208 + movups 64(%r11),%xmm0 + jmp .Locb_enc_loop1 + +.align 32 +.Locb_enc_loop1: +.byte 102,15,56,220,209 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,220,208 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Locb_enc_loop1 + +.byte 102,15,56,220,209 + movups 16(%r11),%xmm1 + movq %r10,%rax + +.byte 102,15,56,221,215 + .byte 0xf3,0xc3 +.size __ocb_encrypt1,.-__ocb_encrypt1 + +.globl aesni_ocb_decrypt +.hidden aesni_ocb_decrypt +.type aesni_ocb_decrypt,@function +.align 32 +aesni_ocb_decrypt: + leaq (%rsp),%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + movq 8(%rax),%rbx + movq 8+8(%rax),%rbp + + movl 240(%rcx),%r10d + movq %rcx,%r11 + shll $4,%r10d + movups (%rcx),%xmm9 + movups 16(%rcx,%r10,1),%xmm1 + + movdqu (%r9),%xmm15 + pxor %xmm1,%xmm9 + pxor %xmm1,%xmm15 + + movl $16+32,%eax + leaq 32(%r11,%r10,1),%rcx + movups 16(%r11),%xmm1 + subq %r10,%rax + movq %rax,%r10 + + movdqu (%rbx),%xmm10 + movdqu (%rbp),%xmm8 + + testq $1,%r8 + jnz .Locb_dec_odd + + bsfq %r8,%r12 + addq $1,%r8 + shlq $4,%r12 + movdqu (%rbx,%r12,1),%xmm7 + movdqu (%rdi),%xmm2 + leaq 16(%rdi),%rdi + + call __ocb_decrypt1 + + movdqa %xmm7,%xmm15 + movups %xmm2,(%rsi) + xorps %xmm2,%xmm8 + leaq 16(%rsi),%rsi + subq $1,%rdx + jz .Locb_dec_done + +.Locb_dec_odd: + leaq 1(%r8),%r12 + leaq 3(%r8),%r13 + leaq 5(%r8),%r14 + leaq 6(%r8),%r8 + bsfq %r12,%r12 + bsfq %r13,%r13 + bsfq %r14,%r14 + shlq $4,%r12 + shlq $4,%r13 + shlq $4,%r14 + + subq $6,%rdx + jc .Locb_dec_short + jmp .Locb_dec_grandloop + +.align 32 +.Locb_dec_grandloop: + movdqu 0(%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqu 32(%rdi),%xmm4 + movdqu 48(%rdi),%xmm5 + movdqu 64(%rdi),%xmm6 + movdqu 80(%rdi),%xmm7 + leaq 96(%rdi),%rdi + + call __ocb_decrypt6 + + movups %xmm2,0(%rsi) + pxor %xmm2,%xmm8 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm8 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm8 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm8 + movups %xmm6,64(%rsi) + pxor %xmm6,%xmm8 + movups %xmm7,80(%rsi) + pxor %xmm7,%xmm8 + leaq 96(%rsi),%rsi + subq $6,%rdx + jnc .Locb_dec_grandloop + +.Locb_dec_short: + addq $6,%rdx + jz .Locb_dec_done + + movdqu 0(%rdi),%xmm2 + cmpq $2,%rdx + jb .Locb_dec_one + movdqu 16(%rdi),%xmm3 + je .Locb_dec_two + + movdqu 32(%rdi),%xmm4 + cmpq $4,%rdx + jb .Locb_dec_three + 
movdqu 48(%rdi),%xmm5 + je .Locb_dec_four + + movdqu 64(%rdi),%xmm6 + pxor %xmm7,%xmm7 + + call __ocb_decrypt6 + + movdqa %xmm14,%xmm15 + movups %xmm2,0(%rsi) + pxor %xmm2,%xmm8 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm8 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm8 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm8 + movups %xmm6,64(%rsi) + pxor %xmm6,%xmm8 + + jmp .Locb_dec_done + +.align 16 +.Locb_dec_one: + movdqa %xmm10,%xmm7 + + call __ocb_decrypt1 + + movdqa %xmm7,%xmm15 + movups %xmm2,0(%rsi) + xorps %xmm2,%xmm8 + jmp .Locb_dec_done + +.align 16 +.Locb_dec_two: + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + + call __ocb_decrypt4 + + movdqa %xmm11,%xmm15 + movups %xmm2,0(%rsi) + xorps %xmm2,%xmm8 + movups %xmm3,16(%rsi) + xorps %xmm3,%xmm8 + + jmp .Locb_dec_done + +.align 16 +.Locb_dec_three: + pxor %xmm5,%xmm5 + + call __ocb_decrypt4 + + movdqa %xmm12,%xmm15 + movups %xmm2,0(%rsi) + xorps %xmm2,%xmm8 + movups %xmm3,16(%rsi) + xorps %xmm3,%xmm8 + movups %xmm4,32(%rsi) + xorps %xmm4,%xmm8 + + jmp .Locb_dec_done + +.align 16 +.Locb_dec_four: + call __ocb_decrypt4 + + movdqa %xmm13,%xmm15 + movups %xmm2,0(%rsi) + pxor %xmm2,%xmm8 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm8 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm8 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm8 + +.Locb_dec_done: + pxor %xmm0,%xmm15 + movdqu %xmm8,(%rbp) + movdqu %xmm15,(%r9) + + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 + pxor %xmm10,%xmm10 + pxor %xmm11,%xmm11 + pxor %xmm12,%xmm12 + pxor %xmm13,%xmm13 + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 + leaq 40(%rsp),%rax + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp +.Locb_dec_epilogue: + .byte 0xf3,0xc3 +.size aesni_ocb_decrypt,.-aesni_ocb_decrypt + +.type __ocb_decrypt6,@function +.align 32 +__ocb_decrypt6: + pxor %xmm9,%xmm15 + movdqu (%rbx,%r12,1),%xmm11 + movdqa %xmm10,%xmm12 + movdqu (%rbx,%r13,1),%xmm13 + movdqa %xmm10,%xmm14 + pxor %xmm15,%xmm10 + movdqu (%rbx,%r14,1),%xmm15 + pxor %xmm10,%xmm11 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm12 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm13 + pxor %xmm12,%xmm4 + pxor %xmm13,%xmm14 + pxor %xmm13,%xmm5 + pxor %xmm14,%xmm15 + pxor %xmm14,%xmm6 + pxor %xmm15,%xmm7 + movups 32(%r11),%xmm0 + + leaq 1(%r8),%r12 + leaq 3(%r8),%r13 + leaq 5(%r8),%r14 + addq $6,%r8 + pxor %xmm9,%xmm10 + bsfq %r12,%r12 + bsfq %r13,%r13 + bsfq %r14,%r14 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + pxor %xmm9,%xmm11 + pxor %xmm9,%xmm12 +.byte 102,15,56,222,241 + pxor %xmm9,%xmm13 + pxor %xmm9,%xmm14 +.byte 102,15,56,222,249 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm15 + +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 + movups 64(%r11),%xmm0 + shlq $4,%r12 + shlq $4,%r13 + jmp .Locb_dec_loop6 + +.align 32 +.Locb_dec_loop6: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Locb_dec_loop6 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 
102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + movups 16(%r11),%xmm1 + shlq $4,%r14 + +.byte 102,65,15,56,223,210 + movdqu (%rbx),%xmm10 + movq %r10,%rax +.byte 102,65,15,56,223,219 +.byte 102,65,15,56,223,228 +.byte 102,65,15,56,223,237 +.byte 102,65,15,56,223,246 +.byte 102,65,15,56,223,255 + .byte 0xf3,0xc3 +.size __ocb_decrypt6,.-__ocb_decrypt6 + +.type __ocb_decrypt4,@function +.align 32 +__ocb_decrypt4: + pxor %xmm9,%xmm15 + movdqu (%rbx,%r12,1),%xmm11 + movdqa %xmm10,%xmm12 + movdqu (%rbx,%r13,1),%xmm13 + pxor %xmm15,%xmm10 + pxor %xmm10,%xmm11 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm12 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm13 + pxor %xmm12,%xmm4 + pxor %xmm13,%xmm5 + movups 32(%r11),%xmm0 + + pxor %xmm9,%xmm10 + pxor %xmm9,%xmm11 + pxor %xmm9,%xmm12 + pxor %xmm9,%xmm13 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + movups 48(%r11),%xmm1 + +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 + movups 64(%r11),%xmm0 + jmp .Locb_dec_loop4 + +.align 32 +.Locb_dec_loop4: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Locb_dec_loop4 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + movups 16(%r11),%xmm1 + movq %r10,%rax + +.byte 102,65,15,56,223,210 +.byte 102,65,15,56,223,219 +.byte 102,65,15,56,223,228 +.byte 102,65,15,56,223,237 + .byte 0xf3,0xc3 +.size __ocb_decrypt4,.-__ocb_decrypt4 + +.type __ocb_decrypt1,@function +.align 32 +__ocb_decrypt1: + pxor %xmm15,%xmm7 + pxor %xmm9,%xmm7 + pxor %xmm7,%xmm2 + movups 32(%r11),%xmm0 + +.byte 102,15,56,222,209 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm7 + +.byte 102,15,56,222,208 + movups 64(%r11),%xmm0 + jmp .Locb_dec_loop1 + +.align 32 +.Locb_dec_loop1: +.byte 102,15,56,222,209 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,222,208 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Locb_dec_loop1 + +.byte 102,15,56,222,209 + movups 16(%r11),%xmm1 + movq %r10,%rax + +.byte 102,15,56,223,215 + .byte 0xf3,0xc3 +.size __ocb_decrypt1,.-__ocb_decrypt1 .globl aesni_cbc_encrypt .hidden aesni_cbc_encrypt .type aesni_cbc_encrypt,@function @@ -2638,11 +3452,11 @@ aesni_cbc_encrypt: jmp .Lcbc_ret .align 16 .Lcbc_decrypt_bulk: - leaq (%rsp),%rax + leaq (%rsp),%r11 pushq %rbp subq $16,%rsp andq $-16,%rsp - leaq -8(%rax),%rbp + movq %rcx,%rbp movups (%r8),%xmm10 movl %r10d,%eax cmpq $0x50,%rdx @@ -2660,7 +3474,8 @@ aesni_cbc_encrypt: movdqa %xmm5,%xmm14 movdqu 80(%rdi),%xmm7 movdqa %xmm6,%xmm15 - movl OPENSSL_ia32cap_P+4(%rip),%r9d + leaq OPENSSL_ia32cap_P(%rip),%r9 + movl 4(%r9),%r9d cmpq $0x70,%rdx jbe .Lcbc_dec_six_or_seven @@ -2682,7 +3497,7 @@ aesni_cbc_encrypt: pxor %xmm0,%xmm3 movups 16-112(%rcx),%xmm1 pxor %xmm0,%xmm4 - xorq %r11,%r11 + movq $-1,%rbp cmpq $0x70,%rdx pxor %xmm0,%xmm5 pxor %xmm0,%xmm6 @@ -2698,10 +3513,10 @@ aesni_cbc_encrypt: .byte 102,15,56,222,241 .byte 102,15,56,222,249 .byte 102,68,15,56,222,193 - setnc %r11b - shlq $7,%r11 + adcq $0,%rbp + andq $128,%rbp .byte 102,68,15,56,222,201 - addq %rdi,%r11 + addq %rdi,%rbp movups 48-112(%rcx),%xmm1 .byte 102,15,56,222,208 .byte 102,15,56,222,216 @@ -2839,18 +3654,18 @@ aesni_cbc_encrypt: movdqu 112(%rdi),%xmm0 .byte 102,65,15,56,223,228 leaq 128(%rdi),%rdi - movdqu 
0(%r11),%xmm11 + movdqu 0(%rbp),%xmm11 .byte 102,65,15,56,223,237 .byte 102,65,15,56,223,246 - movdqu 16(%r11),%xmm12 - movdqu 32(%r11),%xmm13 + movdqu 16(%rbp),%xmm12 + movdqu 32(%rbp),%xmm13 .byte 102,65,15,56,223,255 .byte 102,68,15,56,223,193 - movdqu 48(%r11),%xmm14 - movdqu 64(%r11),%xmm15 + movdqu 48(%rbp),%xmm14 + movdqu 64(%rbp),%xmm15 .byte 102,69,15,56,223,202 movdqa %xmm0,%xmm10 - movdqu 80(%r11),%xmm1 + movdqu 80(%rbp),%xmm1 movups -112(%rcx),%xmm0 movups %xmm2,(%rsi) @@ -2969,7 +3784,7 @@ aesni_cbc_encrypt: pxor %xmm13,%xmm5 movdqu %xmm4,32(%rsi) pxor %xmm14,%xmm6 - movq %r11,%rcx + movq %rbp,%rcx movdqu %xmm5,48(%rsi) pxor %xmm15,%xmm7 movl %r10d,%eax @@ -3122,8 +3937,8 @@ aesni_cbc_encrypt: .Lcbc_dec_ret: xorps %xmm0,%xmm0 pxor %xmm1,%xmm1 - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp + leaq (%r11),%rsp .Lcbc_ret: .byte 0xf3,0xc3 .size aesni_cbc_encrypt,.-aesni_cbc_encrypt @@ -3181,10 +3996,11 @@ __aesni_set_encrypt_key: testq %rdx,%rdx jz .Lenc_key_ret - movl $268437504,%r10d movups (%rdi),%xmm0 xorps %xmm4,%xmm4 - andl OPENSSL_ia32cap_P+4(%rip),%r10d + leaq OPENSSL_ia32cap_P(%rip),%r10 + movl 4(%r10),%r10d + andl $268437504,%r10d leaq 16(%rdx),%rax cmpl $256,%esi je .L14rounds diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/aes/bsaes-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S similarity index 98% rename from packager/third_party/boringssl/linux-x86_64/crypto/aes/bsaes-x86_64.S rename to packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S index c5491ce4d0..04b161c995 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/aes/bsaes-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/bsaes-x86_64.S @@ -1,4 +1,4 @@ -#if defined(__x86_64__) +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) .text .extern asm_AES_encrypt @@ -1305,15 +1305,14 @@ bsaes_cbc_encrypt: cmpq %rax,%rbp ja .Lcbc_dec_bzero - leaq (%rbp),%rsp - movq 72(%rsp),%r15 - movq 80(%rsp),%r14 - movq 88(%rsp),%r13 - movq 96(%rsp),%r12 - movq 104(%rsp),%rbx - movq 112(%rsp),%rax - leaq 120(%rsp),%rsp - movq %rax,%rbp + leaq 120(%rbp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbx + movq -8(%rax),%rbp + leaq (%rax),%rsp .Lcbc_dec_epilogue: .byte 0xf3,0xc3 .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt @@ -1506,15 +1505,14 @@ bsaes_ctr32_encrypt_blocks: cmpq %rax,%rbp ja .Lctr_enc_bzero - leaq (%rbp),%rsp - movq 72(%rsp),%r15 - movq 80(%rsp),%r14 - movq 88(%rsp),%r13 - movq 96(%rsp),%r12 - movq 104(%rsp),%rbx - movq 112(%rsp),%rax - leaq 120(%rsp),%rsp - movq %rax,%rbp + leaq 120(%rbp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbx + movq -8(%rax),%rbp + leaq (%rax),%rsp .Lctr_enc_epilogue: .byte 0xf3,0xc3 .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks @@ -1958,15 +1956,14 @@ bsaes_xts_encrypt: cmpq %rax,%rbp ja .Lxts_enc_bzero - leaq (%rbp),%rsp - movq 72(%rsp),%r15 - movq 80(%rsp),%r14 - movq 88(%rsp),%r13 - movq 96(%rsp),%r12 - movq 104(%rsp),%rbx - movq 112(%rsp),%rax - leaq 120(%rsp),%rsp - movq %rax,%rbp + leaq 120(%rbp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbx + movq -8(%rax),%rbp + leaq (%rax),%rsp .Lxts_enc_epilogue: .byte 0xf3,0xc3 .size bsaes_xts_encrypt,.-bsaes_xts_encrypt @@ -2437,15 +2434,14 @@ bsaes_xts_decrypt: cmpq %rax,%rbp ja .Lxts_dec_bzero - leaq 
(%rbp),%rsp - movq 72(%rsp),%r15 - movq 80(%rsp),%r14 - movq 88(%rsp),%r13 - movq 96(%rsp),%r12 - movq 104(%rsp),%rbx - movq 112(%rsp),%rax - leaq 120(%rsp),%rsp - movq %rax,%rbp + leaq 120(%rbp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbx + movq -8(%rax),%rbp + leaq (%rax),%rsp .Lxts_dec_epilogue: .byte 0xf3,0xc3 .size bsaes_xts_decrypt,.-bsaes_xts_decrypt diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/modes/ghash-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/ghash-x86_64.S similarity index 67% rename from packager/third_party/boringssl/linux-x86_64/crypto/modes/ghash-x86_64.S rename to packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/ghash-x86_64.S index b47bdc9bd9..64ef2c2db2 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/modes/ghash-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/ghash-x86_64.S @@ -1,4 +1,4 @@ -#if defined(__x86_64__) +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) .text .extern OPENSSL_ia32cap_P .hidden OPENSSL_ia32cap_P @@ -11,6 +11,10 @@ gcm_gmult_4bit: pushq %rbx pushq %rbp pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $280,%rsp .Lgmult_prologue: movzbq 15(%rdi),%r8 @@ -87,8 +91,9 @@ gcm_gmult_4bit: movq %r8,8(%rdi) movq %r9,(%rdi) - movq 16(%rsp),%rbx - leaq 24(%rsp),%rsp + leaq 280+48(%rsp),%rsi + movq -8(%rsi),%rbx + leaq (%rsi),%rsp .Lgmult_epilogue: .byte 0xf3,0xc3 .size gcm_gmult_4bit,.-gcm_gmult_4bit @@ -648,14 +653,14 @@ gcm_ghash_4bit: movq %r8,8(%rdi) movq %r9,(%rdi) - leaq 280(%rsp),%rsi - movq 0(%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + leaq 280+48(%rsp),%rsi + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq 0(%rsi),%rsp .Lghash_epilogue: .byte 0xf3,0xc3 .size gcm_ghash_4bit,.-gcm_ghash_4bit @@ -885,7 +890,8 @@ gcm_ghash_clmul: jz .Lodd_tail movdqu 16(%rsi),%xmm6 - movl OPENSSL_ia32cap_P+4(%rip),%eax + leaq OPENSSL_ia32cap_P(%rip),%rax + movl 4(%rax),%eax cmpq $0x30,%rcx jb .Lskip4x @@ -1257,7 +1263,108 @@ gcm_ghash_clmul: .type gcm_init_avx,@function .align 32 gcm_init_avx: - jmp .L_init_clmul + vzeroupper + + vmovdqu (%rsi),%xmm2 + vpshufd $78,%xmm2,%xmm2 + + + vpshufd $255,%xmm2,%xmm4 + vpsrlq $63,%xmm2,%xmm3 + vpsllq $1,%xmm2,%xmm2 + vpxor %xmm5,%xmm5,%xmm5 + vpcmpgtd %xmm4,%xmm5,%xmm5 + vpslldq $8,%xmm3,%xmm3 + vpor %xmm3,%xmm2,%xmm2 + + + vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5 + vpxor %xmm5,%xmm2,%xmm2 + + vpunpckhqdq %xmm2,%xmm2,%xmm6 + vmovdqa %xmm2,%xmm0 + vpxor %xmm2,%xmm6,%xmm6 + movq $4,%r10 + jmp .Linit_start_avx +.align 32 +.Linit_loop_avx: + vpalignr $8,%xmm3,%xmm4,%xmm5 + vmovdqu %xmm5,-16(%rdi) + vpunpckhqdq %xmm0,%xmm0,%xmm3 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 + vpxor %xmm0,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + + vpslldq $8,%xmm3,%xmm4 + vpsrldq $8,%xmm3,%xmm3 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm3,%xmm1,%xmm1 + vpsllq $57,%xmm0,%xmm3 + vpsllq $62,%xmm0,%xmm4 + vpxor %xmm3,%xmm4,%xmm4 + vpsllq $63,%xmm0,%xmm3 + vpxor %xmm3,%xmm4,%xmm4 + vpslldq $8,%xmm4,%xmm3 + vpsrldq $8,%xmm4,%xmm4 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm4,%xmm1,%xmm1 + + vpsrlq $1,%xmm0,%xmm4 + vpxor %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $5,%xmm4,%xmm4 + vpxor %xmm4,%xmm0,%xmm0 + 
vpsrlq $1,%xmm0,%xmm0 + vpxor %xmm1,%xmm0,%xmm0 +.Linit_start_avx: + vmovdqa %xmm0,%xmm5 + vpunpckhqdq %xmm0,%xmm0,%xmm3 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 + vpxor %xmm0,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + + vpslldq $8,%xmm3,%xmm4 + vpsrldq $8,%xmm3,%xmm3 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm3,%xmm1,%xmm1 + vpsllq $57,%xmm0,%xmm3 + vpsllq $62,%xmm0,%xmm4 + vpxor %xmm3,%xmm4,%xmm4 + vpsllq $63,%xmm0,%xmm3 + vpxor %xmm3,%xmm4,%xmm4 + vpslldq $8,%xmm4,%xmm3 + vpsrldq $8,%xmm4,%xmm4 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm4,%xmm1,%xmm1 + + vpsrlq $1,%xmm0,%xmm4 + vpxor %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $5,%xmm4,%xmm4 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $1,%xmm0,%xmm0 + vpxor %xmm1,%xmm0,%xmm0 + vpshufd $78,%xmm5,%xmm3 + vpshufd $78,%xmm0,%xmm4 + vpxor %xmm5,%xmm3,%xmm3 + vmovdqu %xmm5,0(%rdi) + vpxor %xmm0,%xmm4,%xmm4 + vmovdqu %xmm0,16(%rdi) + leaq 48(%rdi),%rdi + subq $1,%r10 + jnz .Linit_loop_avx + + vpalignr $8,%xmm4,%xmm3,%xmm5 + vmovdqu %xmm5,-16(%rdi) + + vzeroupper + .byte 0xf3,0xc3 .size gcm_init_avx,.-gcm_init_avx .globl gcm_gmult_avx .hidden gcm_gmult_avx @@ -1271,7 +1378,377 @@ gcm_gmult_avx: .type gcm_ghash_avx,@function .align 32 gcm_ghash_avx: - jmp .L_ghash_clmul + vzeroupper + + vmovdqu (%rdi),%xmm10 + leaq .L0x1c2_polynomial(%rip),%r10 + leaq 64(%rsi),%rsi + vmovdqu .Lbswap_mask(%rip),%xmm13 + vpshufb %xmm13,%xmm10,%xmm10 + cmpq $0x80,%rcx + jb .Lshort_avx + subq $0x80,%rcx + + vmovdqu 112(%rdx),%xmm14 + vmovdqu 0-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm14 + vmovdqu 32-64(%rsi),%xmm7 + + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vmovdqu 96(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm14,%xmm9,%xmm9 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 16-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vmovdqu 80(%rdx),%xmm14 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 48-64(%rsi),%xmm6 + vpxor %xmm14,%xmm9,%xmm9 + vmovdqu 64(%rdx),%xmm15 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 80-64(%rsi),%xmm7 + + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm1,%xmm4,%xmm4 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vmovdqu 48(%rdx),%xmm14 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm4,%xmm1,%xmm1 + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 96-64(%rsi),%xmm6 + vpxor %xmm5,%xmm2,%xmm2 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 128-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu 32(%rdx),%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm1,%xmm4,%xmm4 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 112-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vmovdqu 16(%rdx),%xmm14 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm4,%xmm1,%xmm1 + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 144-64(%rsi),%xmm6 + vpxor %xmm5,%xmm2,%xmm2 + 
vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 176-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu (%rdx),%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm1,%xmm4,%xmm4 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 160-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 + + leaq 128(%rdx),%rdx + cmpq $0x80,%rcx + jb .Ltail_avx + + vpxor %xmm10,%xmm15,%xmm15 + subq $0x80,%rcx + jmp .Loop8x_avx + +.align 32 +.Loop8x_avx: + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vmovdqu 112(%rdx),%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpxor %xmm15,%xmm8,%xmm8 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11 + vmovdqu 0-64(%rsi),%xmm6 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12 + vmovdqu 32-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu 96(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm3,%xmm10,%xmm10 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vxorps %xmm4,%xmm11,%xmm11 + vmovdqu 16-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm5,%xmm12,%xmm12 + vxorps %xmm15,%xmm8,%xmm8 + + vmovdqu 80(%rdx),%xmm14 + vpxor %xmm10,%xmm12,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm11,%xmm12,%xmm12 + vpslldq $8,%xmm12,%xmm9 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vpsrldq $8,%xmm12,%xmm12 + vpxor %xmm9,%xmm10,%xmm10 + vmovdqu 48-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm14 + vxorps %xmm12,%xmm11,%xmm11 + vpxor %xmm1,%xmm4,%xmm4 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 80-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 64(%rdx),%xmm15 + vpalignr $8,%xmm10,%xmm10,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vxorps %xmm15,%xmm8,%xmm8 + vpxor %xmm5,%xmm2,%xmm2 + + vmovdqu 48(%rdx),%xmm14 + vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 96-64(%rsi),%xmm6 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 128-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 32(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 112-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + vpxor %xmm5,%xmm2,%xmm2 + vxorps %xmm12,%xmm10,%xmm10 + + vmovdqu 16(%rdx),%xmm14 + vpalignr $8,%xmm10,%xmm10,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 144-64(%rsi),%xmm6 + vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 + vxorps %xmm11,%xmm12,%xmm12 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 176-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu (%rdx),%xmm15 + vpclmulqdq 
$0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 160-64(%rsi),%xmm6 + vpxor %xmm12,%xmm15,%xmm15 + vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 + vpxor %xmm10,%xmm15,%xmm15 + + leaq 128(%rdx),%rdx + subq $0x80,%rcx + jnc .Loop8x_avx + + addq $0x80,%rcx + jmp .Ltail_no_xor_avx + +.align 32 +.Lshort_avx: + vmovdqu -16(%rdx,%rcx,1),%xmm14 + leaq (%rdx,%rcx,1),%rdx + vmovdqu 0-64(%rsi),%xmm6 + vmovdqu 32-64(%rsi),%xmm7 + vpshufb %xmm13,%xmm14,%xmm15 + + vmovdqa %xmm0,%xmm3 + vmovdqa %xmm1,%xmm4 + vmovdqa %xmm2,%xmm5 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -32(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 16-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -48(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 48-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovdqu 80-64(%rsi),%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -64(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -80(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 96-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovdqu 128-64(%rsi),%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -96(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 112-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -112(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 144-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovq 184-64(%rsi),%xmm7 + subq $0x10,%rcx + jmp .Ltail_avx + +.align 32 +.Ltail_avx: + vpxor %xmm10,%xmm15,%xmm15 +.Ltail_no_xor_avx: + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + + vmovdqu (%r10),%xmm12 + + vpxor %xmm0,%xmm3,%xmm10 + vpxor %xmm1,%xmm4,%xmm11 + vpxor %xmm2,%xmm5,%xmm5 + + vpxor %xmm10,%xmm5,%xmm5 + vpxor %xmm11,%xmm5,%xmm5 + vpslldq $8,%xmm5,%xmm9 + vpsrldq $8,%xmm5,%xmm5 + vpxor 
%xmm9,%xmm10,%xmm10 + vpxor %xmm5,%xmm11,%xmm11 + + vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 + vpalignr $8,%xmm10,%xmm10,%xmm10 + vpxor %xmm9,%xmm10,%xmm10 + + vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 + vpalignr $8,%xmm10,%xmm10,%xmm10 + vpxor %xmm11,%xmm10,%xmm10 + vpxor %xmm9,%xmm10,%xmm10 + + cmpq $0,%rcx + jne .Lshort_avx + + vpshufb %xmm13,%xmm10,%xmm10 + vmovdqu %xmm10,(%rdi) + vzeroupper + .byte 0xf3,0xc3 .size gcm_ghash_avx,.-gcm_ghash_avx .align 64 .Lbswap_mask: diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/md5/md5-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/md5-x86_64.S similarity index 99% rename from packager/third_party/boringssl/linux-x86_64/crypto/md5/md5-x86_64.S rename to packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/md5-x86_64.S index 05369e2a77..8af65047aa 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/md5/md5-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/md5-x86_64.S @@ -1,4 +1,4 @@ -#if defined(__x86_64__) +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) .text .align 16 diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/ec/p256-x86_64-asm.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S similarity index 89% rename from packager/third_party/boringssl/linux-x86_64/crypto/ec/p256-x86_64-asm.S rename to packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S index 4abce6f91e..6d21888f04 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/ec/p256-x86_64-asm.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S @@ -1,4 +1,4 @@ -#if defined(__x86_64__) +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) .text .extern OPENSSL_ia32cap_P .hidden OPENSSL_ia32cap_P @@ -17,47 +17,6 @@ .LONE_mont: .quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe -.type ecp_nistz256_mul_by_2,@function -.align 64 -ecp_nistz256_mul_by_2: - pushq %r12 - pushq %r13 - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - addq %r8,%r8 - movq 16(%rsi),%r10 - adcq %r9,%r9 - movq 24(%rsi),%r11 - leaq .Lpoly(%rip),%rsi - movq %r8,%rax - adcq %r10,%r10 - adcq %r11,%r11 - movq %r9,%rdx - sbbq %r13,%r13 - - subq 0(%rsi),%r8 - movq %r10,%rcx - sbbq 8(%rsi),%r9 - sbbq 16(%rsi),%r10 - movq %r11,%r12 - sbbq 24(%rsi),%r11 - testq %r13,%r13 - - cmovzq %rax,%r8 - cmovzq %rdx,%r9 - movq %r8,0(%rdi) - cmovzq %rcx,%r10 - movq %r9,8(%rdi) - cmovzq %r12,%r11 - movq %r10,16(%rdi) - movq %r11,24(%rdi) - - popq %r13 - popq %r12 - .byte 0xf3,0xc3 -.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 - .globl ecp_nistz256_neg @@ -553,106 +512,15 @@ __ecp_nistz256_sqr_montq: .size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq - - - - -.globl ecp_nistz256_from_mont -.hidden ecp_nistz256_from_mont -.type ecp_nistz256_from_mont,@function -.align 32 -ecp_nistz256_from_mont: - pushq %r12 - pushq %r13 - - movq 0(%rsi),%rax - movq .Lpoly+24(%rip),%r13 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq %rax,%r8 - movq .Lpoly+8(%rip),%r12 - - - - movq %rax,%rcx - shlq $32,%r8 - mulq %r13 - shrq $32,%rcx - addq %r8,%r9 - adcq %rcx,%r10 - adcq %rax,%r11 - movq %r9,%rax - adcq $0,%rdx - - - - movq %r9,%rcx - shlq $32,%r9 - movq %rdx,%r8 - mulq %r13 - shrq $32,%rcx - addq %r9,%r10 - adcq %rcx,%r11 - adcq %rax,%r8 - movq %r10,%rax - adcq $0,%rdx - - - - movq %r10,%rcx - shlq $32,%r10 - movq %rdx,%r9 - mulq %r13 - shrq $32,%rcx - addq %r10,%r11 - adcq %rcx,%r8 - adcq %rax,%r9 - 
movq %r11,%rax - adcq $0,%rdx - - - - movq %r11,%rcx - shlq $32,%r11 - movq %rdx,%r10 - mulq %r13 - shrq $32,%rcx - addq %r11,%r8 - adcq %rcx,%r9 - movq %r8,%rcx - adcq %rax,%r10 - movq %r9,%rsi - adcq $0,%rdx - - subq $-1,%r8 - movq %r10,%rax - sbbq %r12,%r9 - sbbq $0,%r10 - movq %rdx,%r11 - sbbq %r13,%rdx - sbbq %r13,%r13 - - cmovnzq %rcx,%r8 - cmovnzq %rsi,%r9 - movq %r8,0(%rdi) - cmovnzq %rax,%r10 - movq %r9,8(%rdi) - cmovzq %rdx,%r11 - movq %r10,16(%rdi) - movq %r11,24(%rdi) - - popq %r13 - popq %r12 - .byte 0xf3,0xc3 -.size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont - - .globl ecp_nistz256_select_w5 .hidden ecp_nistz256_select_w5 .type ecp_nistz256_select_w5,@function .align 32 ecp_nistz256_select_w5: + leaq OPENSSL_ia32cap_P(%rip),%rax + movq 8(%rax),%rax + testl $32,%eax + jnz .Lavx2_select_w5 movdqa .LOne(%rip),%xmm0 movd %edx,%xmm1 @@ -713,6 +581,10 @@ ecp_nistz256_select_w5: .type ecp_nistz256_select_w7,@function .align 32 ecp_nistz256_select_w7: + leaq OPENSSL_ia32cap_P(%rip),%rax + movq 8(%rax),%rax + testl $32,%eax + jnz .Lavx2_select_w7 movdqa .LOne(%rip),%xmm8 movd %edx,%xmm1 @@ -754,24 +626,155 @@ ecp_nistz256_select_w7: movdqu %xmm5,48(%rdi) .byte 0xf3,0xc3 .size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 + + +.type ecp_nistz256_avx2_select_w5,@function +.align 32 +ecp_nistz256_avx2_select_w5: +.Lavx2_select_w5: + vzeroupper + vmovdqa .LTwo(%rip),%ymm0 + + vpxor %ymm2,%ymm2,%ymm2 + vpxor %ymm3,%ymm3,%ymm3 + vpxor %ymm4,%ymm4,%ymm4 + + vmovdqa .LOne(%rip),%ymm5 + vmovdqa .LTwo(%rip),%ymm10 + + vmovd %edx,%xmm1 + vpermd %ymm1,%ymm2,%ymm1 + + movq $8,%rax +.Lselect_loop_avx2_w5: + + vmovdqa 0(%rsi),%ymm6 + vmovdqa 32(%rsi),%ymm7 + vmovdqa 64(%rsi),%ymm8 + + vmovdqa 96(%rsi),%ymm11 + vmovdqa 128(%rsi),%ymm12 + vmovdqa 160(%rsi),%ymm13 + + vpcmpeqd %ymm1,%ymm5,%ymm9 + vpcmpeqd %ymm1,%ymm10,%ymm14 + + vpaddd %ymm0,%ymm5,%ymm5 + vpaddd %ymm0,%ymm10,%ymm10 + leaq 192(%rsi),%rsi + + vpand %ymm9,%ymm6,%ymm6 + vpand %ymm9,%ymm7,%ymm7 + vpand %ymm9,%ymm8,%ymm8 + vpand %ymm14,%ymm11,%ymm11 + vpand %ymm14,%ymm12,%ymm12 + vpand %ymm14,%ymm13,%ymm13 + + vpxor %ymm6,%ymm2,%ymm2 + vpxor %ymm7,%ymm3,%ymm3 + vpxor %ymm8,%ymm4,%ymm4 + vpxor %ymm11,%ymm2,%ymm2 + vpxor %ymm12,%ymm3,%ymm3 + vpxor %ymm13,%ymm4,%ymm4 + + decq %rax + jnz .Lselect_loop_avx2_w5 + + vmovdqu %ymm2,0(%rdi) + vmovdqu %ymm3,32(%rdi) + vmovdqu %ymm4,64(%rdi) + vzeroupper + .byte 0xf3,0xc3 +.size ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5 + + + .globl ecp_nistz256_avx2_select_w7 .hidden ecp_nistz256_avx2_select_w7 .type ecp_nistz256_avx2_select_w7,@function .align 32 ecp_nistz256_avx2_select_w7: -.byte 0x0f,0x0b +.Lavx2_select_w7: + vzeroupper + vmovdqa .LThree(%rip),%ymm0 + + vpxor %ymm2,%ymm2,%ymm2 + vpxor %ymm3,%ymm3,%ymm3 + + vmovdqa .LOne(%rip),%ymm4 + vmovdqa .LTwo(%rip),%ymm8 + vmovdqa .LThree(%rip),%ymm12 + + vmovd %edx,%xmm1 + vpermd %ymm1,%ymm2,%ymm1 + + + movq $21,%rax +.Lselect_loop_avx2_w7: + + vmovdqa 0(%rsi),%ymm5 + vmovdqa 32(%rsi),%ymm6 + + vmovdqa 64(%rsi),%ymm9 + vmovdqa 96(%rsi),%ymm10 + + vmovdqa 128(%rsi),%ymm13 + vmovdqa 160(%rsi),%ymm14 + + vpcmpeqd %ymm1,%ymm4,%ymm7 + vpcmpeqd %ymm1,%ymm8,%ymm11 + vpcmpeqd %ymm1,%ymm12,%ymm15 + + vpaddd %ymm0,%ymm4,%ymm4 + vpaddd %ymm0,%ymm8,%ymm8 + vpaddd %ymm0,%ymm12,%ymm12 + leaq 192(%rsi),%rsi + + vpand %ymm7,%ymm5,%ymm5 + vpand %ymm7,%ymm6,%ymm6 + vpand %ymm11,%ymm9,%ymm9 + vpand %ymm11,%ymm10,%ymm10 + vpand %ymm15,%ymm13,%ymm13 + vpand %ymm15,%ymm14,%ymm14 + + vpxor %ymm5,%ymm2,%ymm2 + vpxor %ymm6,%ymm3,%ymm3 + vpxor %ymm9,%ymm2,%ymm2 
+ vpxor %ymm10,%ymm3,%ymm3 + vpxor %ymm13,%ymm2,%ymm2 + vpxor %ymm14,%ymm3,%ymm3 + + decq %rax + jnz .Lselect_loop_avx2_w7 + + + vmovdqa 0(%rsi),%ymm5 + vmovdqa 32(%rsi),%ymm6 + + vpcmpeqd %ymm1,%ymm4,%ymm7 + + vpand %ymm7,%ymm5,%ymm5 + vpand %ymm7,%ymm6,%ymm6 + + vpxor %ymm5,%ymm2,%ymm2 + vpxor %ymm6,%ymm3,%ymm3 + + vmovdqu %ymm2,0(%rdi) + vmovdqu %ymm3,32(%rdi) + vzeroupper .byte 0xf3,0xc3 .size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7 .type __ecp_nistz256_add_toq,@function .align 32 __ecp_nistz256_add_toq: + xorq %r11,%r11 addq 0(%rbx),%r12 adcq 8(%rbx),%r13 movq %r12,%rax adcq 16(%rbx),%r8 adcq 24(%rbx),%r9 movq %r13,%rbp - sbbq %r11,%r11 + adcq $0,%r11 subq $-1,%r12 movq %r8,%rcx @@ -779,14 +782,14 @@ __ecp_nistz256_add_toq: sbbq $0,%r8 movq %r9,%r10 sbbq %r15,%r9 - testq %r11,%r11 + sbbq $0,%r11 - cmovzq %rax,%r12 - cmovzq %rbp,%r13 + cmovcq %rax,%r12 + cmovcq %rbp,%r13 movq %r12,0(%rdi) - cmovzq %rcx,%r8 + cmovcq %rcx,%r8 movq %r13,8(%rdi) - cmovzq %r10,%r9 + cmovcq %r10,%r9 movq %r8,16(%rdi) movq %r9,24(%rdi) @@ -854,13 +857,14 @@ __ecp_nistz256_subq: .type __ecp_nistz256_mul_by_2q,@function .align 32 __ecp_nistz256_mul_by_2q: + xorq %r11,%r11 addq %r12,%r12 adcq %r13,%r13 movq %r12,%rax adcq %r8,%r8 adcq %r9,%r9 movq %r13,%rbp - sbbq %r11,%r11 + adcq $0,%r11 subq $-1,%r12 movq %r8,%rcx @@ -868,14 +872,14 @@ __ecp_nistz256_mul_by_2q: sbbq $0,%r8 movq %r9,%r10 sbbq %r15,%r9 - testq %r11,%r11 + sbbq $0,%r11 - cmovzq %rax,%r12 - cmovzq %rbp,%r13 + cmovcq %rax,%r12 + cmovcq %rbp,%r13 movq %r12,0(%rdi) - cmovzq %rcx,%r8 + cmovcq %rcx,%r8 movq %r13,8(%rdi) - cmovzq %r10,%r9 + cmovcq %r10,%r9 movq %r8,16(%rdi) movq %r9,24(%rdi) @@ -1107,16 +1111,14 @@ ecp_nistz256_point_add: movq %rdx,%rsi movdqa %xmm0,384(%rsp) movdqa %xmm1,384+16(%rsp) - por %xmm0,%xmm1 movdqa %xmm2,416(%rsp) movdqa %xmm3,416+16(%rsp) - por %xmm2,%xmm3 movdqa %xmm4,448(%rsp) movdqa %xmm5,448+16(%rsp) - por %xmm1,%xmm3 + por %xmm4,%xmm5 movdqu 0(%rsi),%xmm0 - pshufd $0xb1,%xmm3,%xmm5 + pshufd $0xb1,%xmm5,%xmm3 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 por %xmm3,%xmm5 @@ -1128,14 +1130,14 @@ ecp_nistz256_point_add: movdqa %xmm0,480(%rsp) pshufd $0x1e,%xmm5,%xmm4 movdqa %xmm1,480+16(%rsp) - por %xmm0,%xmm1 -.byte 102,72,15,110,199 + movdqu 64(%rsi),%xmm0 + movdqu 80(%rsi),%xmm1 movdqa %xmm2,512(%rsp) movdqa %xmm3,512+16(%rsp) - por %xmm2,%xmm3 por %xmm4,%xmm5 pxor %xmm4,%xmm4 - por %xmm1,%xmm3 + por %xmm0,%xmm1 +.byte 102,72,15,110,199 leaq 64-0(%rsi),%rsi movq %rax,544+0(%rsp) @@ -1146,8 +1148,8 @@ ecp_nistz256_point_add: call __ecp_nistz256_sqr_montq pcmpeqd %xmm4,%xmm5 - pshufd $0xb1,%xmm3,%xmm4 - por %xmm3,%xmm4 + pshufd $0xb1,%xmm1,%xmm4 + por %xmm1,%xmm4 pshufd $0,%xmm5,%xmm5 pshufd $0x1e,%xmm4,%xmm3 por %xmm3,%xmm4 @@ -1330,6 +1332,7 @@ ecp_nistz256_point_add: + xorq %r11,%r11 addq %r12,%r12 leaq 96(%rsp),%rsi adcq %r13,%r13 @@ -1337,7 +1340,7 @@ ecp_nistz256_point_add: adcq %r8,%r8 adcq %r9,%r9 movq %r13,%rbp - sbbq %r11,%r11 + adcq $0,%r11 subq $-1,%r12 movq %r8,%rcx @@ -1345,15 +1348,15 @@ ecp_nistz256_point_add: sbbq $0,%r8 movq %r9,%r10 sbbq %r15,%r9 - testq %r11,%r11 + sbbq $0,%r11 - cmovzq %rax,%r12 + cmovcq %rax,%r12 movq 0(%rsi),%rax - cmovzq %rbp,%r13 + cmovcq %rbp,%r13 movq 8(%rsi),%rbp - cmovzq %rcx,%r8 + cmovcq %rcx,%r8 movq 16(%rsi),%rcx - cmovzq %r10,%r9 + cmovcq %r10,%r9 movq 24(%rsi),%r10 call __ecp_nistz256_subq @@ -1508,16 +1511,14 @@ ecp_nistz256_point_add_affine: movq 64+24(%rsi),%r8 movdqa %xmm0,320(%rsp) movdqa %xmm1,320+16(%rsp) - por %xmm0,%xmm1 movdqa %xmm2,352(%rsp) 
movdqa %xmm3,352+16(%rsp) - por %xmm2,%xmm3 movdqa %xmm4,384(%rsp) movdqa %xmm5,384+16(%rsp) - por %xmm1,%xmm3 + por %xmm4,%xmm5 movdqu 0(%rbx),%xmm0 - pshufd $0xb1,%xmm3,%xmm5 + pshufd $0xb1,%xmm5,%xmm3 movdqu 16(%rbx),%xmm1 movdqu 32(%rbx),%xmm2 por %xmm3,%xmm5 @@ -1635,6 +1636,7 @@ ecp_nistz256_point_add_affine: + xorq %r11,%r11 addq %r12,%r12 leaq 192(%rsp),%rsi adcq %r13,%r13 @@ -1642,7 +1644,7 @@ ecp_nistz256_point_add_affine: adcq %r8,%r8 adcq %r9,%r9 movq %r13,%rbp - sbbq %r11,%r11 + adcq $0,%r11 subq $-1,%r12 movq %r8,%rcx @@ -1650,15 +1652,15 @@ ecp_nistz256_point_add_affine: sbbq $0,%r8 movq %r9,%r10 sbbq %r15,%r9 - testq %r11,%r11 + sbbq $0,%r11 - cmovzq %rax,%r12 + cmovcq %rax,%r12 movq 0(%rsi),%rax - cmovzq %rbp,%r13 + cmovcq %rbp,%r13 movq 8(%rsi),%rbp - cmovzq %rcx,%r8 + cmovcq %rcx,%r8 movq 16(%rsi),%rcx - cmovzq %r10,%r9 + cmovcq %r10,%r9 movq 24(%rsi),%r10 call __ecp_nistz256_subq diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/rand/rdrand-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S similarity index 92% rename from packager/third_party/boringssl/linux-x86_64/crypto/rand/rdrand-x86_64.S rename to packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S index 94aab9c19b..7c1eeb7211 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/rand/rdrand-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S @@ -1,4 +1,4 @@ -#if defined(__x86_64__) +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) .text diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rsaz-avx2.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rsaz-avx2.S new file mode 100644 index 0000000000..bc3440d55c --- /dev/null +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rsaz-avx2.S @@ -0,0 +1,1743 @@ +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +.text + +.globl rsaz_1024_sqr_avx2 +.hidden rsaz_1024_sqr_avx2 +.type rsaz_1024_sqr_avx2,@function +.align 64 +rsaz_1024_sqr_avx2: +.cfi_startproc + leaq (%rsp),%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + vzeroupper + movq %rax,%rbp +.cfi_def_cfa_register %rbp + movq %rdx,%r13 + subq $832,%rsp + movq %r13,%r15 + subq $-128,%rdi + subq $-128,%rsi + subq $-128,%r13 + + andq $4095,%r15 + addq $320,%r15 + shrq $12,%r15 + vpxor %ymm9,%ymm9,%ymm9 + jz .Lsqr_1024_no_n_copy + + + + + + subq $320,%rsp + vmovdqu 0-128(%r13),%ymm0 + andq $-2048,%rsp + vmovdqu 32-128(%r13),%ymm1 + vmovdqu 64-128(%r13),%ymm2 + vmovdqu 96-128(%r13),%ymm3 + vmovdqu 128-128(%r13),%ymm4 + vmovdqu 160-128(%r13),%ymm5 + vmovdqu 192-128(%r13),%ymm6 + vmovdqu 224-128(%r13),%ymm7 + vmovdqu 256-128(%r13),%ymm8 + leaq 832+128(%rsp),%r13 + vmovdqu %ymm0,0-128(%r13) + vmovdqu %ymm1,32-128(%r13) + vmovdqu %ymm2,64-128(%r13) + vmovdqu %ymm3,96-128(%r13) + vmovdqu %ymm4,128-128(%r13) + vmovdqu %ymm5,160-128(%r13) + vmovdqu %ymm6,192-128(%r13) + vmovdqu %ymm7,224-128(%r13) + vmovdqu %ymm8,256-128(%r13) + vmovdqu %ymm9,288-128(%r13) + +.Lsqr_1024_no_n_copy: + andq $-1024,%rsp + + vmovdqu 32-128(%rsi),%ymm1 + vmovdqu 64-128(%rsi),%ymm2 + vmovdqu 96-128(%rsi),%ymm3 + vmovdqu 128-128(%rsi),%ymm4 + vmovdqu 160-128(%rsi),%ymm5 + vmovdqu 192-128(%rsi),%ymm6 + vmovdqu 224-128(%rsi),%ymm7 + vmovdqu 256-128(%rsi),%ymm8 + + leaq 
192(%rsp),%rbx + vpbroadcastq .Land_mask(%rip),%ymm15 + jmp .LOOP_GRANDE_SQR_1024 + +.align 32 +.LOOP_GRANDE_SQR_1024: + leaq 576+128(%rsp),%r9 + leaq 448(%rsp),%r12 + + + + + vpaddq %ymm1,%ymm1,%ymm1 + vpbroadcastq 0-128(%rsi),%ymm10 + vpaddq %ymm2,%ymm2,%ymm2 + vmovdqa %ymm1,0-128(%r9) + vpaddq %ymm3,%ymm3,%ymm3 + vmovdqa %ymm2,32-128(%r9) + vpaddq %ymm4,%ymm4,%ymm4 + vmovdqa %ymm3,64-128(%r9) + vpaddq %ymm5,%ymm5,%ymm5 + vmovdqa %ymm4,96-128(%r9) + vpaddq %ymm6,%ymm6,%ymm6 + vmovdqa %ymm5,128-128(%r9) + vpaddq %ymm7,%ymm7,%ymm7 + vmovdqa %ymm6,160-128(%r9) + vpaddq %ymm8,%ymm8,%ymm8 + vmovdqa %ymm7,192-128(%r9) + vpxor %ymm9,%ymm9,%ymm9 + vmovdqa %ymm8,224-128(%r9) + + vpmuludq 0-128(%rsi),%ymm10,%ymm0 + vpbroadcastq 32-128(%rsi),%ymm11 + vmovdqu %ymm9,288-192(%rbx) + vpmuludq %ymm10,%ymm1,%ymm1 + vmovdqu %ymm9,320-448(%r12) + vpmuludq %ymm10,%ymm2,%ymm2 + vmovdqu %ymm9,352-448(%r12) + vpmuludq %ymm10,%ymm3,%ymm3 + vmovdqu %ymm9,384-448(%r12) + vpmuludq %ymm10,%ymm4,%ymm4 + vmovdqu %ymm9,416-448(%r12) + vpmuludq %ymm10,%ymm5,%ymm5 + vmovdqu %ymm9,448-448(%r12) + vpmuludq %ymm10,%ymm6,%ymm6 + vmovdqu %ymm9,480-448(%r12) + vpmuludq %ymm10,%ymm7,%ymm7 + vmovdqu %ymm9,512-448(%r12) + vpmuludq %ymm10,%ymm8,%ymm8 + vpbroadcastq 64-128(%rsi),%ymm10 + vmovdqu %ymm9,544-448(%r12) + + movq %rsi,%r15 + movl $4,%r14d + jmp .Lsqr_entry_1024 +.align 32 +.LOOP_SQR_1024: + vpbroadcastq 32-128(%r15),%ymm11 + vpmuludq 0-128(%rsi),%ymm10,%ymm0 + vpaddq 0-192(%rbx),%ymm0,%ymm0 + vpmuludq 0-128(%r9),%ymm10,%ymm1 + vpaddq 32-192(%rbx),%ymm1,%ymm1 + vpmuludq 32-128(%r9),%ymm10,%ymm2 + vpaddq 64-192(%rbx),%ymm2,%ymm2 + vpmuludq 64-128(%r9),%ymm10,%ymm3 + vpaddq 96-192(%rbx),%ymm3,%ymm3 + vpmuludq 96-128(%r9),%ymm10,%ymm4 + vpaddq 128-192(%rbx),%ymm4,%ymm4 + vpmuludq 128-128(%r9),%ymm10,%ymm5 + vpaddq 160-192(%rbx),%ymm5,%ymm5 + vpmuludq 160-128(%r9),%ymm10,%ymm6 + vpaddq 192-192(%rbx),%ymm6,%ymm6 + vpmuludq 192-128(%r9),%ymm10,%ymm7 + vpaddq 224-192(%rbx),%ymm7,%ymm7 + vpmuludq 224-128(%r9),%ymm10,%ymm8 + vpbroadcastq 64-128(%r15),%ymm10 + vpaddq 256-192(%rbx),%ymm8,%ymm8 +.Lsqr_entry_1024: + vmovdqu %ymm0,0-192(%rbx) + vmovdqu %ymm1,32-192(%rbx) + + vpmuludq 32-128(%rsi),%ymm11,%ymm12 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq 32-128(%r9),%ymm11,%ymm14 + vpaddq %ymm14,%ymm3,%ymm3 + vpmuludq 64-128(%r9),%ymm11,%ymm13 + vpaddq %ymm13,%ymm4,%ymm4 + vpmuludq 96-128(%r9),%ymm11,%ymm12 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq 128-128(%r9),%ymm11,%ymm14 + vpaddq %ymm14,%ymm6,%ymm6 + vpmuludq 160-128(%r9),%ymm11,%ymm13 + vpaddq %ymm13,%ymm7,%ymm7 + vpmuludq 192-128(%r9),%ymm11,%ymm12 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq 224-128(%r9),%ymm11,%ymm0 + vpbroadcastq 96-128(%r15),%ymm11 + vpaddq 288-192(%rbx),%ymm0,%ymm0 + + vmovdqu %ymm2,64-192(%rbx) + vmovdqu %ymm3,96-192(%rbx) + + vpmuludq 64-128(%rsi),%ymm10,%ymm13 + vpaddq %ymm13,%ymm4,%ymm4 + vpmuludq 64-128(%r9),%ymm10,%ymm12 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq 96-128(%r9),%ymm10,%ymm14 + vpaddq %ymm14,%ymm6,%ymm6 + vpmuludq 128-128(%r9),%ymm10,%ymm13 + vpaddq %ymm13,%ymm7,%ymm7 + vpmuludq 160-128(%r9),%ymm10,%ymm12 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq 192-128(%r9),%ymm10,%ymm14 + vpaddq %ymm14,%ymm0,%ymm0 + vpmuludq 224-128(%r9),%ymm10,%ymm1 + vpbroadcastq 128-128(%r15),%ymm10 + vpaddq 320-448(%r12),%ymm1,%ymm1 + + vmovdqu %ymm4,128-192(%rbx) + vmovdqu %ymm5,160-192(%rbx) + + vpmuludq 96-128(%rsi),%ymm11,%ymm12 + vpaddq %ymm12,%ymm6,%ymm6 + vpmuludq 96-128(%r9),%ymm11,%ymm14 + vpaddq %ymm14,%ymm7,%ymm7 + vpmuludq 128-128(%r9),%ymm11,%ymm13 + vpaddq 
%ymm13,%ymm8,%ymm8 + vpmuludq 160-128(%r9),%ymm11,%ymm12 + vpaddq %ymm12,%ymm0,%ymm0 + vpmuludq 192-128(%r9),%ymm11,%ymm14 + vpaddq %ymm14,%ymm1,%ymm1 + vpmuludq 224-128(%r9),%ymm11,%ymm2 + vpbroadcastq 160-128(%r15),%ymm11 + vpaddq 352-448(%r12),%ymm2,%ymm2 + + vmovdqu %ymm6,192-192(%rbx) + vmovdqu %ymm7,224-192(%rbx) + + vpmuludq 128-128(%rsi),%ymm10,%ymm12 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq 128-128(%r9),%ymm10,%ymm14 + vpaddq %ymm14,%ymm0,%ymm0 + vpmuludq 160-128(%r9),%ymm10,%ymm13 + vpaddq %ymm13,%ymm1,%ymm1 + vpmuludq 192-128(%r9),%ymm10,%ymm12 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq 224-128(%r9),%ymm10,%ymm3 + vpbroadcastq 192-128(%r15),%ymm10 + vpaddq 384-448(%r12),%ymm3,%ymm3 + + vmovdqu %ymm8,256-192(%rbx) + vmovdqu %ymm0,288-192(%rbx) + leaq 8(%rbx),%rbx + + vpmuludq 160-128(%rsi),%ymm11,%ymm13 + vpaddq %ymm13,%ymm1,%ymm1 + vpmuludq 160-128(%r9),%ymm11,%ymm12 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq 192-128(%r9),%ymm11,%ymm14 + vpaddq %ymm14,%ymm3,%ymm3 + vpmuludq 224-128(%r9),%ymm11,%ymm4 + vpbroadcastq 224-128(%r15),%ymm11 + vpaddq 416-448(%r12),%ymm4,%ymm4 + + vmovdqu %ymm1,320-448(%r12) + vmovdqu %ymm2,352-448(%r12) + + vpmuludq 192-128(%rsi),%ymm10,%ymm12 + vpaddq %ymm12,%ymm3,%ymm3 + vpmuludq 192-128(%r9),%ymm10,%ymm14 + vpbroadcastq 256-128(%r15),%ymm0 + vpaddq %ymm14,%ymm4,%ymm4 + vpmuludq 224-128(%r9),%ymm10,%ymm5 + vpbroadcastq 0+8-128(%r15),%ymm10 + vpaddq 448-448(%r12),%ymm5,%ymm5 + + vmovdqu %ymm3,384-448(%r12) + vmovdqu %ymm4,416-448(%r12) + leaq 8(%r15),%r15 + + vpmuludq 224-128(%rsi),%ymm11,%ymm12 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq 224-128(%r9),%ymm11,%ymm6 + vpaddq 480-448(%r12),%ymm6,%ymm6 + + vpmuludq 256-128(%rsi),%ymm0,%ymm7 + vmovdqu %ymm5,448-448(%r12) + vpaddq 512-448(%r12),%ymm7,%ymm7 + vmovdqu %ymm6,480-448(%r12) + vmovdqu %ymm7,512-448(%r12) + leaq 8(%r12),%r12 + + decl %r14d + jnz .LOOP_SQR_1024 + + vmovdqu 256(%rsp),%ymm8 + vmovdqu 288(%rsp),%ymm1 + vmovdqu 320(%rsp),%ymm2 + leaq 192(%rsp),%rbx + + vpsrlq $29,%ymm8,%ymm14 + vpand %ymm15,%ymm8,%ymm8 + vpsrlq $29,%ymm1,%ymm11 + vpand %ymm15,%ymm1,%ymm1 + + vpermq $0x93,%ymm14,%ymm14 + vpxor %ymm9,%ymm9,%ymm9 + vpermq $0x93,%ymm11,%ymm11 + + vpblendd $3,%ymm9,%ymm14,%ymm10 + vpblendd $3,%ymm14,%ymm11,%ymm14 + vpaddq %ymm10,%ymm8,%ymm8 + vpblendd $3,%ymm11,%ymm9,%ymm11 + vpaddq %ymm14,%ymm1,%ymm1 + vpaddq %ymm11,%ymm2,%ymm2 + vmovdqu %ymm1,288-192(%rbx) + vmovdqu %ymm2,320-192(%rbx) + + movq (%rsp),%rax + movq 8(%rsp),%r10 + movq 16(%rsp),%r11 + movq 24(%rsp),%r12 + vmovdqu 32(%rsp),%ymm1 + vmovdqu 64-192(%rbx),%ymm2 + vmovdqu 96-192(%rbx),%ymm3 + vmovdqu 128-192(%rbx),%ymm4 + vmovdqu 160-192(%rbx),%ymm5 + vmovdqu 192-192(%rbx),%ymm6 + vmovdqu 224-192(%rbx),%ymm7 + + movq %rax,%r9 + imull %ecx,%eax + andl $0x1fffffff,%eax + vmovd %eax,%xmm12 + + movq %rax,%rdx + imulq -128(%r13),%rax + vpbroadcastq %xmm12,%ymm12 + addq %rax,%r9 + movq %rdx,%rax + imulq 8-128(%r13),%rax + shrq $29,%r9 + addq %rax,%r10 + movq %rdx,%rax + imulq 16-128(%r13),%rax + addq %r9,%r10 + addq %rax,%r11 + imulq 24-128(%r13),%rdx + addq %rdx,%r12 + + movq %r10,%rax + imull %ecx,%eax + andl $0x1fffffff,%eax + + movl $9,%r14d + jmp .LOOP_REDUCE_1024 + +.align 32 +.LOOP_REDUCE_1024: + vmovd %eax,%xmm13 + vpbroadcastq %xmm13,%ymm13 + + vpmuludq 32-128(%r13),%ymm12,%ymm10 + movq %rax,%rdx + imulq -128(%r13),%rax + vpaddq %ymm10,%ymm1,%ymm1 + addq %rax,%r10 + vpmuludq 64-128(%r13),%ymm12,%ymm14 + movq %rdx,%rax + imulq 8-128(%r13),%rax + vpaddq %ymm14,%ymm2,%ymm2 + vpmuludq 96-128(%r13),%ymm12,%ymm11 +.byte 0x67 + addq %rax,%r11 +.byte 
0x67 + movq %rdx,%rax + imulq 16-128(%r13),%rax + shrq $29,%r10 + vpaddq %ymm11,%ymm3,%ymm3 + vpmuludq 128-128(%r13),%ymm12,%ymm10 + addq %rax,%r12 + addq %r10,%r11 + vpaddq %ymm10,%ymm4,%ymm4 + vpmuludq 160-128(%r13),%ymm12,%ymm14 + movq %r11,%rax + imull %ecx,%eax + vpaddq %ymm14,%ymm5,%ymm5 + vpmuludq 192-128(%r13),%ymm12,%ymm11 + andl $0x1fffffff,%eax + vpaddq %ymm11,%ymm6,%ymm6 + vpmuludq 224-128(%r13),%ymm12,%ymm10 + vpaddq %ymm10,%ymm7,%ymm7 + vpmuludq 256-128(%r13),%ymm12,%ymm14 + vmovd %eax,%xmm12 + + vpaddq %ymm14,%ymm8,%ymm8 + + vpbroadcastq %xmm12,%ymm12 + + vpmuludq 32-8-128(%r13),%ymm13,%ymm11 + vmovdqu 96-8-128(%r13),%ymm14 + movq %rax,%rdx + imulq -128(%r13),%rax + vpaddq %ymm11,%ymm1,%ymm1 + vpmuludq 64-8-128(%r13),%ymm13,%ymm10 + vmovdqu 128-8-128(%r13),%ymm11 + addq %rax,%r11 + movq %rdx,%rax + imulq 8-128(%r13),%rax + vpaddq %ymm10,%ymm2,%ymm2 + addq %r12,%rax + shrq $29,%r11 + vpmuludq %ymm13,%ymm14,%ymm14 + vmovdqu 160-8-128(%r13),%ymm10 + addq %r11,%rax + vpaddq %ymm14,%ymm3,%ymm3 + vpmuludq %ymm13,%ymm11,%ymm11 + vmovdqu 192-8-128(%r13),%ymm14 +.byte 0x67 + movq %rax,%r12 + imull %ecx,%eax + vpaddq %ymm11,%ymm4,%ymm4 + vpmuludq %ymm13,%ymm10,%ymm10 +.byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 + andl $0x1fffffff,%eax + vpaddq %ymm10,%ymm5,%ymm5 + vpmuludq %ymm13,%ymm14,%ymm14 + vmovdqu 256-8-128(%r13),%ymm10 + vpaddq %ymm14,%ymm6,%ymm6 + vpmuludq %ymm13,%ymm11,%ymm11 + vmovdqu 288-8-128(%r13),%ymm9 + vmovd %eax,%xmm0 + imulq -128(%r13),%rax + vpaddq %ymm11,%ymm7,%ymm7 + vpmuludq %ymm13,%ymm10,%ymm10 + vmovdqu 32-16-128(%r13),%ymm14 + vpbroadcastq %xmm0,%ymm0 + vpaddq %ymm10,%ymm8,%ymm8 + vpmuludq %ymm13,%ymm9,%ymm9 + vmovdqu 64-16-128(%r13),%ymm11 + addq %rax,%r12 + + vmovdqu 32-24-128(%r13),%ymm13 + vpmuludq %ymm12,%ymm14,%ymm14 + vmovdqu 96-16-128(%r13),%ymm10 + vpaddq %ymm14,%ymm1,%ymm1 + vpmuludq %ymm0,%ymm13,%ymm13 + vpmuludq %ymm12,%ymm11,%ymm11 +.byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff + vpaddq %ymm1,%ymm13,%ymm13 + vpaddq %ymm11,%ymm2,%ymm2 + vpmuludq %ymm12,%ymm10,%ymm10 + vmovdqu 160-16-128(%r13),%ymm11 +.byte 0x67 + vmovq %xmm13,%rax + vmovdqu %ymm13,(%rsp) + vpaddq %ymm10,%ymm3,%ymm3 + vpmuludq %ymm12,%ymm14,%ymm14 + vmovdqu 192-16-128(%r13),%ymm10 + vpaddq %ymm14,%ymm4,%ymm4 + vpmuludq %ymm12,%ymm11,%ymm11 + vmovdqu 224-16-128(%r13),%ymm14 + vpaddq %ymm11,%ymm5,%ymm5 + vpmuludq %ymm12,%ymm10,%ymm10 + vmovdqu 256-16-128(%r13),%ymm11 + vpaddq %ymm10,%ymm6,%ymm6 + vpmuludq %ymm12,%ymm14,%ymm14 + shrq $29,%r12 + vmovdqu 288-16-128(%r13),%ymm10 + addq %r12,%rax + vpaddq %ymm14,%ymm7,%ymm7 + vpmuludq %ymm12,%ymm11,%ymm11 + + movq %rax,%r9 + imull %ecx,%eax + vpaddq %ymm11,%ymm8,%ymm8 + vpmuludq %ymm12,%ymm10,%ymm10 + andl $0x1fffffff,%eax + vmovd %eax,%xmm12 + vmovdqu 96-24-128(%r13),%ymm11 +.byte 0x67 + vpaddq %ymm10,%ymm9,%ymm9 + vpbroadcastq %xmm12,%ymm12 + + vpmuludq 64-24-128(%r13),%ymm0,%ymm14 + vmovdqu 128-24-128(%r13),%ymm10 + movq %rax,%rdx + imulq -128(%r13),%rax + movq 8(%rsp),%r10 + vpaddq %ymm14,%ymm2,%ymm1 + vpmuludq %ymm0,%ymm11,%ymm11 + vmovdqu 160-24-128(%r13),%ymm14 + addq %rax,%r9 + movq %rdx,%rax + imulq 8-128(%r13),%rax +.byte 0x67 + shrq $29,%r9 + movq 16(%rsp),%r11 + vpaddq %ymm11,%ymm3,%ymm2 + vpmuludq %ymm0,%ymm10,%ymm10 + vmovdqu 192-24-128(%r13),%ymm11 + addq %rax,%r10 + movq %rdx,%rax + imulq 16-128(%r13),%rax + vpaddq %ymm10,%ymm4,%ymm3 + vpmuludq %ymm0,%ymm14,%ymm14 + vmovdqu 224-24-128(%r13),%ymm10 + imulq 24-128(%r13),%rdx + addq %rax,%r11 + leaq (%r9,%r10,1),%rax + vpaddq %ymm14,%ymm5,%ymm4 + vpmuludq 
%ymm0,%ymm11,%ymm11 + vmovdqu 256-24-128(%r13),%ymm14 + movq %rax,%r10 + imull %ecx,%eax + vpmuludq %ymm0,%ymm10,%ymm10 + vpaddq %ymm11,%ymm6,%ymm5 + vmovdqu 288-24-128(%r13),%ymm11 + andl $0x1fffffff,%eax + vpaddq %ymm10,%ymm7,%ymm6 + vpmuludq %ymm0,%ymm14,%ymm14 + addq 24(%rsp),%rdx + vpaddq %ymm14,%ymm8,%ymm7 + vpmuludq %ymm0,%ymm11,%ymm11 + vpaddq %ymm11,%ymm9,%ymm8 + vmovq %r12,%xmm9 + movq %rdx,%r12 + + decl %r14d + jnz .LOOP_REDUCE_1024 + leaq 448(%rsp),%r12 + vpaddq %ymm9,%ymm13,%ymm0 + vpxor %ymm9,%ymm9,%ymm9 + + vpaddq 288-192(%rbx),%ymm0,%ymm0 + vpaddq 320-448(%r12),%ymm1,%ymm1 + vpaddq 352-448(%r12),%ymm2,%ymm2 + vpaddq 384-448(%r12),%ymm3,%ymm3 + vpaddq 416-448(%r12),%ymm4,%ymm4 + vpaddq 448-448(%r12),%ymm5,%ymm5 + vpaddq 480-448(%r12),%ymm6,%ymm6 + vpaddq 512-448(%r12),%ymm7,%ymm7 + vpaddq 544-448(%r12),%ymm8,%ymm8 + + vpsrlq $29,%ymm0,%ymm14 + vpand %ymm15,%ymm0,%ymm0 + vpsrlq $29,%ymm1,%ymm11 + vpand %ymm15,%ymm1,%ymm1 + vpsrlq $29,%ymm2,%ymm12 + vpermq $0x93,%ymm14,%ymm14 + vpand %ymm15,%ymm2,%ymm2 + vpsrlq $29,%ymm3,%ymm13 + vpermq $0x93,%ymm11,%ymm11 + vpand %ymm15,%ymm3,%ymm3 + vpermq $0x93,%ymm12,%ymm12 + + vpblendd $3,%ymm9,%ymm14,%ymm10 + vpermq $0x93,%ymm13,%ymm13 + vpblendd $3,%ymm14,%ymm11,%ymm14 + vpaddq %ymm10,%ymm0,%ymm0 + vpblendd $3,%ymm11,%ymm12,%ymm11 + vpaddq %ymm14,%ymm1,%ymm1 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm11,%ymm2,%ymm2 + vpblendd $3,%ymm13,%ymm9,%ymm13 + vpaddq %ymm12,%ymm3,%ymm3 + vpaddq %ymm13,%ymm4,%ymm4 + + vpsrlq $29,%ymm0,%ymm14 + vpand %ymm15,%ymm0,%ymm0 + vpsrlq $29,%ymm1,%ymm11 + vpand %ymm15,%ymm1,%ymm1 + vpsrlq $29,%ymm2,%ymm12 + vpermq $0x93,%ymm14,%ymm14 + vpand %ymm15,%ymm2,%ymm2 + vpsrlq $29,%ymm3,%ymm13 + vpermq $0x93,%ymm11,%ymm11 + vpand %ymm15,%ymm3,%ymm3 + vpermq $0x93,%ymm12,%ymm12 + + vpblendd $3,%ymm9,%ymm14,%ymm10 + vpermq $0x93,%ymm13,%ymm13 + vpblendd $3,%ymm14,%ymm11,%ymm14 + vpaddq %ymm10,%ymm0,%ymm0 + vpblendd $3,%ymm11,%ymm12,%ymm11 + vpaddq %ymm14,%ymm1,%ymm1 + vmovdqu %ymm0,0-128(%rdi) + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm11,%ymm2,%ymm2 + vmovdqu %ymm1,32-128(%rdi) + vpblendd $3,%ymm13,%ymm9,%ymm13 + vpaddq %ymm12,%ymm3,%ymm3 + vmovdqu %ymm2,64-128(%rdi) + vpaddq %ymm13,%ymm4,%ymm4 + vmovdqu %ymm3,96-128(%rdi) + vpsrlq $29,%ymm4,%ymm14 + vpand %ymm15,%ymm4,%ymm4 + vpsrlq $29,%ymm5,%ymm11 + vpand %ymm15,%ymm5,%ymm5 + vpsrlq $29,%ymm6,%ymm12 + vpermq $0x93,%ymm14,%ymm14 + vpand %ymm15,%ymm6,%ymm6 + vpsrlq $29,%ymm7,%ymm13 + vpermq $0x93,%ymm11,%ymm11 + vpand %ymm15,%ymm7,%ymm7 + vpsrlq $29,%ymm8,%ymm0 + vpermq $0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm8,%ymm8 + vpermq $0x93,%ymm13,%ymm13 + + vpblendd $3,%ymm9,%ymm14,%ymm10 + vpermq $0x93,%ymm0,%ymm0 + vpblendd $3,%ymm14,%ymm11,%ymm14 + vpaddq %ymm10,%ymm4,%ymm4 + vpblendd $3,%ymm11,%ymm12,%ymm11 + vpaddq %ymm14,%ymm5,%ymm5 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm11,%ymm6,%ymm6 + vpblendd $3,%ymm13,%ymm0,%ymm13 + vpaddq %ymm12,%ymm7,%ymm7 + vpaddq %ymm13,%ymm8,%ymm8 + + vpsrlq $29,%ymm4,%ymm14 + vpand %ymm15,%ymm4,%ymm4 + vpsrlq $29,%ymm5,%ymm11 + vpand %ymm15,%ymm5,%ymm5 + vpsrlq $29,%ymm6,%ymm12 + vpermq $0x93,%ymm14,%ymm14 + vpand %ymm15,%ymm6,%ymm6 + vpsrlq $29,%ymm7,%ymm13 + vpermq $0x93,%ymm11,%ymm11 + vpand %ymm15,%ymm7,%ymm7 + vpsrlq $29,%ymm8,%ymm0 + vpermq $0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm8,%ymm8 + vpermq $0x93,%ymm13,%ymm13 + + vpblendd $3,%ymm9,%ymm14,%ymm10 + vpermq $0x93,%ymm0,%ymm0 + vpblendd $3,%ymm14,%ymm11,%ymm14 + vpaddq %ymm10,%ymm4,%ymm4 + vpblendd $3,%ymm11,%ymm12,%ymm11 + vpaddq %ymm14,%ymm5,%ymm5 + vmovdqu 
%ymm4,128-128(%rdi) + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm11,%ymm6,%ymm6 + vmovdqu %ymm5,160-128(%rdi) + vpblendd $3,%ymm13,%ymm0,%ymm13 + vpaddq %ymm12,%ymm7,%ymm7 + vmovdqu %ymm6,192-128(%rdi) + vpaddq %ymm13,%ymm8,%ymm8 + vmovdqu %ymm7,224-128(%rdi) + vmovdqu %ymm8,256-128(%rdi) + + movq %rdi,%rsi + decl %r8d + jne .LOOP_GRANDE_SQR_1024 + + vzeroall + movq %rbp,%rax +.cfi_def_cfa_register %rax + movq -48(%rax),%r15 +.cfi_restore %r15 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Lsqr_1024_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 +.globl rsaz_1024_mul_avx2 +.hidden rsaz_1024_mul_avx2 +.type rsaz_1024_mul_avx2,@function +.align 64 +rsaz_1024_mul_avx2: +.cfi_startproc + leaq (%rsp),%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + movq %rax,%rbp +.cfi_def_cfa_register %rbp + vzeroall + movq %rdx,%r13 + subq $64,%rsp + + + + + + +.byte 0x67,0x67 + movq %rsi,%r15 + andq $4095,%r15 + addq $320,%r15 + shrq $12,%r15 + movq %rsi,%r15 + cmovnzq %r13,%rsi + cmovnzq %r15,%r13 + + movq %rcx,%r15 + subq $-128,%rsi + subq $-128,%rcx + subq $-128,%rdi + + andq $4095,%r15 + addq $320,%r15 +.byte 0x67,0x67 + shrq $12,%r15 + jz .Lmul_1024_no_n_copy + + + + + + subq $320,%rsp + vmovdqu 0-128(%rcx),%ymm0 + andq $-512,%rsp + vmovdqu 32-128(%rcx),%ymm1 + vmovdqu 64-128(%rcx),%ymm2 + vmovdqu 96-128(%rcx),%ymm3 + vmovdqu 128-128(%rcx),%ymm4 + vmovdqu 160-128(%rcx),%ymm5 + vmovdqu 192-128(%rcx),%ymm6 + vmovdqu 224-128(%rcx),%ymm7 + vmovdqu 256-128(%rcx),%ymm8 + leaq 64+128(%rsp),%rcx + vmovdqu %ymm0,0-128(%rcx) + vpxor %ymm0,%ymm0,%ymm0 + vmovdqu %ymm1,32-128(%rcx) + vpxor %ymm1,%ymm1,%ymm1 + vmovdqu %ymm2,64-128(%rcx) + vpxor %ymm2,%ymm2,%ymm2 + vmovdqu %ymm3,96-128(%rcx) + vpxor %ymm3,%ymm3,%ymm3 + vmovdqu %ymm4,128-128(%rcx) + vpxor %ymm4,%ymm4,%ymm4 + vmovdqu %ymm5,160-128(%rcx) + vpxor %ymm5,%ymm5,%ymm5 + vmovdqu %ymm6,192-128(%rcx) + vpxor %ymm6,%ymm6,%ymm6 + vmovdqu %ymm7,224-128(%rcx) + vpxor %ymm7,%ymm7,%ymm7 + vmovdqu %ymm8,256-128(%rcx) + vmovdqa %ymm0,%ymm8 + vmovdqu %ymm9,288-128(%rcx) +.Lmul_1024_no_n_copy: + andq $-64,%rsp + + movq (%r13),%rbx + vpbroadcastq (%r13),%ymm10 + vmovdqu %ymm0,(%rsp) + xorq %r9,%r9 +.byte 0x67 + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r12,%r12 + + vmovdqu .Land_mask(%rip),%ymm15 + movl $9,%r14d + vmovdqu %ymm9,288-128(%rdi) + jmp .Loop_mul_1024 + +.align 32 +.Loop_mul_1024: + vpsrlq $29,%ymm3,%ymm9 + movq %rbx,%rax + imulq -128(%rsi),%rax + addq %r9,%rax + movq %rbx,%r10 + imulq 8-128(%rsi),%r10 + addq 8(%rsp),%r10 + + movq %rax,%r9 + imull %r8d,%eax + andl $0x1fffffff,%eax + + movq %rbx,%r11 + imulq 16-128(%rsi),%r11 + addq 16(%rsp),%r11 + + movq %rbx,%r12 + imulq 24-128(%rsi),%r12 + addq 24(%rsp),%r12 + vpmuludq 32-128(%rsi),%ymm10,%ymm0 + vmovd %eax,%xmm11 + vpaddq %ymm0,%ymm1,%ymm1 + vpmuludq 64-128(%rsi),%ymm10,%ymm12 + vpbroadcastq %xmm11,%ymm11 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq 96-128(%rsi),%ymm10,%ymm13 + vpand %ymm15,%ymm3,%ymm3 + vpaddq %ymm13,%ymm3,%ymm3 + vpmuludq 128-128(%rsi),%ymm10,%ymm0 + vpaddq %ymm0,%ymm4,%ymm4 + vpmuludq 160-128(%rsi),%ymm10,%ymm12 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq 
192-128(%rsi),%ymm10,%ymm13 + vpaddq %ymm13,%ymm6,%ymm6 + vpmuludq 224-128(%rsi),%ymm10,%ymm0 + vpermq $0x93,%ymm9,%ymm9 + vpaddq %ymm0,%ymm7,%ymm7 + vpmuludq 256-128(%rsi),%ymm10,%ymm12 + vpbroadcastq 8(%r13),%ymm10 + vpaddq %ymm12,%ymm8,%ymm8 + + movq %rax,%rdx + imulq -128(%rcx),%rax + addq %rax,%r9 + movq %rdx,%rax + imulq 8-128(%rcx),%rax + addq %rax,%r10 + movq %rdx,%rax + imulq 16-128(%rcx),%rax + addq %rax,%r11 + shrq $29,%r9 + imulq 24-128(%rcx),%rdx + addq %rdx,%r12 + addq %r9,%r10 + + vpmuludq 32-128(%rcx),%ymm11,%ymm13 + vmovq %xmm10,%rbx + vpaddq %ymm13,%ymm1,%ymm1 + vpmuludq 64-128(%rcx),%ymm11,%ymm0 + vpaddq %ymm0,%ymm2,%ymm2 + vpmuludq 96-128(%rcx),%ymm11,%ymm12 + vpaddq %ymm12,%ymm3,%ymm3 + vpmuludq 128-128(%rcx),%ymm11,%ymm13 + vpaddq %ymm13,%ymm4,%ymm4 + vpmuludq 160-128(%rcx),%ymm11,%ymm0 + vpaddq %ymm0,%ymm5,%ymm5 + vpmuludq 192-128(%rcx),%ymm11,%ymm12 + vpaddq %ymm12,%ymm6,%ymm6 + vpmuludq 224-128(%rcx),%ymm11,%ymm13 + vpblendd $3,%ymm14,%ymm9,%ymm9 + vpaddq %ymm13,%ymm7,%ymm7 + vpmuludq 256-128(%rcx),%ymm11,%ymm0 + vpaddq %ymm9,%ymm3,%ymm3 + vpaddq %ymm0,%ymm8,%ymm8 + + movq %rbx,%rax + imulq -128(%rsi),%rax + addq %rax,%r10 + vmovdqu -8+32-128(%rsi),%ymm12 + movq %rbx,%rax + imulq 8-128(%rsi),%rax + addq %rax,%r11 + vmovdqu -8+64-128(%rsi),%ymm13 + + movq %r10,%rax + imull %r8d,%eax + andl $0x1fffffff,%eax + + imulq 16-128(%rsi),%rbx + addq %rbx,%r12 + vpmuludq %ymm10,%ymm12,%ymm12 + vmovd %eax,%xmm11 + vmovdqu -8+96-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm1,%ymm1 + vpmuludq %ymm10,%ymm13,%ymm13 + vpbroadcastq %xmm11,%ymm11 + vmovdqu -8+128-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm2,%ymm2 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -8+160-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm3,%ymm3 + vpmuludq %ymm10,%ymm12,%ymm12 + vmovdqu -8+192-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm4,%ymm4 + vpmuludq %ymm10,%ymm13,%ymm13 + vmovdqu -8+224-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm5,%ymm5 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -8+256-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm6,%ymm6 + vpmuludq %ymm10,%ymm12,%ymm12 + vmovdqu -8+288-128(%rsi),%ymm9 + vpaddq %ymm12,%ymm7,%ymm7 + vpmuludq %ymm10,%ymm13,%ymm13 + vpaddq %ymm13,%ymm8,%ymm8 + vpmuludq %ymm10,%ymm9,%ymm9 + vpbroadcastq 16(%r13),%ymm10 + + movq %rax,%rdx + imulq -128(%rcx),%rax + addq %rax,%r10 + vmovdqu -8+32-128(%rcx),%ymm0 + movq %rdx,%rax + imulq 8-128(%rcx),%rax + addq %rax,%r11 + vmovdqu -8+64-128(%rcx),%ymm12 + shrq $29,%r10 + imulq 16-128(%rcx),%rdx + addq %rdx,%r12 + addq %r10,%r11 + + vpmuludq %ymm11,%ymm0,%ymm0 + vmovq %xmm10,%rbx + vmovdqu -8+96-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm1,%ymm1 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -8+128-128(%rcx),%ymm0 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -8+160-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm3,%ymm3 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -8+192-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm4,%ymm4 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -8+224-128(%rcx),%ymm0 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -8+256-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm6,%ymm6 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -8+288-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm7,%ymm7 + vpmuludq %ymm11,%ymm12,%ymm12 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq %ymm11,%ymm13,%ymm13 + vpaddq %ymm13,%ymm9,%ymm9 + + vmovdqu -16+32-128(%rsi),%ymm0 + movq %rbx,%rax + imulq -128(%rsi),%rax + addq %r11,%rax + + vmovdqu -16+64-128(%rsi),%ymm12 + movq %rax,%r11 + imull %r8d,%eax + andl $0x1fffffff,%eax + + imulq 8-128(%rsi),%rbx + addq %rbx,%r12 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovd %eax,%xmm11 + vmovdqu 
-16+96-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm1,%ymm1 + vpmuludq %ymm10,%ymm12,%ymm12 + vpbroadcastq %xmm11,%ymm11 + vmovdqu -16+128-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq %ymm10,%ymm13,%ymm13 + vmovdqu -16+160-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm3,%ymm3 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -16+192-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm4,%ymm4 + vpmuludq %ymm10,%ymm12,%ymm12 + vmovdqu -16+224-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq %ymm10,%ymm13,%ymm13 + vmovdqu -16+256-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm6,%ymm6 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -16+288-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm7,%ymm7 + vpmuludq %ymm10,%ymm12,%ymm12 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq %ymm10,%ymm13,%ymm13 + vpbroadcastq 24(%r13),%ymm10 + vpaddq %ymm13,%ymm9,%ymm9 + + vmovdqu -16+32-128(%rcx),%ymm0 + movq %rax,%rdx + imulq -128(%rcx),%rax + addq %rax,%r11 + vmovdqu -16+64-128(%rcx),%ymm12 + imulq 8-128(%rcx),%rdx + addq %rdx,%r12 + shrq $29,%r11 + + vpmuludq %ymm11,%ymm0,%ymm0 + vmovq %xmm10,%rbx + vmovdqu -16+96-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm1,%ymm1 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -16+128-128(%rcx),%ymm0 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -16+160-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm3,%ymm3 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -16+192-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm4,%ymm4 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -16+224-128(%rcx),%ymm0 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -16+256-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm6,%ymm6 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -16+288-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm7,%ymm7 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -24+32-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -24+64-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm9,%ymm9 + + addq %r11,%r12 + imulq -128(%rsi),%rbx + addq %rbx,%r12 + + movq %r12,%rax + imull %r8d,%eax + andl $0x1fffffff,%eax + + vpmuludq %ymm10,%ymm0,%ymm0 + vmovd %eax,%xmm11 + vmovdqu -24+96-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm1,%ymm1 + vpmuludq %ymm10,%ymm12,%ymm12 + vpbroadcastq %xmm11,%ymm11 + vmovdqu -24+128-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq %ymm10,%ymm13,%ymm13 + vmovdqu -24+160-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm3,%ymm3 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -24+192-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm4,%ymm4 + vpmuludq %ymm10,%ymm12,%ymm12 + vmovdqu -24+224-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq %ymm10,%ymm13,%ymm13 + vmovdqu -24+256-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm6,%ymm6 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -24+288-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm7,%ymm7 + vpmuludq %ymm10,%ymm12,%ymm12 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq %ymm10,%ymm13,%ymm13 + vpbroadcastq 32(%r13),%ymm10 + vpaddq %ymm13,%ymm9,%ymm9 + addq $32,%r13 + + vmovdqu -24+32-128(%rcx),%ymm0 + imulq -128(%rcx),%rax + addq %rax,%r12 + shrq $29,%r12 + + vmovdqu -24+64-128(%rcx),%ymm12 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovq %xmm10,%rbx + vmovdqu -24+96-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm1,%ymm0 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu %ymm0,(%rsp) + vpaddq %ymm12,%ymm2,%ymm1 + vmovdqu -24+128-128(%rcx),%ymm0 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -24+160-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm3,%ymm2 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -24+192-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm4,%ymm3 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -24+224-128(%rcx),%ymm0 + vpaddq %ymm12,%ymm5,%ymm4 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -24+256-128(%rcx),%ymm12 + vpaddq 
%ymm13,%ymm6,%ymm5 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -24+288-128(%rcx),%ymm13 + movq %r12,%r9 + vpaddq %ymm0,%ymm7,%ymm6 + vpmuludq %ymm11,%ymm12,%ymm12 + addq (%rsp),%r9 + vpaddq %ymm12,%ymm8,%ymm7 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovq %r12,%xmm12 + vpaddq %ymm13,%ymm9,%ymm8 + + decl %r14d + jnz .Loop_mul_1024 + vpermq $0,%ymm15,%ymm15 + vpaddq (%rsp),%ymm12,%ymm0 + + vpsrlq $29,%ymm0,%ymm12 + vpand %ymm15,%ymm0,%ymm0 + vpsrlq $29,%ymm1,%ymm13 + vpand %ymm15,%ymm1,%ymm1 + vpsrlq $29,%ymm2,%ymm10 + vpermq $0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm2,%ymm2 + vpsrlq $29,%ymm3,%ymm11 + vpermq $0x93,%ymm13,%ymm13 + vpand %ymm15,%ymm3,%ymm3 + + vpblendd $3,%ymm14,%ymm12,%ymm9 + vpermq $0x93,%ymm10,%ymm10 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpermq $0x93,%ymm11,%ymm11 + vpaddq %ymm9,%ymm0,%ymm0 + vpblendd $3,%ymm13,%ymm10,%ymm13 + vpaddq %ymm12,%ymm1,%ymm1 + vpblendd $3,%ymm10,%ymm11,%ymm10 + vpaddq %ymm13,%ymm2,%ymm2 + vpblendd $3,%ymm11,%ymm14,%ymm11 + vpaddq %ymm10,%ymm3,%ymm3 + vpaddq %ymm11,%ymm4,%ymm4 + + vpsrlq $29,%ymm0,%ymm12 + vpand %ymm15,%ymm0,%ymm0 + vpsrlq $29,%ymm1,%ymm13 + vpand %ymm15,%ymm1,%ymm1 + vpsrlq $29,%ymm2,%ymm10 + vpermq $0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm2,%ymm2 + vpsrlq $29,%ymm3,%ymm11 + vpermq $0x93,%ymm13,%ymm13 + vpand %ymm15,%ymm3,%ymm3 + vpermq $0x93,%ymm10,%ymm10 + + vpblendd $3,%ymm14,%ymm12,%ymm9 + vpermq $0x93,%ymm11,%ymm11 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm9,%ymm0,%ymm0 + vpblendd $3,%ymm13,%ymm10,%ymm13 + vpaddq %ymm12,%ymm1,%ymm1 + vpblendd $3,%ymm10,%ymm11,%ymm10 + vpaddq %ymm13,%ymm2,%ymm2 + vpblendd $3,%ymm11,%ymm14,%ymm11 + vpaddq %ymm10,%ymm3,%ymm3 + vpaddq %ymm11,%ymm4,%ymm4 + + vmovdqu %ymm0,0-128(%rdi) + vmovdqu %ymm1,32-128(%rdi) + vmovdqu %ymm2,64-128(%rdi) + vmovdqu %ymm3,96-128(%rdi) + vpsrlq $29,%ymm4,%ymm12 + vpand %ymm15,%ymm4,%ymm4 + vpsrlq $29,%ymm5,%ymm13 + vpand %ymm15,%ymm5,%ymm5 + vpsrlq $29,%ymm6,%ymm10 + vpermq $0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm6,%ymm6 + vpsrlq $29,%ymm7,%ymm11 + vpermq $0x93,%ymm13,%ymm13 + vpand %ymm15,%ymm7,%ymm7 + vpsrlq $29,%ymm8,%ymm0 + vpermq $0x93,%ymm10,%ymm10 + vpand %ymm15,%ymm8,%ymm8 + vpermq $0x93,%ymm11,%ymm11 + + vpblendd $3,%ymm14,%ymm12,%ymm9 + vpermq $0x93,%ymm0,%ymm0 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm9,%ymm4,%ymm4 + vpblendd $3,%ymm13,%ymm10,%ymm13 + vpaddq %ymm12,%ymm5,%ymm5 + vpblendd $3,%ymm10,%ymm11,%ymm10 + vpaddq %ymm13,%ymm6,%ymm6 + vpblendd $3,%ymm11,%ymm0,%ymm11 + vpaddq %ymm10,%ymm7,%ymm7 + vpaddq %ymm11,%ymm8,%ymm8 + + vpsrlq $29,%ymm4,%ymm12 + vpand %ymm15,%ymm4,%ymm4 + vpsrlq $29,%ymm5,%ymm13 + vpand %ymm15,%ymm5,%ymm5 + vpsrlq $29,%ymm6,%ymm10 + vpermq $0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm6,%ymm6 + vpsrlq $29,%ymm7,%ymm11 + vpermq $0x93,%ymm13,%ymm13 + vpand %ymm15,%ymm7,%ymm7 + vpsrlq $29,%ymm8,%ymm0 + vpermq $0x93,%ymm10,%ymm10 + vpand %ymm15,%ymm8,%ymm8 + vpermq $0x93,%ymm11,%ymm11 + + vpblendd $3,%ymm14,%ymm12,%ymm9 + vpermq $0x93,%ymm0,%ymm0 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm9,%ymm4,%ymm4 + vpblendd $3,%ymm13,%ymm10,%ymm13 + vpaddq %ymm12,%ymm5,%ymm5 + vpblendd $3,%ymm10,%ymm11,%ymm10 + vpaddq %ymm13,%ymm6,%ymm6 + vpblendd $3,%ymm11,%ymm0,%ymm11 + vpaddq %ymm10,%ymm7,%ymm7 + vpaddq %ymm11,%ymm8,%ymm8 + + vmovdqu %ymm4,128-128(%rdi) + vmovdqu %ymm5,160-128(%rdi) + vmovdqu %ymm6,192-128(%rdi) + vmovdqu %ymm7,224-128(%rdi) + vmovdqu %ymm8,256-128(%rdi) + vzeroupper + + movq %rbp,%rax +.cfi_def_cfa_register %rax + movq -48(%rax),%r15 +.cfi_restore %r15 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 
+.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Lmul_1024_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2 +.globl rsaz_1024_red2norm_avx2 +.hidden rsaz_1024_red2norm_avx2 +.type rsaz_1024_red2norm_avx2,@function +.align 32 +rsaz_1024_red2norm_avx2: + subq $-128,%rsi + xorq %rax,%rax + movq -128(%rsi),%r8 + movq -120(%rsi),%r9 + movq -112(%rsi),%r10 + shlq $0,%r8 + shlq $29,%r9 + movq %r10,%r11 + shlq $58,%r10 + shrq $6,%r11 + addq %r8,%rax + addq %r9,%rax + addq %r10,%rax + adcq $0,%r11 + movq %rax,0(%rdi) + movq %r11,%rax + movq -104(%rsi),%r8 + movq -96(%rsi),%r9 + shlq $23,%r8 + movq %r9,%r10 + shlq $52,%r9 + shrq $12,%r10 + addq %r8,%rax + addq %r9,%rax + adcq $0,%r10 + movq %rax,8(%rdi) + movq %r10,%rax + movq -88(%rsi),%r11 + movq -80(%rsi),%r8 + shlq $17,%r11 + movq %r8,%r9 + shlq $46,%r8 + shrq $18,%r9 + addq %r11,%rax + addq %r8,%rax + adcq $0,%r9 + movq %rax,16(%rdi) + movq %r9,%rax + movq -72(%rsi),%r10 + movq -64(%rsi),%r11 + shlq $11,%r10 + movq %r11,%r8 + shlq $40,%r11 + shrq $24,%r8 + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,24(%rdi) + movq %r8,%rax + movq -56(%rsi),%r9 + movq -48(%rsi),%r10 + movq -40(%rsi),%r11 + shlq $5,%r9 + shlq $34,%r10 + movq %r11,%r8 + shlq $63,%r11 + shrq $1,%r8 + addq %r9,%rax + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,32(%rdi) + movq %r8,%rax + movq -32(%rsi),%r9 + movq -24(%rsi),%r10 + shlq $28,%r9 + movq %r10,%r11 + shlq $57,%r10 + shrq $7,%r11 + addq %r9,%rax + addq %r10,%rax + adcq $0,%r11 + movq %rax,40(%rdi) + movq %r11,%rax + movq -16(%rsi),%r8 + movq -8(%rsi),%r9 + shlq $22,%r8 + movq %r9,%r10 + shlq $51,%r9 + shrq $13,%r10 + addq %r8,%rax + addq %r9,%rax + adcq $0,%r10 + movq %rax,48(%rdi) + movq %r10,%rax + movq 0(%rsi),%r11 + movq 8(%rsi),%r8 + shlq $16,%r11 + movq %r8,%r9 + shlq $45,%r8 + shrq $19,%r9 + addq %r11,%rax + addq %r8,%rax + adcq $0,%r9 + movq %rax,56(%rdi) + movq %r9,%rax + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + shlq $10,%r10 + movq %r11,%r8 + shlq $39,%r11 + shrq $25,%r8 + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,64(%rdi) + movq %r8,%rax + movq 32(%rsi),%r9 + movq 40(%rsi),%r10 + movq 48(%rsi),%r11 + shlq $4,%r9 + shlq $33,%r10 + movq %r11,%r8 + shlq $62,%r11 + shrq $2,%r8 + addq %r9,%rax + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,72(%rdi) + movq %r8,%rax + movq 56(%rsi),%r9 + movq 64(%rsi),%r10 + shlq $27,%r9 + movq %r10,%r11 + shlq $56,%r10 + shrq $8,%r11 + addq %r9,%rax + addq %r10,%rax + adcq $0,%r11 + movq %rax,80(%rdi) + movq %r11,%rax + movq 72(%rsi),%r8 + movq 80(%rsi),%r9 + shlq $21,%r8 + movq %r9,%r10 + shlq $50,%r9 + shrq $14,%r10 + addq %r8,%rax + addq %r9,%rax + adcq $0,%r10 + movq %rax,88(%rdi) + movq %r10,%rax + movq 88(%rsi),%r11 + movq 96(%rsi),%r8 + shlq $15,%r11 + movq %r8,%r9 + shlq $44,%r8 + shrq $20,%r9 + addq %r11,%rax + addq %r8,%rax + adcq $0,%r9 + movq %rax,96(%rdi) + movq %r9,%rax + movq 104(%rsi),%r10 + movq 112(%rsi),%r11 + shlq $9,%r10 + movq %r11,%r8 + shlq $38,%r11 + shrq $26,%r8 + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,104(%rdi) + movq %r8,%rax + movq 120(%rsi),%r9 + movq 128(%rsi),%r10 + movq 136(%rsi),%r11 + shlq $3,%r9 + shlq $32,%r10 + movq %r11,%r8 + shlq $61,%r11 + shrq $3,%r8 + addq %r9,%rax + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,112(%rdi) + movq %r8,%rax + movq 144(%rsi),%r9 + movq 
152(%rsi),%r10 + shlq $26,%r9 + movq %r10,%r11 + shlq $55,%r10 + shrq $9,%r11 + addq %r9,%rax + addq %r10,%rax + adcq $0,%r11 + movq %rax,120(%rdi) + movq %r11,%rax + .byte 0xf3,0xc3 +.size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2 + +.globl rsaz_1024_norm2red_avx2 +.hidden rsaz_1024_norm2red_avx2 +.type rsaz_1024_norm2red_avx2,@function +.align 32 +rsaz_1024_norm2red_avx2: + subq $-128,%rdi + movq (%rsi),%r8 + movl $0x1fffffff,%eax + movq 8(%rsi),%r9 + movq %r8,%r11 + shrq $0,%r11 + andq %rax,%r11 + movq %r11,-128(%rdi) + movq %r8,%r10 + shrq $29,%r10 + andq %rax,%r10 + movq %r10,-120(%rdi) + shrdq $58,%r9,%r8 + andq %rax,%r8 + movq %r8,-112(%rdi) + movq 16(%rsi),%r10 + movq %r9,%r8 + shrq $23,%r8 + andq %rax,%r8 + movq %r8,-104(%rdi) + shrdq $52,%r10,%r9 + andq %rax,%r9 + movq %r9,-96(%rdi) + movq 24(%rsi),%r11 + movq %r10,%r9 + shrq $17,%r9 + andq %rax,%r9 + movq %r9,-88(%rdi) + shrdq $46,%r11,%r10 + andq %rax,%r10 + movq %r10,-80(%rdi) + movq 32(%rsi),%r8 + movq %r11,%r10 + shrq $11,%r10 + andq %rax,%r10 + movq %r10,-72(%rdi) + shrdq $40,%r8,%r11 + andq %rax,%r11 + movq %r11,-64(%rdi) + movq 40(%rsi),%r9 + movq %r8,%r11 + shrq $5,%r11 + andq %rax,%r11 + movq %r11,-56(%rdi) + movq %r8,%r10 + shrq $34,%r10 + andq %rax,%r10 + movq %r10,-48(%rdi) + shrdq $63,%r9,%r8 + andq %rax,%r8 + movq %r8,-40(%rdi) + movq 48(%rsi),%r10 + movq %r9,%r8 + shrq $28,%r8 + andq %rax,%r8 + movq %r8,-32(%rdi) + shrdq $57,%r10,%r9 + andq %rax,%r9 + movq %r9,-24(%rdi) + movq 56(%rsi),%r11 + movq %r10,%r9 + shrq $22,%r9 + andq %rax,%r9 + movq %r9,-16(%rdi) + shrdq $51,%r11,%r10 + andq %rax,%r10 + movq %r10,-8(%rdi) + movq 64(%rsi),%r8 + movq %r11,%r10 + shrq $16,%r10 + andq %rax,%r10 + movq %r10,0(%rdi) + shrdq $45,%r8,%r11 + andq %rax,%r11 + movq %r11,8(%rdi) + movq 72(%rsi),%r9 + movq %r8,%r11 + shrq $10,%r11 + andq %rax,%r11 + movq %r11,16(%rdi) + shrdq $39,%r9,%r8 + andq %rax,%r8 + movq %r8,24(%rdi) + movq 80(%rsi),%r10 + movq %r9,%r8 + shrq $4,%r8 + andq %rax,%r8 + movq %r8,32(%rdi) + movq %r9,%r11 + shrq $33,%r11 + andq %rax,%r11 + movq %r11,40(%rdi) + shrdq $62,%r10,%r9 + andq %rax,%r9 + movq %r9,48(%rdi) + movq 88(%rsi),%r11 + movq %r10,%r9 + shrq $27,%r9 + andq %rax,%r9 + movq %r9,56(%rdi) + shrdq $56,%r11,%r10 + andq %rax,%r10 + movq %r10,64(%rdi) + movq 96(%rsi),%r8 + movq %r11,%r10 + shrq $21,%r10 + andq %rax,%r10 + movq %r10,72(%rdi) + shrdq $50,%r8,%r11 + andq %rax,%r11 + movq %r11,80(%rdi) + movq 104(%rsi),%r9 + movq %r8,%r11 + shrq $15,%r11 + andq %rax,%r11 + movq %r11,88(%rdi) + shrdq $44,%r9,%r8 + andq %rax,%r8 + movq %r8,96(%rdi) + movq 112(%rsi),%r10 + movq %r9,%r8 + shrq $9,%r8 + andq %rax,%r8 + movq %r8,104(%rdi) + shrdq $38,%r10,%r9 + andq %rax,%r9 + movq %r9,112(%rdi) + movq 120(%rsi),%r11 + movq %r10,%r9 + shrq $3,%r9 + andq %rax,%r9 + movq %r9,120(%rdi) + movq %r10,%r8 + shrq $32,%r8 + andq %rax,%r8 + movq %r8,128(%rdi) + shrdq $61,%r11,%r10 + andq %rax,%r10 + movq %r10,136(%rdi) + xorq %r8,%r8 + movq %r11,%r10 + shrq $26,%r10 + andq %rax,%r10 + movq %r10,144(%rdi) + shrdq $55,%r8,%r11 + andq %rax,%r11 + movq %r11,152(%rdi) + movq %r8,160(%rdi) + movq %r8,168(%rdi) + movq %r8,176(%rdi) + movq %r8,184(%rdi) + .byte 0xf3,0xc3 +.size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2 +.globl rsaz_1024_scatter5_avx2 +.hidden rsaz_1024_scatter5_avx2 +.type rsaz_1024_scatter5_avx2,@function +.align 32 +rsaz_1024_scatter5_avx2: + vzeroupper + vmovdqu .Lscatter_permd(%rip),%ymm5 + shll $4,%edx + leaq (%rdi,%rdx,1),%rdi + movl $9,%eax + jmp .Loop_scatter_1024 + +.align 32 
+.Loop_scatter_1024: + vmovdqu (%rsi),%ymm0 + leaq 32(%rsi),%rsi + vpermd %ymm0,%ymm5,%ymm0 + vmovdqu %xmm0,(%rdi) + leaq 512(%rdi),%rdi + decl %eax + jnz .Loop_scatter_1024 + + vzeroupper + .byte 0xf3,0xc3 +.size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2 + +.globl rsaz_1024_gather5_avx2 +.hidden rsaz_1024_gather5_avx2 +.type rsaz_1024_gather5_avx2,@function +.align 32 +rsaz_1024_gather5_avx2: +.cfi_startproc + vzeroupper + movq %rsp,%r11 +.cfi_def_cfa_register %r11 + leaq -256(%rsp),%rsp + andq $-32,%rsp + leaq .Linc(%rip),%r10 + leaq -128(%rsp),%rax + + vmovd %edx,%xmm4 + vmovdqa (%r10),%ymm0 + vmovdqa 32(%r10),%ymm1 + vmovdqa 64(%r10),%ymm5 + vpbroadcastd %xmm4,%ymm4 + + vpaddd %ymm5,%ymm0,%ymm2 + vpcmpeqd %ymm4,%ymm0,%ymm0 + vpaddd %ymm5,%ymm1,%ymm3 + vpcmpeqd %ymm4,%ymm1,%ymm1 + vmovdqa %ymm0,0+128(%rax) + vpaddd %ymm5,%ymm2,%ymm0 + vpcmpeqd %ymm4,%ymm2,%ymm2 + vmovdqa %ymm1,32+128(%rax) + vpaddd %ymm5,%ymm3,%ymm1 + vpcmpeqd %ymm4,%ymm3,%ymm3 + vmovdqa %ymm2,64+128(%rax) + vpaddd %ymm5,%ymm0,%ymm2 + vpcmpeqd %ymm4,%ymm0,%ymm0 + vmovdqa %ymm3,96+128(%rax) + vpaddd %ymm5,%ymm1,%ymm3 + vpcmpeqd %ymm4,%ymm1,%ymm1 + vmovdqa %ymm0,128+128(%rax) + vpaddd %ymm5,%ymm2,%ymm8 + vpcmpeqd %ymm4,%ymm2,%ymm2 + vmovdqa %ymm1,160+128(%rax) + vpaddd %ymm5,%ymm3,%ymm9 + vpcmpeqd %ymm4,%ymm3,%ymm3 + vmovdqa %ymm2,192+128(%rax) + vpaddd %ymm5,%ymm8,%ymm10 + vpcmpeqd %ymm4,%ymm8,%ymm8 + vmovdqa %ymm3,224+128(%rax) + vpaddd %ymm5,%ymm9,%ymm11 + vpcmpeqd %ymm4,%ymm9,%ymm9 + vpaddd %ymm5,%ymm10,%ymm12 + vpcmpeqd %ymm4,%ymm10,%ymm10 + vpaddd %ymm5,%ymm11,%ymm13 + vpcmpeqd %ymm4,%ymm11,%ymm11 + vpaddd %ymm5,%ymm12,%ymm14 + vpcmpeqd %ymm4,%ymm12,%ymm12 + vpaddd %ymm5,%ymm13,%ymm15 + vpcmpeqd %ymm4,%ymm13,%ymm13 + vpcmpeqd %ymm4,%ymm14,%ymm14 + vpcmpeqd %ymm4,%ymm15,%ymm15 + + vmovdqa -32(%r10),%ymm7 + leaq 128(%rsi),%rsi + movl $9,%edx + +.Loop_gather_1024: + vmovdqa 0-128(%rsi),%ymm0 + vmovdqa 32-128(%rsi),%ymm1 + vmovdqa 64-128(%rsi),%ymm2 + vmovdqa 96-128(%rsi),%ymm3 + vpand 0+128(%rax),%ymm0,%ymm0 + vpand 32+128(%rax),%ymm1,%ymm1 + vpand 64+128(%rax),%ymm2,%ymm2 + vpor %ymm0,%ymm1,%ymm4 + vpand 96+128(%rax),%ymm3,%ymm3 + vmovdqa 128-128(%rsi),%ymm0 + vmovdqa 160-128(%rsi),%ymm1 + vpor %ymm2,%ymm3,%ymm5 + vmovdqa 192-128(%rsi),%ymm2 + vmovdqa 224-128(%rsi),%ymm3 + vpand 128+128(%rax),%ymm0,%ymm0 + vpand 160+128(%rax),%ymm1,%ymm1 + vpand 192+128(%rax),%ymm2,%ymm2 + vpor %ymm0,%ymm4,%ymm4 + vpand 224+128(%rax),%ymm3,%ymm3 + vpand 256-128(%rsi),%ymm8,%ymm0 + vpor %ymm1,%ymm5,%ymm5 + vpand 288-128(%rsi),%ymm9,%ymm1 + vpor %ymm2,%ymm4,%ymm4 + vpand 320-128(%rsi),%ymm10,%ymm2 + vpor %ymm3,%ymm5,%ymm5 + vpand 352-128(%rsi),%ymm11,%ymm3 + vpor %ymm0,%ymm4,%ymm4 + vpand 384-128(%rsi),%ymm12,%ymm0 + vpor %ymm1,%ymm5,%ymm5 + vpand 416-128(%rsi),%ymm13,%ymm1 + vpor %ymm2,%ymm4,%ymm4 + vpand 448-128(%rsi),%ymm14,%ymm2 + vpor %ymm3,%ymm5,%ymm5 + vpand 480-128(%rsi),%ymm15,%ymm3 + leaq 512(%rsi),%rsi + vpor %ymm0,%ymm4,%ymm4 + vpor %ymm1,%ymm5,%ymm5 + vpor %ymm2,%ymm4,%ymm4 + vpor %ymm3,%ymm5,%ymm5 + + vpor %ymm5,%ymm4,%ymm4 + vextracti128 $1,%ymm4,%xmm5 + vpor %xmm4,%xmm5,%xmm5 + vpermd %ymm5,%ymm7,%ymm5 + vmovdqu %ymm5,(%rdi) + leaq 32(%rdi),%rdi + decl %edx + jnz .Loop_gather_1024 + + vpxor %ymm0,%ymm0,%ymm0 + vmovdqu %ymm0,(%rdi) + vzeroupper + leaq (%r11),%rsp +.cfi_def_cfa_register %rsp + .byte 0xf3,0xc3 +.cfi_endproc +.LSEH_end_rsaz_1024_gather5: +.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2 +.extern OPENSSL_ia32cap_P +.hidden OPENSSL_ia32cap_P +.globl rsaz_avx2_eligible +.hidden rsaz_avx2_eligible 
+.type rsaz_avx2_eligible,@function +.align 32 +rsaz_avx2_eligible: + leaq OPENSSL_ia32cap_P(%rip),%rax + movl 8(%rax),%eax + andl $32,%eax + shrl $5,%eax + .byte 0xf3,0xc3 +.size rsaz_avx2_eligible,.-rsaz_avx2_eligible + +.align 64 +.Land_mask: +.quad 0x1fffffff,0x1fffffff,0x1fffffff,-1 +.Lscatter_permd: +.long 0,2,4,6,7,7,7,7 +.Lgather_permd: +.long 0,7,1,7,2,7,3,7 +.Linc: +.long 0,0,0,0, 1,1,1,1 +.long 2,2,2,2, 3,3,3,3 +.long 4,4,4,4, 4,4,4,4 +.align 64 +#endif diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/sha/sha1-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha1-x86_64.S similarity index 98% rename from packager/third_party/boringssl/linux-x86_64/crypto/sha/sha1-x86_64.S rename to packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha1-x86_64.S index d830b534de..7f924dcc1e 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/sha/sha1-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha1-x86_64.S @@ -1,4 +1,4 @@ -#if defined(__x86_64__) +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) .text .extern OPENSSL_ia32cap_P .hidden OPENSSL_ia32cap_P @@ -8,9 +8,10 @@ .type sha1_block_data_order,@function .align 16 sha1_block_data_order: - movl OPENSSL_ia32cap_P+0(%rip),%r9d - movl OPENSSL_ia32cap_P+4(%rip),%r8d - movl OPENSSL_ia32cap_P+8(%rip),%r10d + leaq OPENSSL_ia32cap_P(%rip),%r10 + movl 0(%r10),%r9d + movl 4(%r10),%r8d + movl 8(%r10),%r10d testl $512,%r8d jz .Lialu andl $268435456,%r8d @@ -1241,14 +1242,13 @@ sha1_block_data_order: .align 16 sha1_block_data_order_ssse3: _ssse3_shortcut: - movq %rsp,%rax + movq %rsp,%r11 pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 leaq -64(%rsp),%rsp - movq %rax,%r14 andq $-64,%rsp movq %rdi,%r8 movq %rsi,%r9 @@ -1256,7 +1256,7 @@ _ssse3_shortcut: shlq $6,%r10 addq %r9,%r10 - leaq K_XX_XX+64(%rip),%r11 + leaq K_XX_XX+64(%rip),%r14 movl 0(%r8),%eax movl 4(%r8),%ebx @@ -1268,8 +1268,8 @@ _ssse3_shortcut: xorl %edx,%edi andl %edi,%esi - movdqa 64(%r11),%xmm6 - movdqa -64(%r11),%xmm9 + movdqa 64(%r14),%xmm6 + movdqa -64(%r14),%xmm9 movdqu 0(%r9),%xmm0 movdqu 16(%r9),%xmm1 movdqu 32(%r9),%xmm2 @@ -1345,7 +1345,7 @@ _ssse3_shortcut: pslld $2,%xmm9 pxor %xmm10,%xmm4 xorl %ebp,%edx - movdqa -64(%r11),%xmm10 + movdqa -64(%r14),%xmm10 roll $5,%ecx addl %edi,%ebx andl %edx,%esi @@ -1406,7 +1406,7 @@ _ssse3_shortcut: pslld $2,%xmm10 pxor %xmm8,%xmm5 xorl %eax,%ebp - movdqa -32(%r11),%xmm8 + movdqa -32(%r14),%xmm8 roll $5,%edx addl %edi,%ecx andl %ebp,%esi @@ -1467,7 +1467,7 @@ _ssse3_shortcut: pslld $2,%xmm8 pxor %xmm9,%xmm6 xorl %ebx,%eax - movdqa -32(%r11),%xmm9 + movdqa -32(%r14),%xmm9 roll $5,%ebp addl %edi,%edx andl %eax,%esi @@ -1528,7 +1528,7 @@ _ssse3_shortcut: pslld $2,%xmm9 pxor %xmm10,%xmm7 xorl %ecx,%ebx - movdqa -32(%r11),%xmm10 + movdqa -32(%r14),%xmm10 roll $5,%eax addl %edi,%ebp andl %ebx,%esi @@ -1639,7 +1639,7 @@ _ssse3_shortcut: pxor %xmm3,%xmm2 addl %esi,%eax xorl %edx,%edi - movdqa 0(%r11),%xmm10 + movdqa 0(%r14),%xmm10 rorl $7,%ecx paddd %xmm1,%xmm9 addl %ebx,%eax @@ -1874,7 +1874,7 @@ _ssse3_shortcut: pxor %xmm0,%xmm7 roll $5,%ebx addl %esi,%eax - movdqa 32(%r11),%xmm9 + movdqa 32(%r14),%xmm9 xorl %ecx,%edi paddd %xmm6,%xmm8 xorl %edx,%ecx @@ -2165,8 +2165,8 @@ _ssse3_shortcut: addl %edx,%ecx cmpq %r10,%r9 je .Ldone_ssse3 - movdqa 64(%r11),%xmm6 - movdqa -64(%r11),%xmm9 + movdqa 64(%r14),%xmm6 + movdqa -64(%r14),%xmm9 movdqu 0(%r9),%xmm0 movdqu 16(%r9),%xmm1 movdqu 32(%r9),%xmm2 @@ -2403,13 +2403,12 @@ _ssse3_shortcut: movl %ecx,8(%r8) 
movl %edx,12(%r8) movl %ebp,16(%r8) - leaq (%r14),%rsi - movq -40(%rsi),%r14 - movq -32(%rsi),%r13 - movq -24(%rsi),%r12 - movq -16(%rsi),%rbp - movq -8(%rsi),%rbx - leaq (%rsi),%rsp + movq -40(%r11),%r14 + movq -32(%r11),%r13 + movq -24(%r11),%r12 + movq -16(%r11),%rbp + movq -8(%r11),%rbx + leaq (%r11),%rsp .Lepilogue_ssse3: .byte 0xf3,0xc3 .size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3 @@ -2417,7 +2416,7 @@ _ssse3_shortcut: .align 16 sha1_block_data_order_avx: _avx_shortcut: - movq %rsp,%rax + movq %rsp,%r11 pushq %rbx pushq %rbp pushq %r12 @@ -2425,7 +2424,6 @@ _avx_shortcut: pushq %r14 leaq -64(%rsp),%rsp vzeroupper - movq %rax,%r14 andq $-64,%rsp movq %rdi,%r8 movq %rsi,%r9 @@ -2433,7 +2431,7 @@ _avx_shortcut: shlq $6,%r10 addq %r9,%r10 - leaq K_XX_XX+64(%rip),%r11 + leaq K_XX_XX+64(%rip),%r14 movl 0(%r8),%eax movl 4(%r8),%ebx @@ -2445,8 +2443,8 @@ _avx_shortcut: xorl %edx,%edi andl %edi,%esi - vmovdqa 64(%r11),%xmm6 - vmovdqa -64(%r11),%xmm11 + vmovdqa 64(%r14),%xmm6 + vmovdqa -64(%r14),%xmm11 vmovdqu 0(%r9),%xmm0 vmovdqu 16(%r9),%xmm1 vmovdqu 32(%r9),%xmm2 @@ -2571,7 +2569,7 @@ _avx_shortcut: vpxor %xmm10,%xmm5,%xmm5 xorl %eax,%ebp shldl $5,%edx,%edx - vmovdqa -32(%r11),%xmm11 + vmovdqa -32(%r14),%xmm11 addl %edi,%ecx andl %ebp,%esi xorl %eax,%ebp @@ -2784,7 +2782,7 @@ _avx_shortcut: addl %esi,%eax xorl %edx,%edi vpaddd %xmm1,%xmm11,%xmm9 - vmovdqa 0(%r11),%xmm11 + vmovdqa 0(%r14),%xmm11 shrdl $7,%ecx,%ecx addl %ebx,%eax vpxor %xmm8,%xmm2,%xmm2 @@ -3003,7 +3001,7 @@ _avx_shortcut: movl %ebx,%edi xorl %edx,%esi vpaddd %xmm6,%xmm11,%xmm9 - vmovdqa 32(%r11),%xmm11 + vmovdqa 32(%r14),%xmm11 shldl $5,%ebx,%ebx addl %esi,%eax vpxor %xmm8,%xmm7,%xmm7 @@ -3282,8 +3280,8 @@ _avx_shortcut: addl %edx,%ecx cmpq %r10,%r9 je .Ldone_avx - vmovdqa 64(%r11),%xmm6 - vmovdqa -64(%r11),%xmm11 + vmovdqa 64(%r14),%xmm6 + vmovdqa -64(%r14),%xmm11 vmovdqu 0(%r9),%xmm0 vmovdqu 16(%r9),%xmm1 vmovdqu 32(%r9),%xmm2 @@ -3519,13 +3517,12 @@ _avx_shortcut: movl %ecx,8(%r8) movl %edx,12(%r8) movl %ebp,16(%r8) - leaq (%r14),%rsi - movq -40(%rsi),%r14 - movq -32(%rsi),%r13 - movq -24(%rsi),%r12 - movq -16(%rsi),%rbp - movq -8(%rsi),%rbx - leaq (%rsi),%rsp + movq -40(%r11),%r14 + movq -32(%r11),%r13 + movq -24(%r11),%r12 + movq -16(%r11),%rbp + movq -8(%r11),%rbx + leaq (%r11),%rsp .Lepilogue_avx: .byte 0xf3,0xc3 .size sha1_block_data_order_avx,.-sha1_block_data_order_avx diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/sha/sha256-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S similarity index 99% rename from packager/third_party/boringssl/linux-x86_64/crypto/sha/sha256-x86_64.S rename to packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S index 445b497e88..62534be495 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/sha/sha256-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S @@ -1,4 +1,4 @@ -#if defined(__x86_64__) +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) .text .extern OPENSSL_ia32cap_P @@ -19,13 +19,13 @@ sha256_block_data_order: je .Lavx_shortcut testl $512,%r10d jnz .Lssse3_shortcut + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 - movq %rsp,%r11 shlq $4,%rdx subq $64+32,%rsp leaq (%rsi,%rdx,4),%rdx @@ -33,7 +33,7 @@ sha256_block_data_order: movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movq %r11,64+24(%rsp) + movq %rax,64+24(%rsp) .Lprologue: movl 0(%rdi),%eax @@ -1698,13 +1698,13 @@ 
sha256_block_data_order: jb .Lloop movq 64+24(%rsp),%rsi - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp .Lepilogue: .byte 0xf3,0xc3 .size sha256_block_data_order,.-sha256_block_data_order @@ -1755,13 +1755,13 @@ K256: .align 64 sha256_block_data_order_ssse3: .Lssse3_shortcut: + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 - movq %rsp,%r11 shlq $4,%rdx subq $96,%rsp leaq (%rsi,%rdx,4),%rdx @@ -1769,7 +1769,7 @@ sha256_block_data_order_ssse3: movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movq %r11,64+24(%rsp) + movq %rax,64+24(%rsp) .Lprologue_ssse3: movl 0(%rdi),%eax @@ -2836,13 +2836,13 @@ sha256_block_data_order_ssse3: jb .Lloop_ssse3 movq 64+24(%rsp),%rsi - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp .Lepilogue_ssse3: .byte 0xf3,0xc3 .size sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3 @@ -2850,13 +2850,13 @@ sha256_block_data_order_ssse3: .align 64 sha256_block_data_order_avx: .Lavx_shortcut: + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 - movq %rsp,%r11 shlq $4,%rdx subq $96,%rsp leaq (%rsi,%rdx,4),%rdx @@ -2864,7 +2864,7 @@ sha256_block_data_order_avx: movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movq %r11,64+24(%rsp) + movq %rax,64+24(%rsp) .Lprologue_avx: vzeroupper @@ -3893,13 +3893,13 @@ sha256_block_data_order_avx: movq 64+24(%rsp),%rsi vzeroupper - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp .Lepilogue_avx: .byte 0xf3,0xc3 .size sha256_block_data_order_avx,.-sha256_block_data_order_avx diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/sha/sha512-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S similarity index 99% rename from packager/third_party/boringssl/linux-x86_64/crypto/sha/sha512-x86_64.S rename to packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S index d65743fd52..1f1793bb0f 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/sha/sha512-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S @@ -1,4 +1,4 @@ -#if defined(__x86_64__) +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) .text .extern OPENSSL_ia32cap_P @@ -19,13 +19,13 @@ sha512_block_data_order: orl %r9d,%r10d cmpl $1342177792,%r10d je .Lavx_shortcut + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 - movq %rsp,%r11 shlq $4,%rdx subq $128+32,%rsp leaq (%rsi,%rdx,8),%rdx @@ -33,7 +33,7 @@ sha512_block_data_order: movq %rdi,128+0(%rsp) movq %rsi,128+8(%rsp) movq %rdx,128+16(%rsp) - movq %r11,128+24(%rsp) + movq %rax,128+24(%rsp) .Lprologue: movq 0(%rdi),%rax @@ -1698,13 +1698,13 @@ sha512_block_data_order: jb .Lloop movq 128+24(%rsp),%rsi - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 
16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp .Lepilogue: .byte 0xf3,0xc3 .size sha512_block_data_order,.-sha512_block_data_order @@ -1799,13 +1799,13 @@ K512: .align 64 sha512_block_data_order_xop: .Lxop_shortcut: + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 - movq %rsp,%r11 shlq $4,%rdx subq $160,%rsp leaq (%rsi,%rdx,8),%rdx @@ -1813,7 +1813,7 @@ sha512_block_data_order_xop: movq %rdi,128+0(%rsp) movq %rsi,128+8(%rsp) movq %rdx,128+16(%rsp) - movq %r11,128+24(%rsp) + movq %rax,128+24(%rsp) .Lprologue_xop: vzeroupper @@ -2868,13 +2868,13 @@ sha512_block_data_order_xop: movq 128+24(%rsp),%rsi vzeroupper - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp .Lepilogue_xop: .byte 0xf3,0xc3 .size sha512_block_data_order_xop,.-sha512_block_data_order_xop @@ -2882,13 +2882,13 @@ sha512_block_data_order_xop: .align 64 sha512_block_data_order_avx: .Lavx_shortcut: + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 - movq %rsp,%r11 shlq $4,%rdx subq $160,%rsp leaq (%rsi,%rdx,8),%rdx @@ -2896,7 +2896,7 @@ sha512_block_data_order_avx: movq %rdi,128+0(%rsp) movq %rsi,128+8(%rsp) movq %rdx,128+16(%rsp) - movq %r11,128+24(%rsp) + movq %rax,128+24(%rsp) .Lprologue_avx: vzeroupper @@ -4015,13 +4015,13 @@ sha512_block_data_order_avx: movq 128+24(%rsp),%rsi vzeroupper - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp .Lepilogue_avx: .byte 0xf3,0xc3 .size sha512_block_data_order_avx,.-sha512_block_data_order_avx diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/aes/vpaes-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S similarity index 99% rename from packager/third_party/boringssl/linux-x86_64/crypto/aes/vpaes-x86_64.S rename to packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S index 4dfafa97ea..f3a089de9c 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/aes/vpaes-x86_64.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S @@ -1,4 +1,4 @@ -#if defined(__x86_64__) +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) .text diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/bn/x86_64-mont.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont.S similarity index 76% rename from packager/third_party/boringssl/linux-x86_64/crypto/bn/x86_64-mont.S rename to packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont.S index 83926ad789..b32e2f0ef4 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/bn/x86_64-mont.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont.S @@ -1,4 +1,4 @@ -#if defined(__x86_64__) +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) .text .extern OPENSSL_ia32cap_P @@ -9,6 +9,10 @@ .type bn_mul_mont,@function .align 16 bn_mul_mont: 
+.cfi_startproc + movl %r9d,%r9d + movq %rsp,%rax +.cfi_def_cfa_register %rax testl $3,%r9d jnz .Lmul_enter cmpl $8,%r9d @@ -22,20 +26,50 @@ bn_mul_mont: .align 16 .Lmul_enter: pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 - movl %r9d,%r9d - leaq 2(%r9),%r10 + negq %r9 movq %rsp,%r11 - negq %r10 - leaq (%rsp,%r10,8),%rsp - andq $-1024,%rsp + leaq -16(%rsp,%r9,8),%r10 + negq %r9 + andq $-1024,%r10 - movq %r11,8(%rsp,%r9,8) + + + + + + + + + subq %r10,%r11 + andq $-4096,%r11 + leaq (%r10,%r11,1),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja .Lmul_page_walk + jmp .Lmul_page_walk_done + +.align 16 +.Lmul_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja .Lmul_page_walk +.Lmul_page_walk_done: + + movq %rax,8(%rsp,%r9,8) +.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08 .Lmul_body: movq %rdx,%r12 movq (%r8),%r8 @@ -178,7 +212,8 @@ bn_mul_mont: movq %r9,%r15 jmp .Lsub .align 16 -.Lsub: sbbq (%rcx,%r14,8),%rax +.Lsub: + sbbq (%rcx,%r14,8),%rax movq %rax,(%rdi,%r14,8) movq 8(%rsi,%r14,8),%rax leaq 1(%r14),%r14 @@ -187,51 +222,86 @@ bn_mul_mont: sbbq $0,%rax xorq %r14,%r14 + andq %rax,%rsi + notq %rax + movq %rdi,%rcx + andq %rax,%rcx movq %r9,%r15 + orq %rcx,%rsi .align 16 .Lcopy: - movq (%rsp,%r14,8),%rsi - movq (%rdi,%r14,8),%rcx - xorq %rcx,%rsi - andq %rax,%rsi - xorq %rcx,%rsi + movq (%rsi,%r14,8),%rax movq %r14,(%rsp,%r14,8) - movq %rsi,(%rdi,%r14,8) + movq %rax,(%rdi,%r14,8) leaq 1(%r14),%r14 subq $1,%r15 jnz .Lcopy movq 8(%rsp,%r9,8),%rsi +.cfi_def_cfa %rsi,8 movq $1,%rax - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lmul_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size bn_mul_mont,.-bn_mul_mont .type bn_mul4x_mont,@function .align 16 bn_mul4x_mont: +.cfi_startproc + movl %r9d,%r9d + movq %rsp,%rax +.cfi_def_cfa_register %rax .Lmul4x_enter: pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 - movl %r9d,%r9d - leaq 4(%r9),%r10 + negq %r9 movq %rsp,%r11 - negq %r10 - leaq (%rsp,%r10,8),%rsp - andq $-1024,%rsp + leaq -32(%rsp,%r9,8),%r10 + negq %r9 + andq $-1024,%r10 - movq %r11,8(%rsp,%r9,8) + subq %r10,%r11 + andq $-4096,%r11 + leaq (%r10,%r11,1),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja .Lmul4x_page_walk + jmp .Lmul4x_page_walk_done + +.Lmul4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja .Lmul4x_page_walk +.Lmul4x_page_walk_done: + + movq %rax,8(%rsp,%r9,8) +.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08 .Lmul4x_body: movq %rdi,16(%rsp,%r9,8) movq %rdx,%r12 @@ -531,9 +601,11 @@ bn_mul4x_mont: cmpq %r9,%r14 jb .Louter4x movq 16(%rsp,%r9,8),%rdi + leaq -4(%r9),%r15 movq 0(%rsp),%rax + pxor %xmm0,%xmm0 movq 8(%rsp),%rdx - shrq $2,%r9 + shrq $2,%r15 leaq (%rsp),%rsi xorq %r14,%r14 @@ -541,7 +613,6 @@ bn_mul4x_mont: movq 16(%rsi),%rbx movq 24(%rsi),%rbp sbbq 8(%rcx),%rdx - leaq -1(%r9),%r15 jmp .Lsub4x .align 16 
.Lsub4x: @@ -569,47 +640,55 @@ bn_mul4x_mont: movq %rbx,16(%rdi,%r14,8) sbbq $0,%rax - movq %rax,%xmm0 - punpcklqdq %xmm0,%xmm0 movq %rbp,24(%rdi,%r14,8) xorq %r14,%r14 + andq %rax,%rsi + notq %rax + movq %rdi,%rcx + andq %rax,%rcx + leaq -4(%r9),%r15 + orq %rcx,%rsi + shrq $2,%r15 - movq %r9,%r15 - pxor %xmm5,%xmm5 + movdqu (%rsi),%xmm1 + movdqa %xmm0,(%rsp) + movdqu %xmm1,(%rdi) jmp .Lcopy4x .align 16 .Lcopy4x: - movdqu (%rsp,%r14,1),%xmm2 - movdqu 16(%rsp,%r14,1),%xmm4 - movdqu (%rdi,%r14,1),%xmm1 - movdqu 16(%rdi,%r14,1),%xmm3 - pxor %xmm1,%xmm2 - pxor %xmm3,%xmm4 - pand %xmm0,%xmm2 - pand %xmm0,%xmm4 - pxor %xmm1,%xmm2 - pxor %xmm3,%xmm4 - movdqu %xmm2,(%rdi,%r14,1) - movdqu %xmm4,16(%rdi,%r14,1) - movdqa %xmm5,(%rsp,%r14,1) - movdqa %xmm5,16(%rsp,%r14,1) - + movdqu 16(%rsi,%r14,1),%xmm2 + movdqu 32(%rsi,%r14,1),%xmm1 + movdqa %xmm0,16(%rsp,%r14,1) + movdqu %xmm2,16(%rdi,%r14,1) + movdqa %xmm0,32(%rsp,%r14,1) + movdqu %xmm1,32(%rdi,%r14,1) leaq 32(%r14),%r14 decq %r15 jnz .Lcopy4x - shlq $2,%r9 + movdqu 16(%rsi,%r14,1),%xmm2 + movdqa %xmm0,16(%rsp,%r14,1) + movdqu %xmm2,16(%rdi,%r14,1) movq 8(%rsp,%r9,8),%rsi +.cfi_def_cfa %rsi, 8 movq $1,%rax - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lmul4x_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size bn_mul4x_mont,.-bn_mul4x_mont .extern bn_sqr8x_internal .hidden bn_sqr8x_internal @@ -617,14 +696,23 @@ bn_mul4x_mont: .type bn_sqr8x_mont,@function .align 32 bn_sqr8x_mont: -.Lsqr8x_enter: +.cfi_startproc movq %rsp,%rax +.cfi_def_cfa_register %rax +.Lsqr8x_enter: pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 +.Lsqr8x_prologue: movl %r9d,%r10d shll $3,%r9d @@ -637,30 +725,49 @@ bn_sqr8x_mont: leaq -64(%rsp,%r9,2),%r11 + movq %rsp,%rbp movq (%r8),%r8 subq %rsi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb .Lsqr8x_sp_alt - subq %r11,%rsp - leaq -64(%rsp,%r9,2),%rsp + subq %r11,%rbp + leaq -64(%rbp,%r9,2),%rbp jmp .Lsqr8x_sp_done .align 32 .Lsqr8x_sp_alt: leaq 4096-64(,%r9,2),%r10 - leaq -64(%rsp,%r9,2),%rsp + leaq -64(%rbp,%r9,2),%rbp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 - subq %r11,%rsp + subq %r11,%rbp .Lsqr8x_sp_done: - andq $-64,%rsp + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lsqr8x_page_walk + jmp .Lsqr8x_page_walk_done + +.align 16 +.Lsqr8x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lsqr8x_page_walk +.Lsqr8x_page_walk_done: + movq %r9,%r10 negq %r9 movq %r8,32(%rsp) movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 .Lsqr8x_body: .byte 102,72,15,110,209 @@ -707,6 +814,7 @@ bn_sqr8x_mont: pxor %xmm0,%xmm0 pshufd $0,%xmm1,%xmm1 movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 jmp .Lsqr8x_cond_copy .align 32 @@ -736,14 +844,22 @@ bn_sqr8x_mont: movq $1,%rax movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore 
%rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lsqr8x_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size bn_sqr8x_mont,.-bn_sqr8x_mont .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 16 diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/bn/x86_64-mont5.S b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont5.S similarity index 91% rename from packager/third_party/boringssl/linux-x86_64/crypto/bn/x86_64-mont5.S rename to packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont5.S index 554df1ffac..208b1dca3e 100644 --- a/packager/third_party/boringssl/linux-x86_64/crypto/bn/x86_64-mont5.S +++ b/packager/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont5.S @@ -1,4 +1,4 @@ -#if defined(__x86_64__) +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) .text .extern OPENSSL_ia32cap_P @@ -9,30 +9,64 @@ .type bn_mul_mont_gather5,@function .align 64 bn_mul_mont_gather5: +.cfi_startproc + movl %r9d,%r9d + movq %rsp,%rax +.cfi_def_cfa_register %rax testl $7,%r9d jnz .Lmul_enter jmp .Lmul4x_enter .align 16 .Lmul_enter: - movl %r9d,%r9d - movq %rsp,%rax movd 8(%rsp),%xmm5 - leaq .Linc(%rip),%r10 pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 - leaq 2(%r9),%r11 - negq %r11 - leaq -264(%rsp,%r11,8),%rsp - andq $-1024,%rsp + negq %r9 + movq %rsp,%r11 + leaq -280(%rsp,%r9,8),%r10 + negq %r9 + andq $-1024,%r10 + + + + + + + + + subq %r10,%r11 + andq $-4096,%r11 + leaq (%r10,%r11,1),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja .Lmul_page_walk + jmp .Lmul_page_walk_done + +.Lmul_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja .Lmul_page_walk +.Lmul_page_walk_done: + + leaq .Linc(%rip),%r10 movq %rax,8(%rsp,%r9,8) +.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08 .Lmul_body: + leaq 128(%rdx),%r12 movdqa 0(%r10),%xmm0 movdqa 16(%r10),%xmm1 @@ -362,7 +396,8 @@ bn_mul_mont_gather5: movq %r9,%r15 jmp .Lsub .align 16 -.Lsub: sbbq (%rcx,%r14,8),%rax +.Lsub: + sbbq (%rcx,%r14,8),%rax movq %rax,(%rdi,%r14,8) movq 8(%rsi,%r14,8),%rax leaq 1(%r14),%r14 @@ -371,45 +406,64 @@ bn_mul_mont_gather5: sbbq $0,%rax xorq %r14,%r14 + andq %rax,%rsi + notq %rax + movq %rdi,%rcx + andq %rax,%rcx movq %r9,%r15 + orq %rcx,%rsi .align 16 .Lcopy: - movq (%rsp,%r14,8),%rsi - movq (%rdi,%r14,8),%rcx - xorq %rcx,%rsi - andq %rax,%rsi - xorq %rcx,%rsi + movq (%rsi,%r14,8),%rax movq %r14,(%rsp,%r14,8) - movq %rsi,(%rdi,%r14,8) + movq %rax,(%rdi,%r14,8) leaq 1(%r14),%r14 subq $1,%r15 jnz .Lcopy movq 8(%rsp,%r9,8),%rsi +.cfi_def_cfa %rsi,8 movq $1,%rax movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lmul_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size bn_mul_mont_gather5,.-bn_mul_mont_gather5 .type bn_mul4x_mont_gather5,@function .align 32 bn_mul4x_mont_gather5: -.Lmul4x_enter: +.cfi_startproc .byte 0x67 movq %rsp,%rax +.cfi_def_cfa_register %rax +.Lmul4x_enter: pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 
+.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 +.Lmul4x_prologue: .byte 0x67 shll $3,%r9d @@ -426,43 +480,70 @@ bn_mul4x_mont_gather5: leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb .Lmul4xsp_alt - subq %r11,%rsp - leaq -320(%rsp,%r9,2),%rsp + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp jmp .Lmul4xsp_done .align 32 .Lmul4xsp_alt: leaq 4096-320(,%r9,2),%r10 - leaq -320(%rsp,%r9,2),%rsp + leaq -320(%rbp,%r9,2),%rbp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 - subq %r11,%rsp + subq %r11,%rbp .Lmul4xsp_done: - andq $-64,%rsp + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmul4x_page_walk + jmp .Lmul4x_page_walk_done + +.Lmul4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmul4x_page_walk +.Lmul4x_page_walk_done: + negq %r9 movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 .Lmul4x_body: call mul4x_internal movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 movq $1,%rax movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lmul4x_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 .type mul4x_internal,@function @@ -995,13 +1076,22 @@ mul4x_internal: .type bn_power5,@function .align 32 bn_power5: +.cfi_startproc movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 +.Lpower5_prologue: shll $3,%r9d leal (%r9,%r9,2),%r10d @@ -1016,24 +1106,41 @@ bn_power5: leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb .Lpwr_sp_alt - subq %r11,%rsp - leaq -320(%rsp,%r9,2),%rsp + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp jmp .Lpwr_sp_done .align 32 .Lpwr_sp_alt: leaq 4096-320(,%r9,2),%r10 - leaq -320(%rsp,%r9,2),%rsp + leaq -320(%rbp,%r9,2),%rbp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 - subq %r11,%rsp + subq %r11,%rbp .Lpwr_sp_done: - andq $-64,%rsp + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lpwr_page_walk + jmp .Lpwr_page_walk_done + +.Lpwr_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lpwr_page_walk +.Lpwr_page_walk_done: + movq %r9,%r10 negq %r9 @@ -1048,6 +1155,7 @@ bn_power5: movq %r8,32(%rsp) movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 .Lpower5_body: .byte 102,72,15,110,207 .byte 102,72,15,110,209 @@ -1074,16 +1182,25 @@ bn_power5: call mul4x_internal movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 movq $1,%rax movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lpower5_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size bn_power5,.-bn_power5 .globl bn_sqr8x_internal @@ -1826,6 +1943,7 @@ __bn_sqr8x_reduction: .align 32 .L8x_tail_done: + xorq %rax,%rax addq (%rdx),%r8 adcq $0,%r9 adcq $0,%r10 @@ -1834,9 +1952,7 @@ 
__bn_sqr8x_reduction: adcq $0,%r13 adcq $0,%r14 adcq $0,%r15 - - - xorq %rax,%rax + adcq $0,%rax negq %rsi .L8x_no_tail: @@ -1937,14 +2053,23 @@ bn_from_montgomery: .type bn_from_mont8x,@function .align 32 bn_from_mont8x: +.cfi_startproc .byte 0x67 movq %rsp,%rax +.cfi_def_cfa_register %rax pushq %rbx +.cfi_offset %rbx,-16 pushq %rbp +.cfi_offset %rbp,-24 pushq %r12 +.cfi_offset %r12,-32 pushq %r13 +.cfi_offset %r13,-40 pushq %r14 +.cfi_offset %r14,-48 pushq %r15 +.cfi_offset %r15,-56 +.Lfrom_prologue: shll $3,%r9d leaq (%r9,%r9,2),%r10 @@ -1959,24 +2084,41 @@ bn_from_mont8x: leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb .Lfrom_sp_alt - subq %r11,%rsp - leaq -320(%rsp,%r9,2),%rsp + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp jmp .Lfrom_sp_done .align 32 .Lfrom_sp_alt: leaq 4096-320(,%r9,2),%r10 - leaq -320(%rsp,%r9,2),%rsp + leaq -320(%rbp,%r9,2),%rbp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 - subq %r11,%rsp + subq %r11,%rbp .Lfrom_sp_done: - andq $-64,%rsp + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lfrom_page_walk + jmp .Lfrom_page_walk_done + +.Lfrom_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lfrom_page_walk +.Lfrom_page_walk_done: + movq %r9,%r10 negq %r9 @@ -1991,6 +2133,7 @@ bn_from_mont8x: movq %r8,32(%rsp) movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 .Lfrom_body: movq %r9,%r11 leaq 48(%rsp),%rax @@ -2026,11 +2169,12 @@ bn_from_mont8x: pxor %xmm0,%xmm0 leaq 48(%rsp),%rax - movq 40(%rsp),%rsi jmp .Lfrom_mont_zero .align 32 .Lfrom_mont_zero: + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 movdqa %xmm0,0(%rax) movdqa %xmm0,16(%rax) movdqa %xmm0,32(%rax) @@ -2041,14 +2185,22 @@ bn_from_mont8x: movq $1,%rax movq -48(%rsi),%r15 +.cfi_restore %r15 movq -40(%rsi),%r14 +.cfi_restore %r14 movq -32(%rsi),%r13 +.cfi_restore %r13 movq -24(%rsi),%r12 +.cfi_restore %r12 movq -16(%rsi),%rbp +.cfi_restore %rbp movq -8(%rsi),%rbx +.cfi_restore %rbx leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lfrom_epilogue: .byte 0xf3,0xc3 +.cfi_endproc .size bn_from_mont8x,.-bn_from_mont8x .globl bn_scatter5 .hidden bn_scatter5 diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/modes/aesni-gcm-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/modes/aesni-gcm-x86_64.S deleted file mode 100644 index f01692e0b1..0000000000 --- a/packager/third_party/boringssl/linux-x86_64/crypto/modes/aesni-gcm-x86_64.S +++ /dev/null @@ -1,19 +0,0 @@ -#if defined(__x86_64__) -.text - -.globl aesni_gcm_encrypt -.hidden aesni_gcm_encrypt -.type aesni_gcm_encrypt,@function -aesni_gcm_encrypt: - xorl %eax,%eax - .byte 0xf3,0xc3 -.size aesni_gcm_encrypt,.-aesni_gcm_encrypt - -.globl aesni_gcm_decrypt -.hidden aesni_gcm_decrypt -.type aesni_gcm_decrypt,@function -aesni_gcm_decrypt: - xorl %eax,%eax - .byte 0xf3,0xc3 -.size aesni_gcm_decrypt,.-aesni_gcm_decrypt -#endif diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/rc4/rc4-md5-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/rc4/rc4-md5-x86_64.S deleted file mode 100644 index 06c8d672ab..0000000000 --- a/packager/third_party/boringssl/linux-x86_64/crypto/rc4/rc4-md5-x86_64.S +++ /dev/null @@ -1,1262 +0,0 @@ -#if defined(__x86_64__) -.text -.align 16 - -.globl rc4_md5_enc -.hidden rc4_md5_enc -.type rc4_md5_enc,@function -rc4_md5_enc: - cmpq $0,%r9 - je .Labort - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - subq 
$40,%rsp -.Lbody: - movq %rcx,%r11 - movq %r9,%r12 - movq %rsi,%r13 - movq %rdx,%r14 - movq %r8,%r15 - xorq %rbp,%rbp - xorq %rcx,%rcx - - leaq 8(%rdi),%rdi - movb -8(%rdi),%bpl - movb -4(%rdi),%cl - - incb %bpl - subq %r13,%r14 - movl (%rdi,%rbp,4),%eax - addb %al,%cl - leaq (%rdi,%rbp,4),%rsi - shlq $6,%r12 - addq %r15,%r12 - movq %r12,16(%rsp) - - movq %r11,24(%rsp) - movl 0(%r11),%r8d - movl 4(%r11),%r9d - movl 8(%r11),%r10d - movl 12(%r11),%r11d - jmp .Loop - -.align 16 -.Loop: - movl %r8d,0(%rsp) - movl %r9d,4(%rsp) - movl %r10d,8(%rsp) - movl %r11d,%r12d - movl %r11d,12(%rsp) - pxor %xmm0,%xmm0 - movl (%rdi,%rcx,4),%edx - xorl %r10d,%r12d - movl %eax,(%rdi,%rcx,4) - andl %r9d,%r12d - addl 0(%r15),%r8d - addb %dl,%al - movl 4(%rsi),%ebx - addl $3614090360,%r8d - xorl %r11d,%r12d - movzbl %al,%eax - movl %edx,0(%rsi) - addl %r12d,%r8d - addb %bl,%cl - roll $7,%r8d - movl %r10d,%r12d - movd (%rdi,%rax,4),%xmm0 - - addl %r9d,%r8d - pxor %xmm1,%xmm1 - movl (%rdi,%rcx,4),%edx - xorl %r9d,%r12d - movl %ebx,(%rdi,%rcx,4) - andl %r8d,%r12d - addl 4(%r15),%r11d - addb %dl,%bl - movl 8(%rsi),%eax - addl $3905402710,%r11d - xorl %r10d,%r12d - movzbl %bl,%ebx - movl %edx,4(%rsi) - addl %r12d,%r11d - addb %al,%cl - roll $12,%r11d - movl %r9d,%r12d - movd (%rdi,%rbx,4),%xmm1 - - addl %r8d,%r11d - movl (%rdi,%rcx,4),%edx - xorl %r8d,%r12d - movl %eax,(%rdi,%rcx,4) - andl %r11d,%r12d - addl 8(%r15),%r10d - addb %dl,%al - movl 12(%rsi),%ebx - addl $606105819,%r10d - xorl %r9d,%r12d - movzbl %al,%eax - movl %edx,8(%rsi) - addl %r12d,%r10d - addb %bl,%cl - roll $17,%r10d - movl %r8d,%r12d - pinsrw $1,(%rdi,%rax,4),%xmm0 - - addl %r11d,%r10d - movl (%rdi,%rcx,4),%edx - xorl %r11d,%r12d - movl %ebx,(%rdi,%rcx,4) - andl %r10d,%r12d - addl 12(%r15),%r9d - addb %dl,%bl - movl 16(%rsi),%eax - addl $3250441966,%r9d - xorl %r8d,%r12d - movzbl %bl,%ebx - movl %edx,12(%rsi) - addl %r12d,%r9d - addb %al,%cl - roll $22,%r9d - movl %r11d,%r12d - pinsrw $1,(%rdi,%rbx,4),%xmm1 - - addl %r10d,%r9d - movl (%rdi,%rcx,4),%edx - xorl %r10d,%r12d - movl %eax,(%rdi,%rcx,4) - andl %r9d,%r12d - addl 16(%r15),%r8d - addb %dl,%al - movl 20(%rsi),%ebx - addl $4118548399,%r8d - xorl %r11d,%r12d - movzbl %al,%eax - movl %edx,16(%rsi) - addl %r12d,%r8d - addb %bl,%cl - roll $7,%r8d - movl %r10d,%r12d - pinsrw $2,(%rdi,%rax,4),%xmm0 - - addl %r9d,%r8d - movl (%rdi,%rcx,4),%edx - xorl %r9d,%r12d - movl %ebx,(%rdi,%rcx,4) - andl %r8d,%r12d - addl 20(%r15),%r11d - addb %dl,%bl - movl 24(%rsi),%eax - addl $1200080426,%r11d - xorl %r10d,%r12d - movzbl %bl,%ebx - movl %edx,20(%rsi) - addl %r12d,%r11d - addb %al,%cl - roll $12,%r11d - movl %r9d,%r12d - pinsrw $2,(%rdi,%rbx,4),%xmm1 - - addl %r8d,%r11d - movl (%rdi,%rcx,4),%edx - xorl %r8d,%r12d - movl %eax,(%rdi,%rcx,4) - andl %r11d,%r12d - addl 24(%r15),%r10d - addb %dl,%al - movl 28(%rsi),%ebx - addl $2821735955,%r10d - xorl %r9d,%r12d - movzbl %al,%eax - movl %edx,24(%rsi) - addl %r12d,%r10d - addb %bl,%cl - roll $17,%r10d - movl %r8d,%r12d - pinsrw $3,(%rdi,%rax,4),%xmm0 - - addl %r11d,%r10d - movl (%rdi,%rcx,4),%edx - xorl %r11d,%r12d - movl %ebx,(%rdi,%rcx,4) - andl %r10d,%r12d - addl 28(%r15),%r9d - addb %dl,%bl - movl 32(%rsi),%eax - addl $4249261313,%r9d - xorl %r8d,%r12d - movzbl %bl,%ebx - movl %edx,28(%rsi) - addl %r12d,%r9d - addb %al,%cl - roll $22,%r9d - movl %r11d,%r12d - pinsrw $3,(%rdi,%rbx,4),%xmm1 - - addl %r10d,%r9d - movl (%rdi,%rcx,4),%edx - xorl %r10d,%r12d - movl %eax,(%rdi,%rcx,4) - andl %r9d,%r12d - addl 32(%r15),%r8d - addb %dl,%al - movl 36(%rsi),%ebx - addl 
$1770035416,%r8d - xorl %r11d,%r12d - movzbl %al,%eax - movl %edx,32(%rsi) - addl %r12d,%r8d - addb %bl,%cl - roll $7,%r8d - movl %r10d,%r12d - pinsrw $4,(%rdi,%rax,4),%xmm0 - - addl %r9d,%r8d - movl (%rdi,%rcx,4),%edx - xorl %r9d,%r12d - movl %ebx,(%rdi,%rcx,4) - andl %r8d,%r12d - addl 36(%r15),%r11d - addb %dl,%bl - movl 40(%rsi),%eax - addl $2336552879,%r11d - xorl %r10d,%r12d - movzbl %bl,%ebx - movl %edx,36(%rsi) - addl %r12d,%r11d - addb %al,%cl - roll $12,%r11d - movl %r9d,%r12d - pinsrw $4,(%rdi,%rbx,4),%xmm1 - - addl %r8d,%r11d - movl (%rdi,%rcx,4),%edx - xorl %r8d,%r12d - movl %eax,(%rdi,%rcx,4) - andl %r11d,%r12d - addl 40(%r15),%r10d - addb %dl,%al - movl 44(%rsi),%ebx - addl $4294925233,%r10d - xorl %r9d,%r12d - movzbl %al,%eax - movl %edx,40(%rsi) - addl %r12d,%r10d - addb %bl,%cl - roll $17,%r10d - movl %r8d,%r12d - pinsrw $5,(%rdi,%rax,4),%xmm0 - - addl %r11d,%r10d - movl (%rdi,%rcx,4),%edx - xorl %r11d,%r12d - movl %ebx,(%rdi,%rcx,4) - andl %r10d,%r12d - addl 44(%r15),%r9d - addb %dl,%bl - movl 48(%rsi),%eax - addl $2304563134,%r9d - xorl %r8d,%r12d - movzbl %bl,%ebx - movl %edx,44(%rsi) - addl %r12d,%r9d - addb %al,%cl - roll $22,%r9d - movl %r11d,%r12d - pinsrw $5,(%rdi,%rbx,4),%xmm1 - - addl %r10d,%r9d - movl (%rdi,%rcx,4),%edx - xorl %r10d,%r12d - movl %eax,(%rdi,%rcx,4) - andl %r9d,%r12d - addl 48(%r15),%r8d - addb %dl,%al - movl 52(%rsi),%ebx - addl $1804603682,%r8d - xorl %r11d,%r12d - movzbl %al,%eax - movl %edx,48(%rsi) - addl %r12d,%r8d - addb %bl,%cl - roll $7,%r8d - movl %r10d,%r12d - pinsrw $6,(%rdi,%rax,4),%xmm0 - - addl %r9d,%r8d - movl (%rdi,%rcx,4),%edx - xorl %r9d,%r12d - movl %ebx,(%rdi,%rcx,4) - andl %r8d,%r12d - addl 52(%r15),%r11d - addb %dl,%bl - movl 56(%rsi),%eax - addl $4254626195,%r11d - xorl %r10d,%r12d - movzbl %bl,%ebx - movl %edx,52(%rsi) - addl %r12d,%r11d - addb %al,%cl - roll $12,%r11d - movl %r9d,%r12d - pinsrw $6,(%rdi,%rbx,4),%xmm1 - - addl %r8d,%r11d - movl (%rdi,%rcx,4),%edx - xorl %r8d,%r12d - movl %eax,(%rdi,%rcx,4) - andl %r11d,%r12d - addl 56(%r15),%r10d - addb %dl,%al - movl 60(%rsi),%ebx - addl $2792965006,%r10d - xorl %r9d,%r12d - movzbl %al,%eax - movl %edx,56(%rsi) - addl %r12d,%r10d - addb %bl,%cl - roll $17,%r10d - movl %r8d,%r12d - pinsrw $7,(%rdi,%rax,4),%xmm0 - - addl %r11d,%r10d - movdqu (%r13),%xmm2 - movl (%rdi,%rcx,4),%edx - xorl %r11d,%r12d - movl %ebx,(%rdi,%rcx,4) - andl %r10d,%r12d - addl 60(%r15),%r9d - addb %dl,%bl - movl 64(%rsi),%eax - addl $1236535329,%r9d - xorl %r8d,%r12d - movzbl %bl,%ebx - movl %edx,60(%rsi) - addl %r12d,%r9d - addb %al,%cl - roll $22,%r9d - movl %r10d,%r12d - pinsrw $7,(%rdi,%rbx,4),%xmm1 - - addl %r10d,%r9d - psllq $8,%xmm1 - pxor %xmm0,%xmm2 - pxor %xmm1,%xmm2 - pxor %xmm0,%xmm0 - movl (%rdi,%rcx,4),%edx - xorl %r9d,%r12d - movl %eax,(%rdi,%rcx,4) - andl %r11d,%r12d - addl 4(%r15),%r8d - addb %dl,%al - movl 68(%rsi),%ebx - addl $4129170786,%r8d - xorl %r10d,%r12d - movzbl %al,%eax - movl %edx,64(%rsi) - addl %r12d,%r8d - addb %bl,%cl - roll $5,%r8d - movl %r9d,%r12d - movd (%rdi,%rax,4),%xmm0 - - addl %r9d,%r8d - pxor %xmm1,%xmm1 - movl (%rdi,%rcx,4),%edx - xorl %r8d,%r12d - movl %ebx,(%rdi,%rcx,4) - andl %r10d,%r12d - addl 24(%r15),%r11d - addb %dl,%bl - movl 72(%rsi),%eax - addl $3225465664,%r11d - xorl %r9d,%r12d - movzbl %bl,%ebx - movl %edx,68(%rsi) - addl %r12d,%r11d - addb %al,%cl - roll $9,%r11d - movl %r8d,%r12d - movd (%rdi,%rbx,4),%xmm1 - - addl %r8d,%r11d - movl (%rdi,%rcx,4),%edx - xorl %r11d,%r12d - movl %eax,(%rdi,%rcx,4) - andl %r9d,%r12d - addl 44(%r15),%r10d - addb 
%dl,%al - movl 76(%rsi),%ebx - addl $643717713,%r10d - xorl %r8d,%r12d - movzbl %al,%eax - movl %edx,72(%rsi) - addl %r12d,%r10d - addb %bl,%cl - roll $14,%r10d - movl %r11d,%r12d - pinsrw $1,(%rdi,%rax,4),%xmm0 - - addl %r11d,%r10d - movl (%rdi,%rcx,4),%edx - xorl %r10d,%r12d - movl %ebx,(%rdi,%rcx,4) - andl %r8d,%r12d - addl 0(%r15),%r9d - addb %dl,%bl - movl 80(%rsi),%eax - addl $3921069994,%r9d - xorl %r11d,%r12d - movzbl %bl,%ebx - movl %edx,76(%rsi) - addl %r12d,%r9d - addb %al,%cl - roll $20,%r9d - movl %r10d,%r12d - pinsrw $1,(%rdi,%rbx,4),%xmm1 - - addl %r10d,%r9d - movl (%rdi,%rcx,4),%edx - xorl %r9d,%r12d - movl %eax,(%rdi,%rcx,4) - andl %r11d,%r12d - addl 20(%r15),%r8d - addb %dl,%al - movl 84(%rsi),%ebx - addl $3593408605,%r8d - xorl %r10d,%r12d - movzbl %al,%eax - movl %edx,80(%rsi) - addl %r12d,%r8d - addb %bl,%cl - roll $5,%r8d - movl %r9d,%r12d - pinsrw $2,(%rdi,%rax,4),%xmm0 - - addl %r9d,%r8d - movl (%rdi,%rcx,4),%edx - xorl %r8d,%r12d - movl %ebx,(%rdi,%rcx,4) - andl %r10d,%r12d - addl 40(%r15),%r11d - addb %dl,%bl - movl 88(%rsi),%eax - addl $38016083,%r11d - xorl %r9d,%r12d - movzbl %bl,%ebx - movl %edx,84(%rsi) - addl %r12d,%r11d - addb %al,%cl - roll $9,%r11d - movl %r8d,%r12d - pinsrw $2,(%rdi,%rbx,4),%xmm1 - - addl %r8d,%r11d - movl (%rdi,%rcx,4),%edx - xorl %r11d,%r12d - movl %eax,(%rdi,%rcx,4) - andl %r9d,%r12d - addl 60(%r15),%r10d - addb %dl,%al - movl 92(%rsi),%ebx - addl $3634488961,%r10d - xorl %r8d,%r12d - movzbl %al,%eax - movl %edx,88(%rsi) - addl %r12d,%r10d - addb %bl,%cl - roll $14,%r10d - movl %r11d,%r12d - pinsrw $3,(%rdi,%rax,4),%xmm0 - - addl %r11d,%r10d - movl (%rdi,%rcx,4),%edx - xorl %r10d,%r12d - movl %ebx,(%rdi,%rcx,4) - andl %r8d,%r12d - addl 16(%r15),%r9d - addb %dl,%bl - movl 96(%rsi),%eax - addl $3889429448,%r9d - xorl %r11d,%r12d - movzbl %bl,%ebx - movl %edx,92(%rsi) - addl %r12d,%r9d - addb %al,%cl - roll $20,%r9d - movl %r10d,%r12d - pinsrw $3,(%rdi,%rbx,4),%xmm1 - - addl %r10d,%r9d - movl (%rdi,%rcx,4),%edx - xorl %r9d,%r12d - movl %eax,(%rdi,%rcx,4) - andl %r11d,%r12d - addl 36(%r15),%r8d - addb %dl,%al - movl 100(%rsi),%ebx - addl $568446438,%r8d - xorl %r10d,%r12d - movzbl %al,%eax - movl %edx,96(%rsi) - addl %r12d,%r8d - addb %bl,%cl - roll $5,%r8d - movl %r9d,%r12d - pinsrw $4,(%rdi,%rax,4),%xmm0 - - addl %r9d,%r8d - movl (%rdi,%rcx,4),%edx - xorl %r8d,%r12d - movl %ebx,(%rdi,%rcx,4) - andl %r10d,%r12d - addl 56(%r15),%r11d - addb %dl,%bl - movl 104(%rsi),%eax - addl $3275163606,%r11d - xorl %r9d,%r12d - movzbl %bl,%ebx - movl %edx,100(%rsi) - addl %r12d,%r11d - addb %al,%cl - roll $9,%r11d - movl %r8d,%r12d - pinsrw $4,(%rdi,%rbx,4),%xmm1 - - addl %r8d,%r11d - movl (%rdi,%rcx,4),%edx - xorl %r11d,%r12d - movl %eax,(%rdi,%rcx,4) - andl %r9d,%r12d - addl 12(%r15),%r10d - addb %dl,%al - movl 108(%rsi),%ebx - addl $4107603335,%r10d - xorl %r8d,%r12d - movzbl %al,%eax - movl %edx,104(%rsi) - addl %r12d,%r10d - addb %bl,%cl - roll $14,%r10d - movl %r11d,%r12d - pinsrw $5,(%rdi,%rax,4),%xmm0 - - addl %r11d,%r10d - movl (%rdi,%rcx,4),%edx - xorl %r10d,%r12d - movl %ebx,(%rdi,%rcx,4) - andl %r8d,%r12d - addl 32(%r15),%r9d - addb %dl,%bl - movl 112(%rsi),%eax - addl $1163531501,%r9d - xorl %r11d,%r12d - movzbl %bl,%ebx - movl %edx,108(%rsi) - addl %r12d,%r9d - addb %al,%cl - roll $20,%r9d - movl %r10d,%r12d - pinsrw $5,(%rdi,%rbx,4),%xmm1 - - addl %r10d,%r9d - movl (%rdi,%rcx,4),%edx - xorl %r9d,%r12d - movl %eax,(%rdi,%rcx,4) - andl %r11d,%r12d - addl 52(%r15),%r8d - addb %dl,%al - movl 116(%rsi),%ebx - addl $2850285829,%r8d - xorl 
%r10d,%r12d - movzbl %al,%eax - movl %edx,112(%rsi) - addl %r12d,%r8d - addb %bl,%cl - roll $5,%r8d - movl %r9d,%r12d - pinsrw $6,(%rdi,%rax,4),%xmm0 - - addl %r9d,%r8d - movl (%rdi,%rcx,4),%edx - xorl %r8d,%r12d - movl %ebx,(%rdi,%rcx,4) - andl %r10d,%r12d - addl 8(%r15),%r11d - addb %dl,%bl - movl 120(%rsi),%eax - addl $4243563512,%r11d - xorl %r9d,%r12d - movzbl %bl,%ebx - movl %edx,116(%rsi) - addl %r12d,%r11d - addb %al,%cl - roll $9,%r11d - movl %r8d,%r12d - pinsrw $6,(%rdi,%rbx,4),%xmm1 - - addl %r8d,%r11d - movl (%rdi,%rcx,4),%edx - xorl %r11d,%r12d - movl %eax,(%rdi,%rcx,4) - andl %r9d,%r12d - addl 28(%r15),%r10d - addb %dl,%al - movl 124(%rsi),%ebx - addl $1735328473,%r10d - xorl %r8d,%r12d - movzbl %al,%eax - movl %edx,120(%rsi) - addl %r12d,%r10d - addb %bl,%cl - roll $14,%r10d - movl %r11d,%r12d - pinsrw $7,(%rdi,%rax,4),%xmm0 - - addl %r11d,%r10d - movdqu 16(%r13),%xmm3 - addb $32,%bpl - movl (%rdi,%rcx,4),%edx - xorl %r10d,%r12d - movl %ebx,(%rdi,%rcx,4) - andl %r8d,%r12d - addl 48(%r15),%r9d - addb %dl,%bl - movl 0(%rdi,%rbp,4),%eax - addl $2368359562,%r9d - xorl %r11d,%r12d - movzbl %bl,%ebx - movl %edx,124(%rsi) - addl %r12d,%r9d - addb %al,%cl - roll $20,%r9d - movl %r11d,%r12d - pinsrw $7,(%rdi,%rbx,4),%xmm1 - - addl %r10d,%r9d - movq %rcx,%rsi - xorq %rcx,%rcx - movb %sil,%cl - leaq (%rdi,%rbp,4),%rsi - psllq $8,%xmm1 - pxor %xmm0,%xmm3 - pxor %xmm1,%xmm3 - pxor %xmm0,%xmm0 - movl (%rdi,%rcx,4),%edx - xorl %r10d,%r12d - movl %eax,(%rdi,%rcx,4) - xorl %r9d,%r12d - addl 20(%r15),%r8d - addb %dl,%al - movl 4(%rsi),%ebx - addl $4294588738,%r8d - movzbl %al,%eax - addl %r12d,%r8d - movl %edx,0(%rsi) - addb %bl,%cl - roll $4,%r8d - movl %r10d,%r12d - movd (%rdi,%rax,4),%xmm0 - - addl %r9d,%r8d - pxor %xmm1,%xmm1 - movl (%rdi,%rcx,4),%edx - xorl %r9d,%r12d - movl %ebx,(%rdi,%rcx,4) - xorl %r8d,%r12d - addl 32(%r15),%r11d - addb %dl,%bl - movl 8(%rsi),%eax - addl $2272392833,%r11d - movzbl %bl,%ebx - addl %r12d,%r11d - movl %edx,4(%rsi) - addb %al,%cl - roll $11,%r11d - movl %r9d,%r12d - movd (%rdi,%rbx,4),%xmm1 - - addl %r8d,%r11d - movl (%rdi,%rcx,4),%edx - xorl %r8d,%r12d - movl %eax,(%rdi,%rcx,4) - xorl %r11d,%r12d - addl 44(%r15),%r10d - addb %dl,%al - movl 12(%rsi),%ebx - addl $1839030562,%r10d - movzbl %al,%eax - addl %r12d,%r10d - movl %edx,8(%rsi) - addb %bl,%cl - roll $16,%r10d - movl %r8d,%r12d - pinsrw $1,(%rdi,%rax,4),%xmm0 - - addl %r11d,%r10d - movl (%rdi,%rcx,4),%edx - xorl %r11d,%r12d - movl %ebx,(%rdi,%rcx,4) - xorl %r10d,%r12d - addl 56(%r15),%r9d - addb %dl,%bl - movl 16(%rsi),%eax - addl $4259657740,%r9d - movzbl %bl,%ebx - addl %r12d,%r9d - movl %edx,12(%rsi) - addb %al,%cl - roll $23,%r9d - movl %r11d,%r12d - pinsrw $1,(%rdi,%rbx,4),%xmm1 - - addl %r10d,%r9d - movl (%rdi,%rcx,4),%edx - xorl %r10d,%r12d - movl %eax,(%rdi,%rcx,4) - xorl %r9d,%r12d - addl 4(%r15),%r8d - addb %dl,%al - movl 20(%rsi),%ebx - addl $2763975236,%r8d - movzbl %al,%eax - addl %r12d,%r8d - movl %edx,16(%rsi) - addb %bl,%cl - roll $4,%r8d - movl %r10d,%r12d - pinsrw $2,(%rdi,%rax,4),%xmm0 - - addl %r9d,%r8d - movl (%rdi,%rcx,4),%edx - xorl %r9d,%r12d - movl %ebx,(%rdi,%rcx,4) - xorl %r8d,%r12d - addl 16(%r15),%r11d - addb %dl,%bl - movl 24(%rsi),%eax - addl $1272893353,%r11d - movzbl %bl,%ebx - addl %r12d,%r11d - movl %edx,20(%rsi) - addb %al,%cl - roll $11,%r11d - movl %r9d,%r12d - pinsrw $2,(%rdi,%rbx,4),%xmm1 - - addl %r8d,%r11d - movl (%rdi,%rcx,4),%edx - xorl %r8d,%r12d - movl %eax,(%rdi,%rcx,4) - xorl %r11d,%r12d - addl 28(%r15),%r10d - addb %dl,%al - movl 28(%rsi),%ebx - addl 
$4139469664,%r10d - movzbl %al,%eax - addl %r12d,%r10d - movl %edx,24(%rsi) - addb %bl,%cl - roll $16,%r10d - movl %r8d,%r12d - pinsrw $3,(%rdi,%rax,4),%xmm0 - - addl %r11d,%r10d - movl (%rdi,%rcx,4),%edx - xorl %r11d,%r12d - movl %ebx,(%rdi,%rcx,4) - xorl %r10d,%r12d - addl 40(%r15),%r9d - addb %dl,%bl - movl 32(%rsi),%eax - addl $3200236656,%r9d - movzbl %bl,%ebx - addl %r12d,%r9d - movl %edx,28(%rsi) - addb %al,%cl - roll $23,%r9d - movl %r11d,%r12d - pinsrw $3,(%rdi,%rbx,4),%xmm1 - - addl %r10d,%r9d - movl (%rdi,%rcx,4),%edx - xorl %r10d,%r12d - movl %eax,(%rdi,%rcx,4) - xorl %r9d,%r12d - addl 52(%r15),%r8d - addb %dl,%al - movl 36(%rsi),%ebx - addl $681279174,%r8d - movzbl %al,%eax - addl %r12d,%r8d - movl %edx,32(%rsi) - addb %bl,%cl - roll $4,%r8d - movl %r10d,%r12d - pinsrw $4,(%rdi,%rax,4),%xmm0 - - addl %r9d,%r8d - movl (%rdi,%rcx,4),%edx - xorl %r9d,%r12d - movl %ebx,(%rdi,%rcx,4) - xorl %r8d,%r12d - addl 0(%r15),%r11d - addb %dl,%bl - movl 40(%rsi),%eax - addl $3936430074,%r11d - movzbl %bl,%ebx - addl %r12d,%r11d - movl %edx,36(%rsi) - addb %al,%cl - roll $11,%r11d - movl %r9d,%r12d - pinsrw $4,(%rdi,%rbx,4),%xmm1 - - addl %r8d,%r11d - movl (%rdi,%rcx,4),%edx - xorl %r8d,%r12d - movl %eax,(%rdi,%rcx,4) - xorl %r11d,%r12d - addl 12(%r15),%r10d - addb %dl,%al - movl 44(%rsi),%ebx - addl $3572445317,%r10d - movzbl %al,%eax - addl %r12d,%r10d - movl %edx,40(%rsi) - addb %bl,%cl - roll $16,%r10d - movl %r8d,%r12d - pinsrw $5,(%rdi,%rax,4),%xmm0 - - addl %r11d,%r10d - movl (%rdi,%rcx,4),%edx - xorl %r11d,%r12d - movl %ebx,(%rdi,%rcx,4) - xorl %r10d,%r12d - addl 24(%r15),%r9d - addb %dl,%bl - movl 48(%rsi),%eax - addl $76029189,%r9d - movzbl %bl,%ebx - addl %r12d,%r9d - movl %edx,44(%rsi) - addb %al,%cl - roll $23,%r9d - movl %r11d,%r12d - pinsrw $5,(%rdi,%rbx,4),%xmm1 - - addl %r10d,%r9d - movl (%rdi,%rcx,4),%edx - xorl %r10d,%r12d - movl %eax,(%rdi,%rcx,4) - xorl %r9d,%r12d - addl 36(%r15),%r8d - addb %dl,%al - movl 52(%rsi),%ebx - addl $3654602809,%r8d - movzbl %al,%eax - addl %r12d,%r8d - movl %edx,48(%rsi) - addb %bl,%cl - roll $4,%r8d - movl %r10d,%r12d - pinsrw $6,(%rdi,%rax,4),%xmm0 - - addl %r9d,%r8d - movl (%rdi,%rcx,4),%edx - xorl %r9d,%r12d - movl %ebx,(%rdi,%rcx,4) - xorl %r8d,%r12d - addl 48(%r15),%r11d - addb %dl,%bl - movl 56(%rsi),%eax - addl $3873151461,%r11d - movzbl %bl,%ebx - addl %r12d,%r11d - movl %edx,52(%rsi) - addb %al,%cl - roll $11,%r11d - movl %r9d,%r12d - pinsrw $6,(%rdi,%rbx,4),%xmm1 - - addl %r8d,%r11d - movl (%rdi,%rcx,4),%edx - xorl %r8d,%r12d - movl %eax,(%rdi,%rcx,4) - xorl %r11d,%r12d - addl 60(%r15),%r10d - addb %dl,%al - movl 60(%rsi),%ebx - addl $530742520,%r10d - movzbl %al,%eax - addl %r12d,%r10d - movl %edx,56(%rsi) - addb %bl,%cl - roll $16,%r10d - movl %r8d,%r12d - pinsrw $7,(%rdi,%rax,4),%xmm0 - - addl %r11d,%r10d - movdqu 32(%r13),%xmm4 - movl (%rdi,%rcx,4),%edx - xorl %r11d,%r12d - movl %ebx,(%rdi,%rcx,4) - xorl %r10d,%r12d - addl 8(%r15),%r9d - addb %dl,%bl - movl 64(%rsi),%eax - addl $3299628645,%r9d - movzbl %bl,%ebx - addl %r12d,%r9d - movl %edx,60(%rsi) - addb %al,%cl - roll $23,%r9d - movl $-1,%r12d - pinsrw $7,(%rdi,%rbx,4),%xmm1 - - addl %r10d,%r9d - psllq $8,%xmm1 - pxor %xmm0,%xmm4 - pxor %xmm1,%xmm4 - pxor %xmm0,%xmm0 - movl (%rdi,%rcx,4),%edx - xorl %r11d,%r12d - movl %eax,(%rdi,%rcx,4) - orl %r9d,%r12d - addl 0(%r15),%r8d - addb %dl,%al - movl 68(%rsi),%ebx - addl $4096336452,%r8d - movzbl %al,%eax - xorl %r10d,%r12d - movl %edx,64(%rsi) - addl %r12d,%r8d - addb %bl,%cl - roll $6,%r8d - movl $-1,%r12d - movd 
(%rdi,%rax,4),%xmm0 - - addl %r9d,%r8d - pxor %xmm1,%xmm1 - movl (%rdi,%rcx,4),%edx - xorl %r10d,%r12d - movl %ebx,(%rdi,%rcx,4) - orl %r8d,%r12d - addl 28(%r15),%r11d - addb %dl,%bl - movl 72(%rsi),%eax - addl $1126891415,%r11d - movzbl %bl,%ebx - xorl %r9d,%r12d - movl %edx,68(%rsi) - addl %r12d,%r11d - addb %al,%cl - roll $10,%r11d - movl $-1,%r12d - movd (%rdi,%rbx,4),%xmm1 - - addl %r8d,%r11d - movl (%rdi,%rcx,4),%edx - xorl %r9d,%r12d - movl %eax,(%rdi,%rcx,4) - orl %r11d,%r12d - addl 56(%r15),%r10d - addb %dl,%al - movl 76(%rsi),%ebx - addl $2878612391,%r10d - movzbl %al,%eax - xorl %r8d,%r12d - movl %edx,72(%rsi) - addl %r12d,%r10d - addb %bl,%cl - roll $15,%r10d - movl $-1,%r12d - pinsrw $1,(%rdi,%rax,4),%xmm0 - - addl %r11d,%r10d - movl (%rdi,%rcx,4),%edx - xorl %r8d,%r12d - movl %ebx,(%rdi,%rcx,4) - orl %r10d,%r12d - addl 20(%r15),%r9d - addb %dl,%bl - movl 80(%rsi),%eax - addl $4237533241,%r9d - movzbl %bl,%ebx - xorl %r11d,%r12d - movl %edx,76(%rsi) - addl %r12d,%r9d - addb %al,%cl - roll $21,%r9d - movl $-1,%r12d - pinsrw $1,(%rdi,%rbx,4),%xmm1 - - addl %r10d,%r9d - movl (%rdi,%rcx,4),%edx - xorl %r11d,%r12d - movl %eax,(%rdi,%rcx,4) - orl %r9d,%r12d - addl 48(%r15),%r8d - addb %dl,%al - movl 84(%rsi),%ebx - addl $1700485571,%r8d - movzbl %al,%eax - xorl %r10d,%r12d - movl %edx,80(%rsi) - addl %r12d,%r8d - addb %bl,%cl - roll $6,%r8d - movl $-1,%r12d - pinsrw $2,(%rdi,%rax,4),%xmm0 - - addl %r9d,%r8d - movl (%rdi,%rcx,4),%edx - xorl %r10d,%r12d - movl %ebx,(%rdi,%rcx,4) - orl %r8d,%r12d - addl 12(%r15),%r11d - addb %dl,%bl - movl 88(%rsi),%eax - addl $2399980690,%r11d - movzbl %bl,%ebx - xorl %r9d,%r12d - movl %edx,84(%rsi) - addl %r12d,%r11d - addb %al,%cl - roll $10,%r11d - movl $-1,%r12d - pinsrw $2,(%rdi,%rbx,4),%xmm1 - - addl %r8d,%r11d - movl (%rdi,%rcx,4),%edx - xorl %r9d,%r12d - movl %eax,(%rdi,%rcx,4) - orl %r11d,%r12d - addl 40(%r15),%r10d - addb %dl,%al - movl 92(%rsi),%ebx - addl $4293915773,%r10d - movzbl %al,%eax - xorl %r8d,%r12d - movl %edx,88(%rsi) - addl %r12d,%r10d - addb %bl,%cl - roll $15,%r10d - movl $-1,%r12d - pinsrw $3,(%rdi,%rax,4),%xmm0 - - addl %r11d,%r10d - movl (%rdi,%rcx,4),%edx - xorl %r8d,%r12d - movl %ebx,(%rdi,%rcx,4) - orl %r10d,%r12d - addl 4(%r15),%r9d - addb %dl,%bl - movl 96(%rsi),%eax - addl $2240044497,%r9d - movzbl %bl,%ebx - xorl %r11d,%r12d - movl %edx,92(%rsi) - addl %r12d,%r9d - addb %al,%cl - roll $21,%r9d - movl $-1,%r12d - pinsrw $3,(%rdi,%rbx,4),%xmm1 - - addl %r10d,%r9d - movl (%rdi,%rcx,4),%edx - xorl %r11d,%r12d - movl %eax,(%rdi,%rcx,4) - orl %r9d,%r12d - addl 32(%r15),%r8d - addb %dl,%al - movl 100(%rsi),%ebx - addl $1873313359,%r8d - movzbl %al,%eax - xorl %r10d,%r12d - movl %edx,96(%rsi) - addl %r12d,%r8d - addb %bl,%cl - roll $6,%r8d - movl $-1,%r12d - pinsrw $4,(%rdi,%rax,4),%xmm0 - - addl %r9d,%r8d - movl (%rdi,%rcx,4),%edx - xorl %r10d,%r12d - movl %ebx,(%rdi,%rcx,4) - orl %r8d,%r12d - addl 60(%r15),%r11d - addb %dl,%bl - movl 104(%rsi),%eax - addl $4264355552,%r11d - movzbl %bl,%ebx - xorl %r9d,%r12d - movl %edx,100(%rsi) - addl %r12d,%r11d - addb %al,%cl - roll $10,%r11d - movl $-1,%r12d - pinsrw $4,(%rdi,%rbx,4),%xmm1 - - addl %r8d,%r11d - movl (%rdi,%rcx,4),%edx - xorl %r9d,%r12d - movl %eax,(%rdi,%rcx,4) - orl %r11d,%r12d - addl 24(%r15),%r10d - addb %dl,%al - movl 108(%rsi),%ebx - addl $2734768916,%r10d - movzbl %al,%eax - xorl %r8d,%r12d - movl %edx,104(%rsi) - addl %r12d,%r10d - addb %bl,%cl - roll $15,%r10d - movl $-1,%r12d - pinsrw $5,(%rdi,%rax,4),%xmm0 - - addl %r11d,%r10d - movl (%rdi,%rcx,4),%edx - 
xorl %r8d,%r12d - movl %ebx,(%rdi,%rcx,4) - orl %r10d,%r12d - addl 52(%r15),%r9d - addb %dl,%bl - movl 112(%rsi),%eax - addl $1309151649,%r9d - movzbl %bl,%ebx - xorl %r11d,%r12d - movl %edx,108(%rsi) - addl %r12d,%r9d - addb %al,%cl - roll $21,%r9d - movl $-1,%r12d - pinsrw $5,(%rdi,%rbx,4),%xmm1 - - addl %r10d,%r9d - movl (%rdi,%rcx,4),%edx - xorl %r11d,%r12d - movl %eax,(%rdi,%rcx,4) - orl %r9d,%r12d - addl 16(%r15),%r8d - addb %dl,%al - movl 116(%rsi),%ebx - addl $4149444226,%r8d - movzbl %al,%eax - xorl %r10d,%r12d - movl %edx,112(%rsi) - addl %r12d,%r8d - addb %bl,%cl - roll $6,%r8d - movl $-1,%r12d - pinsrw $6,(%rdi,%rax,4),%xmm0 - - addl %r9d,%r8d - movl (%rdi,%rcx,4),%edx - xorl %r10d,%r12d - movl %ebx,(%rdi,%rcx,4) - orl %r8d,%r12d - addl 44(%r15),%r11d - addb %dl,%bl - movl 120(%rsi),%eax - addl $3174756917,%r11d - movzbl %bl,%ebx - xorl %r9d,%r12d - movl %edx,116(%rsi) - addl %r12d,%r11d - addb %al,%cl - roll $10,%r11d - movl $-1,%r12d - pinsrw $6,(%rdi,%rbx,4),%xmm1 - - addl %r8d,%r11d - movl (%rdi,%rcx,4),%edx - xorl %r9d,%r12d - movl %eax,(%rdi,%rcx,4) - orl %r11d,%r12d - addl 8(%r15),%r10d - addb %dl,%al - movl 124(%rsi),%ebx - addl $718787259,%r10d - movzbl %al,%eax - xorl %r8d,%r12d - movl %edx,120(%rsi) - addl %r12d,%r10d - addb %bl,%cl - roll $15,%r10d - movl $-1,%r12d - pinsrw $7,(%rdi,%rax,4),%xmm0 - - addl %r11d,%r10d - movdqu 48(%r13),%xmm5 - addb $32,%bpl - movl (%rdi,%rcx,4),%edx - xorl %r8d,%r12d - movl %ebx,(%rdi,%rcx,4) - orl %r10d,%r12d - addl 36(%r15),%r9d - addb %dl,%bl - movl 0(%rdi,%rbp,4),%eax - addl $3951481745,%r9d - movzbl %bl,%ebx - xorl %r11d,%r12d - movl %edx,124(%rsi) - addl %r12d,%r9d - addb %al,%cl - roll $21,%r9d - movl $-1,%r12d - pinsrw $7,(%rdi,%rbx,4),%xmm1 - - addl %r10d,%r9d - movq %rbp,%rsi - xorq %rbp,%rbp - movb %sil,%bpl - movq %rcx,%rsi - xorq %rcx,%rcx - movb %sil,%cl - leaq (%rdi,%rbp,4),%rsi - psllq $8,%xmm1 - pxor %xmm0,%xmm5 - pxor %xmm1,%xmm5 - addl 0(%rsp),%r8d - addl 4(%rsp),%r9d - addl 8(%rsp),%r10d - addl 12(%rsp),%r11d - - movdqu %xmm2,(%r14,%r13,1) - movdqu %xmm3,16(%r14,%r13,1) - movdqu %xmm4,32(%r14,%r13,1) - movdqu %xmm5,48(%r14,%r13,1) - leaq 64(%r15),%r15 - leaq 64(%r13),%r13 - cmpq 16(%rsp),%r15 - jb .Loop - - movq 24(%rsp),%r12 - subb %al,%cl - movl %r8d,0(%r12) - movl %r9d,4(%r12) - movl %r10d,8(%r12) - movl %r11d,12(%r12) - subb $1,%bpl - movl %ebp,-8(%rdi) - movl %ecx,-4(%rdi) - - movq 40(%rsp),%r15 - movq 48(%rsp),%r14 - movq 56(%rsp),%r13 - movq 64(%rsp),%r12 - movq 72(%rsp),%rbp - movq 80(%rsp),%rbx - leaq 88(%rsp),%rsp -.Lepilogue: -.Labort: - .byte 0xf3,0xc3 -.size rc4_md5_enc,.-rc4_md5_enc -#endif diff --git a/packager/third_party/boringssl/linux-x86_64/crypto/rc4/rc4-x86_64.S b/packager/third_party/boringssl/linux-x86_64/crypto/rc4/rc4-x86_64.S deleted file mode 100644 index c4d10024ad..0000000000 --- a/packager/third_party/boringssl/linux-x86_64/crypto/rc4/rc4-x86_64.S +++ /dev/null @@ -1,596 +0,0 @@ -#if defined(__x86_64__) -.text -.extern OPENSSL_ia32cap_P -.hidden OPENSSL_ia32cap_P - -.globl asm_RC4 -.hidden asm_RC4 -.type asm_RC4,@function -.align 16 -asm_RC4: - orq %rsi,%rsi - jne .Lentry - .byte 0xf3,0xc3 -.Lentry: - pushq %rbx - pushq %r12 - pushq %r13 -.Lprologue: - movq %rsi,%r11 - movq %rdx,%r12 - movq %rcx,%r13 - xorq %r10,%r10 - xorq %rcx,%rcx - - leaq 8(%rdi),%rdi - movb -8(%rdi),%r10b - movb -4(%rdi),%cl - cmpl $-1,256(%rdi) - je .LRC4_CHAR - movl OPENSSL_ia32cap_P(%rip),%r8d - xorq %rbx,%rbx - incb %r10b - subq %r10,%rbx - subq %r12,%r13 - movl (%rdi,%r10,4),%eax - testq $-16,%r11 - jz 
.Lloop1 - btl $30,%r8d - jc .Lintel - andq $7,%rbx - leaq 1(%r10),%rsi - jz .Loop8 - subq %rbx,%r11 -.Loop8_warmup: - addb %al,%cl - movl (%rdi,%rcx,4),%edx - movl %eax,(%rdi,%rcx,4) - movl %edx,(%rdi,%r10,4) - addb %dl,%al - incb %r10b - movl (%rdi,%rax,4),%edx - movl (%rdi,%r10,4),%eax - xorb (%r12),%dl - movb %dl,(%r12,%r13,1) - leaq 1(%r12),%r12 - decq %rbx - jnz .Loop8_warmup - - leaq 1(%r10),%rsi - jmp .Loop8 -.align 16 -.Loop8: - addb %al,%cl - movl (%rdi,%rcx,4),%edx - movl %eax,(%rdi,%rcx,4) - movl 0(%rdi,%rsi,4),%ebx - rorq $8,%r8 - movl %edx,0(%rdi,%r10,4) - addb %al,%dl - movb (%rdi,%rdx,4),%r8b - addb %bl,%cl - movl (%rdi,%rcx,4),%edx - movl %ebx,(%rdi,%rcx,4) - movl 4(%rdi,%rsi,4),%eax - rorq $8,%r8 - movl %edx,4(%rdi,%r10,4) - addb %bl,%dl - movb (%rdi,%rdx,4),%r8b - addb %al,%cl - movl (%rdi,%rcx,4),%edx - movl %eax,(%rdi,%rcx,4) - movl 8(%rdi,%rsi,4),%ebx - rorq $8,%r8 - movl %edx,8(%rdi,%r10,4) - addb %al,%dl - movb (%rdi,%rdx,4),%r8b - addb %bl,%cl - movl (%rdi,%rcx,4),%edx - movl %ebx,(%rdi,%rcx,4) - movl 12(%rdi,%rsi,4),%eax - rorq $8,%r8 - movl %edx,12(%rdi,%r10,4) - addb %bl,%dl - movb (%rdi,%rdx,4),%r8b - addb %al,%cl - movl (%rdi,%rcx,4),%edx - movl %eax,(%rdi,%rcx,4) - movl 16(%rdi,%rsi,4),%ebx - rorq $8,%r8 - movl %edx,16(%rdi,%r10,4) - addb %al,%dl - movb (%rdi,%rdx,4),%r8b - addb %bl,%cl - movl (%rdi,%rcx,4),%edx - movl %ebx,(%rdi,%rcx,4) - movl 20(%rdi,%rsi,4),%eax - rorq $8,%r8 - movl %edx,20(%rdi,%r10,4) - addb %bl,%dl - movb (%rdi,%rdx,4),%r8b - addb %al,%cl - movl (%rdi,%rcx,4),%edx - movl %eax,(%rdi,%rcx,4) - movl 24(%rdi,%rsi,4),%ebx - rorq $8,%r8 - movl %edx,24(%rdi,%r10,4) - addb %al,%dl - movb (%rdi,%rdx,4),%r8b - addb $8,%sil - addb %bl,%cl - movl (%rdi,%rcx,4),%edx - movl %ebx,(%rdi,%rcx,4) - movl -4(%rdi,%rsi,4),%eax - rorq $8,%r8 - movl %edx,28(%rdi,%r10,4) - addb %bl,%dl - movb (%rdi,%rdx,4),%r8b - addb $8,%r10b - rorq $8,%r8 - subq $8,%r11 - - xorq (%r12),%r8 - movq %r8,(%r12,%r13,1) - leaq 8(%r12),%r12 - - testq $-8,%r11 - jnz .Loop8 - cmpq $0,%r11 - jne .Lloop1 - jmp .Lexit - -.align 16 -.Lintel: - testq $-32,%r11 - jz .Lloop1 - andq $15,%rbx - jz .Loop16_is_hot - subq %rbx,%r11 -.Loop16_warmup: - addb %al,%cl - movl (%rdi,%rcx,4),%edx - movl %eax,(%rdi,%rcx,4) - movl %edx,(%rdi,%r10,4) - addb %dl,%al - incb %r10b - movl (%rdi,%rax,4),%edx - movl (%rdi,%r10,4),%eax - xorb (%r12),%dl - movb %dl,(%r12,%r13,1) - leaq 1(%r12),%r12 - decq %rbx - jnz .Loop16_warmup - - movq %rcx,%rbx - xorq %rcx,%rcx - movb %bl,%cl - -.Loop16_is_hot: - leaq (%rdi,%r10,4),%rsi - addb %al,%cl - movl (%rdi,%rcx,4),%edx - pxor %xmm0,%xmm0 - movl %eax,(%rdi,%rcx,4) - addb %dl,%al - movl 4(%rsi),%ebx - movzbl %al,%eax - movl %edx,0(%rsi) - addb %bl,%cl - pinsrw $0,(%rdi,%rax,4),%xmm0 - jmp .Loop16_enter -.align 16 -.Loop16: - addb %al,%cl - movl (%rdi,%rcx,4),%edx - pxor %xmm0,%xmm2 - psllq $8,%xmm1 - pxor %xmm0,%xmm0 - movl %eax,(%rdi,%rcx,4) - addb %dl,%al - movl 4(%rsi),%ebx - movzbl %al,%eax - movl %edx,0(%rsi) - pxor %xmm1,%xmm2 - addb %bl,%cl - pinsrw $0,(%rdi,%rax,4),%xmm0 - movdqu %xmm2,(%r12,%r13,1) - leaq 16(%r12),%r12 -.Loop16_enter: - movl (%rdi,%rcx,4),%edx - pxor %xmm1,%xmm1 - movl %ebx,(%rdi,%rcx,4) - addb %dl,%bl - movl 8(%rsi),%eax - movzbl %bl,%ebx - movl %edx,4(%rsi) - addb %al,%cl - pinsrw $0,(%rdi,%rbx,4),%xmm1 - movl (%rdi,%rcx,4),%edx - movl %eax,(%rdi,%rcx,4) - addb %dl,%al - movl 12(%rsi),%ebx - movzbl %al,%eax - movl %edx,8(%rsi) - addb %bl,%cl - pinsrw $1,(%rdi,%rax,4),%xmm0 - movl (%rdi,%rcx,4),%edx - movl %ebx,(%rdi,%rcx,4) - addb %dl,%bl - 
movl 16(%rsi),%eax - movzbl %bl,%ebx - movl %edx,12(%rsi) - addb %al,%cl - pinsrw $1,(%rdi,%rbx,4),%xmm1 - movl (%rdi,%rcx,4),%edx - movl %eax,(%rdi,%rcx,4) - addb %dl,%al - movl 20(%rsi),%ebx - movzbl %al,%eax - movl %edx,16(%rsi) - addb %bl,%cl - pinsrw $2,(%rdi,%rax,4),%xmm0 - movl (%rdi,%rcx,4),%edx - movl %ebx,(%rdi,%rcx,4) - addb %dl,%bl - movl 24(%rsi),%eax - movzbl %bl,%ebx - movl %edx,20(%rsi) - addb %al,%cl - pinsrw $2,(%rdi,%rbx,4),%xmm1 - movl (%rdi,%rcx,4),%edx - movl %eax,(%rdi,%rcx,4) - addb %dl,%al - movl 28(%rsi),%ebx - movzbl %al,%eax - movl %edx,24(%rsi) - addb %bl,%cl - pinsrw $3,(%rdi,%rax,4),%xmm0 - movl (%rdi,%rcx,4),%edx - movl %ebx,(%rdi,%rcx,4) - addb %dl,%bl - movl 32(%rsi),%eax - movzbl %bl,%ebx - movl %edx,28(%rsi) - addb %al,%cl - pinsrw $3,(%rdi,%rbx,4),%xmm1 - movl (%rdi,%rcx,4),%edx - movl %eax,(%rdi,%rcx,4) - addb %dl,%al - movl 36(%rsi),%ebx - movzbl %al,%eax - movl %edx,32(%rsi) - addb %bl,%cl - pinsrw $4,(%rdi,%rax,4),%xmm0 - movl (%rdi,%rcx,4),%edx - movl %ebx,(%rdi,%rcx,4) - addb %dl,%bl - movl 40(%rsi),%eax - movzbl %bl,%ebx - movl %edx,36(%rsi) - addb %al,%cl - pinsrw $4,(%rdi,%rbx,4),%xmm1 - movl (%rdi,%rcx,4),%edx - movl %eax,(%rdi,%rcx,4) - addb %dl,%al - movl 44(%rsi),%ebx - movzbl %al,%eax - movl %edx,40(%rsi) - addb %bl,%cl - pinsrw $5,(%rdi,%rax,4),%xmm0 - movl (%rdi,%rcx,4),%edx - movl %ebx,(%rdi,%rcx,4) - addb %dl,%bl - movl 48(%rsi),%eax - movzbl %bl,%ebx - movl %edx,44(%rsi) - addb %al,%cl - pinsrw $5,(%rdi,%rbx,4),%xmm1 - movl (%rdi,%rcx,4),%edx - movl %eax,(%rdi,%rcx,4) - addb %dl,%al - movl 52(%rsi),%ebx - movzbl %al,%eax - movl %edx,48(%rsi) - addb %bl,%cl - pinsrw $6,(%rdi,%rax,4),%xmm0 - movl (%rdi,%rcx,4),%edx - movl %ebx,(%rdi,%rcx,4) - addb %dl,%bl - movl 56(%rsi),%eax - movzbl %bl,%ebx - movl %edx,52(%rsi) - addb %al,%cl - pinsrw $6,(%rdi,%rbx,4),%xmm1 - movl (%rdi,%rcx,4),%edx - movl %eax,(%rdi,%rcx,4) - addb %dl,%al - movl 60(%rsi),%ebx - movzbl %al,%eax - movl %edx,56(%rsi) - addb %bl,%cl - pinsrw $7,(%rdi,%rax,4),%xmm0 - addb $16,%r10b - movdqu (%r12),%xmm2 - movl (%rdi,%rcx,4),%edx - movl %ebx,(%rdi,%rcx,4) - addb %dl,%bl - movzbl %bl,%ebx - movl %edx,60(%rsi) - leaq (%rdi,%r10,4),%rsi - pinsrw $7,(%rdi,%rbx,4),%xmm1 - movl (%rsi),%eax - movq %rcx,%rbx - xorq %rcx,%rcx - subq $16,%r11 - movb %bl,%cl - testq $-16,%r11 - jnz .Loop16 - - psllq $8,%xmm1 - pxor %xmm0,%xmm2 - pxor %xmm1,%xmm2 - movdqu %xmm2,(%r12,%r13,1) - leaq 16(%r12),%r12 - - cmpq $0,%r11 - jne .Lloop1 - jmp .Lexit - -.align 16 -.Lloop1: - addb %al,%cl - movl (%rdi,%rcx,4),%edx - movl %eax,(%rdi,%rcx,4) - movl %edx,(%rdi,%r10,4) - addb %dl,%al - incb %r10b - movl (%rdi,%rax,4),%edx - movl (%rdi,%r10,4),%eax - xorb (%r12),%dl - movb %dl,(%r12,%r13,1) - leaq 1(%r12),%r12 - decq %r11 - jnz .Lloop1 - jmp .Lexit - -.align 16 -.LRC4_CHAR: - addb $1,%r10b - movzbl (%rdi,%r10,1),%eax - testq $-8,%r11 - jz .Lcloop1 - jmp .Lcloop8 -.align 16 -.Lcloop8: - movl (%r12),%r8d - movl 4(%r12),%r9d - addb %al,%cl - leaq 1(%r10),%rsi - movzbl (%rdi,%rcx,1),%edx - movzbl %sil,%esi - movzbl (%rdi,%rsi,1),%ebx - movb %al,(%rdi,%rcx,1) - cmpq %rsi,%rcx - movb %dl,(%rdi,%r10,1) - jne .Lcmov0 - movq %rax,%rbx -.Lcmov0: - addb %al,%dl - xorb (%rdi,%rdx,1),%r8b - rorl $8,%r8d - addb %bl,%cl - leaq 1(%rsi),%r10 - movzbl (%rdi,%rcx,1),%edx - movzbl %r10b,%r10d - movzbl (%rdi,%r10,1),%eax - movb %bl,(%rdi,%rcx,1) - cmpq %r10,%rcx - movb %dl,(%rdi,%rsi,1) - jne .Lcmov1 - movq %rbx,%rax -.Lcmov1: - addb %bl,%dl - xorb (%rdi,%rdx,1),%r8b - rorl $8,%r8d - addb %al,%cl - leaq 1(%r10),%rsi - 
movzbl (%rdi,%rcx,1),%edx - movzbl %sil,%esi - movzbl (%rdi,%rsi,1),%ebx - movb %al,(%rdi,%rcx,1) - cmpq %rsi,%rcx - movb %dl,(%rdi,%r10,1) - jne .Lcmov2 - movq %rax,%rbx -.Lcmov2: - addb %al,%dl - xorb (%rdi,%rdx,1),%r8b - rorl $8,%r8d - addb %bl,%cl - leaq 1(%rsi),%r10 - movzbl (%rdi,%rcx,1),%edx - movzbl %r10b,%r10d - movzbl (%rdi,%r10,1),%eax - movb %bl,(%rdi,%rcx,1) - cmpq %r10,%rcx - movb %dl,(%rdi,%rsi,1) - jne .Lcmov3 - movq %rbx,%rax -.Lcmov3: - addb %bl,%dl - xorb (%rdi,%rdx,1),%r8b - rorl $8,%r8d - addb %al,%cl - leaq 1(%r10),%rsi - movzbl (%rdi,%rcx,1),%edx - movzbl %sil,%esi - movzbl (%rdi,%rsi,1),%ebx - movb %al,(%rdi,%rcx,1) - cmpq %rsi,%rcx - movb %dl,(%rdi,%r10,1) - jne .Lcmov4 - movq %rax,%rbx -.Lcmov4: - addb %al,%dl - xorb (%rdi,%rdx,1),%r9b - rorl $8,%r9d - addb %bl,%cl - leaq 1(%rsi),%r10 - movzbl (%rdi,%rcx,1),%edx - movzbl %r10b,%r10d - movzbl (%rdi,%r10,1),%eax - movb %bl,(%rdi,%rcx,1) - cmpq %r10,%rcx - movb %dl,(%rdi,%rsi,1) - jne .Lcmov5 - movq %rbx,%rax -.Lcmov5: - addb %bl,%dl - xorb (%rdi,%rdx,1),%r9b - rorl $8,%r9d - addb %al,%cl - leaq 1(%r10),%rsi - movzbl (%rdi,%rcx,1),%edx - movzbl %sil,%esi - movzbl (%rdi,%rsi,1),%ebx - movb %al,(%rdi,%rcx,1) - cmpq %rsi,%rcx - movb %dl,(%rdi,%r10,1) - jne .Lcmov6 - movq %rax,%rbx -.Lcmov6: - addb %al,%dl - xorb (%rdi,%rdx,1),%r9b - rorl $8,%r9d - addb %bl,%cl - leaq 1(%rsi),%r10 - movzbl (%rdi,%rcx,1),%edx - movzbl %r10b,%r10d - movzbl (%rdi,%r10,1),%eax - movb %bl,(%rdi,%rcx,1) - cmpq %r10,%rcx - movb %dl,(%rdi,%rsi,1) - jne .Lcmov7 - movq %rbx,%rax -.Lcmov7: - addb %bl,%dl - xorb (%rdi,%rdx,1),%r9b - rorl $8,%r9d - leaq -8(%r11),%r11 - movl %r8d,(%r13) - leaq 8(%r12),%r12 - movl %r9d,4(%r13) - leaq 8(%r13),%r13 - - testq $-8,%r11 - jnz .Lcloop8 - cmpq $0,%r11 - jne .Lcloop1 - jmp .Lexit -.align 16 -.Lcloop1: - addb %al,%cl - movzbl %cl,%ecx - movzbl (%rdi,%rcx,1),%edx - movb %al,(%rdi,%rcx,1) - movb %dl,(%rdi,%r10,1) - addb %al,%dl - addb $1,%r10b - movzbl %dl,%edx - movzbl %r10b,%r10d - movzbl (%rdi,%rdx,1),%edx - movzbl (%rdi,%r10,1),%eax - xorb (%r12),%dl - leaq 1(%r12),%r12 - movb %dl,(%r13) - leaq 1(%r13),%r13 - subq $1,%r11 - jnz .Lcloop1 - jmp .Lexit - -.align 16 -.Lexit: - subb $1,%r10b - movl %r10d,-8(%rdi) - movl %ecx,-4(%rdi) - - movq (%rsp),%r13 - movq 8(%rsp),%r12 - movq 16(%rsp),%rbx - addq $24,%rsp -.Lepilogue: - .byte 0xf3,0xc3 -.size asm_RC4,.-asm_RC4 -.globl asm_RC4_set_key -.hidden asm_RC4_set_key -.type asm_RC4_set_key,@function -.align 16 -asm_RC4_set_key: - leaq 8(%rdi),%rdi - leaq (%rdx,%rsi,1),%rdx - negq %rsi - movq %rsi,%rcx - xorl %eax,%eax - xorq %r9,%r9 - xorq %r10,%r10 - xorq %r11,%r11 - - movl OPENSSL_ia32cap_P(%rip),%r8d - btl $20,%r8d - jc .Lc1stloop - jmp .Lw1stloop - -.align 16 -.Lw1stloop: - movl %eax,(%rdi,%rax,4) - addb $1,%al - jnc .Lw1stloop - - xorq %r9,%r9 - xorq %r8,%r8 -.align 16 -.Lw2ndloop: - movl (%rdi,%r9,4),%r10d - addb (%rdx,%rsi,1),%r8b - addb %r10b,%r8b - addq $1,%rsi - movl (%rdi,%r8,4),%r11d - cmovzq %rcx,%rsi - movl %r10d,(%rdi,%r8,4) - movl %r11d,(%rdi,%r9,4) - addb $1,%r9b - jnc .Lw2ndloop - jmp .Lexit_key - -.align 16 -.Lc1stloop: - movb %al,(%rdi,%rax,1) - addb $1,%al - jnc .Lc1stloop - - xorq %r9,%r9 - xorq %r8,%r8 -.align 16 -.Lc2ndloop: - movb (%rdi,%r9,1),%r10b - addb (%rdx,%rsi,1),%r8b - addb %r10b,%r8b - addq $1,%rsi - movb (%rdi,%r8,1),%r11b - jnz .Lcnowrap - movq %rcx,%rsi -.Lcnowrap: - movb %r10b,(%rdi,%r8,1) - movb %r11b,(%rdi,%r9,1) - addb $1,%r9b - jnc .Lc2ndloop - movl $-1,256(%rdi) - -.align 16 -.Lexit_key: - xorl %eax,%eax - movl %eax,-8(%rdi) 
- movl %eax,-4(%rdi) - .byte 0xf3,0xc3 -.size asm_RC4_set_key,.-asm_RC4_set_key -#endif diff --git a/packager/third_party/boringssl/mac-x86/crypto/chacha/chacha-x86.S b/packager/third_party/boringssl/mac-x86/crypto/chacha/chacha-x86.S index 5de98a3d61..e87467caf0 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/chacha/chacha-x86.S +++ b/packager/third_party/boringssl/mac-x86/crypto/chacha/chacha-x86.S @@ -1,5 +1,4 @@ #if defined(__i386__) -.file "chacha-x86.S" .text .globl _ChaCha20_ctr32 .private_extern _ChaCha20_ctr32 diff --git a/packager/third_party/boringssl/mac-x86/crypto/aes/aes-586.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/aes-586.S similarity index 99% rename from packager/third_party/boringssl/mac-x86/crypto/aes/aes-586.S rename to packager/third_party/boringssl/mac-x86/crypto/fipsmodule/aes-586.S index d3dc6beb6a..4046251d3a 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/aes/aes-586.S +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/aes-586.S @@ -1,5 +1,4 @@ #if defined(__i386__) -.file "aes-586.S" .text .private_extern __x86_AES_encrypt_compact .align 4 diff --git a/packager/third_party/boringssl/mac-x86/crypto/aes/aesni-x86.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/aesni-x86.S similarity index 99% rename from packager/third_party/boringssl/mac-x86/crypto/aes/aesni-x86.S rename to packager/third_party/boringssl/mac-x86/crypto/fipsmodule/aesni-x86.S index 07719ba7ae..3fe0e7543f 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/aes/aesni-x86.S +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/aesni-x86.S @@ -1,5 +1,4 @@ #if defined(__i386__) -.file "src/crypto/aes/asm/aesni-x86.S" .text .globl _aesni_encrypt .private_extern _aesni_encrypt diff --git a/packager/third_party/boringssl/mac-x86/crypto/bn/bn-586.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/bn-586.S similarity index 99% rename from packager/third_party/boringssl/mac-x86/crypto/bn/bn-586.S rename to packager/third_party/boringssl/mac-x86/crypto/fipsmodule/bn-586.S index 0f0a94ece3..d1be040546 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/bn/bn-586.S +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/bn-586.S @@ -1,5 +1,4 @@ #if defined(__i386__) -.file "src/crypto/bn/asm/bn-586.S" .text .globl _bn_mul_add_words .private_extern _bn_mul_add_words diff --git a/packager/third_party/boringssl/mac-x86/crypto/bn/co-586.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/co-586.S similarity index 99% rename from packager/third_party/boringssl/mac-x86/crypto/bn/co-586.S rename to packager/third_party/boringssl/mac-x86/crypto/fipsmodule/co-586.S index 7ce8e79425..858ba3743e 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/bn/co-586.S +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/co-586.S @@ -1,5 +1,4 @@ #if defined(__i386__) -.file "src/crypto/bn/asm/co-586.S" .text .globl _bn_mul_comba8 .private_extern _bn_mul_comba8 diff --git a/packager/third_party/boringssl/mac-x86/crypto/modes/ghash-x86.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/ghash-x86.S similarity index 81% rename from packager/third_party/boringssl/mac-x86/crypto/modes/ghash-x86.S rename to packager/third_party/boringssl/mac-x86/crypto/fipsmodule/ghash-x86.S index 8693b82a13..320cd42b1a 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/modes/ghash-x86.S +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/ghash-x86.S @@ -1,207 +1,5 @@ #if defined(__i386__) -.file 
"ghash-x86.S" .text -.globl _gcm_gmult_4bit_x86 -.private_extern _gcm_gmult_4bit_x86 -.align 4 -_gcm_gmult_4bit_x86: -L_gcm_gmult_4bit_x86_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - subl $84,%esp - movl 104(%esp),%edi - movl 108(%esp),%esi - movl (%edi),%ebp - movl 4(%edi),%edx - movl 8(%edi),%ecx - movl 12(%edi),%ebx - movl $0,16(%esp) - movl $471859200,20(%esp) - movl $943718400,24(%esp) - movl $610271232,28(%esp) - movl $1887436800,32(%esp) - movl $1822425088,36(%esp) - movl $1220542464,40(%esp) - movl $1423966208,44(%esp) - movl $3774873600,48(%esp) - movl $4246732800,52(%esp) - movl $3644850176,56(%esp) - movl $3311403008,60(%esp) - movl $2441084928,64(%esp) - movl $2376073216,68(%esp) - movl $2847932416,72(%esp) - movl $3051356160,76(%esp) - movl %ebp,(%esp) - movl %edx,4(%esp) - movl %ecx,8(%esp) - movl %ebx,12(%esp) - shrl $20,%ebx - andl $240,%ebx - movl 4(%esi,%ebx,1),%ebp - movl (%esi,%ebx,1),%edx - movl 12(%esi,%ebx,1),%ecx - movl 8(%esi,%ebx,1),%ebx - xorl %eax,%eax - movl $15,%edi - jmp L000x86_loop -.align 4,0x90 -L000x86_loop: - movb %bl,%al - shrdl $4,%ecx,%ebx - andb $15,%al - shrdl $4,%edx,%ecx - shrdl $4,%ebp,%edx - shrl $4,%ebp - xorl 16(%esp,%eax,4),%ebp - movb (%esp,%edi,1),%al - andb $240,%al - xorl 8(%esi,%eax,1),%ebx - xorl 12(%esi,%eax,1),%ecx - xorl (%esi,%eax,1),%edx - xorl 4(%esi,%eax,1),%ebp - decl %edi - js L001x86_break - movb %bl,%al - shrdl $4,%ecx,%ebx - andb $15,%al - shrdl $4,%edx,%ecx - shrdl $4,%ebp,%edx - shrl $4,%ebp - xorl 16(%esp,%eax,4),%ebp - movb (%esp,%edi,1),%al - shlb $4,%al - xorl 8(%esi,%eax,1),%ebx - xorl 12(%esi,%eax,1),%ecx - xorl (%esi,%eax,1),%edx - xorl 4(%esi,%eax,1),%ebp - jmp L000x86_loop -.align 4,0x90 -L001x86_break: - bswap %ebx - bswap %ecx - bswap %edx - bswap %ebp - movl 104(%esp),%edi - movl %ebx,12(%edi) - movl %ecx,8(%edi) - movl %edx,4(%edi) - movl %ebp,(%edi) - addl $84,%esp - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.globl _gcm_ghash_4bit_x86 -.private_extern _gcm_ghash_4bit_x86 -.align 4 -_gcm_ghash_4bit_x86: -L_gcm_ghash_4bit_x86_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - subl $84,%esp - movl 104(%esp),%ebx - movl 108(%esp),%esi - movl 112(%esp),%edi - movl 116(%esp),%ecx - addl %edi,%ecx - movl %ecx,116(%esp) - movl (%ebx),%ebp - movl 4(%ebx),%edx - movl 8(%ebx),%ecx - movl 12(%ebx),%ebx - movl $0,16(%esp) - movl $471859200,20(%esp) - movl $943718400,24(%esp) - movl $610271232,28(%esp) - movl $1887436800,32(%esp) - movl $1822425088,36(%esp) - movl $1220542464,40(%esp) - movl $1423966208,44(%esp) - movl $3774873600,48(%esp) - movl $4246732800,52(%esp) - movl $3644850176,56(%esp) - movl $3311403008,60(%esp) - movl $2441084928,64(%esp) - movl $2376073216,68(%esp) - movl $2847932416,72(%esp) - movl $3051356160,76(%esp) -.align 4,0x90 -L002x86_outer_loop: - xorl 12(%edi),%ebx - xorl 8(%edi),%ecx - xorl 4(%edi),%edx - xorl (%edi),%ebp - movl %ebx,12(%esp) - movl %ecx,8(%esp) - movl %edx,4(%esp) - movl %ebp,(%esp) - shrl $20,%ebx - andl $240,%ebx - movl 4(%esi,%ebx,1),%ebp - movl (%esi,%ebx,1),%edx - movl 12(%esi,%ebx,1),%ecx - movl 8(%esi,%ebx,1),%ebx - xorl %eax,%eax - movl $15,%edi - jmp L003x86_loop -.align 4,0x90 -L003x86_loop: - movb %bl,%al - shrdl $4,%ecx,%ebx - andb $15,%al - shrdl $4,%edx,%ecx - shrdl $4,%ebp,%edx - shrl $4,%ebp - xorl 16(%esp,%eax,4),%ebp - movb (%esp,%edi,1),%al - andb $240,%al - xorl 8(%esi,%eax,1),%ebx - xorl 12(%esi,%eax,1),%ecx - xorl (%esi,%eax,1),%edx - xorl 4(%esi,%eax,1),%ebp - decl %edi - js L004x86_break - movb %bl,%al - shrdl 
$4,%ecx,%ebx - andb $15,%al - shrdl $4,%edx,%ecx - shrdl $4,%ebp,%edx - shrl $4,%ebp - xorl 16(%esp,%eax,4),%ebp - movb (%esp,%edi,1),%al - shlb $4,%al - xorl 8(%esi,%eax,1),%ebx - xorl 12(%esi,%eax,1),%ecx - xorl (%esi,%eax,1),%edx - xorl 4(%esi,%eax,1),%ebp - jmp L003x86_loop -.align 4,0x90 -L004x86_break: - bswap %ebx - bswap %ecx - bswap %edx - bswap %ebp - movl 112(%esp),%edi - leal 16(%edi),%edi - cmpl 116(%esp),%edi - movl %edi,112(%esp) - jb L002x86_outer_loop - movl 104(%esp),%edi - movl %ebx,12(%edi) - movl %ecx,8(%edi) - movl %edx,4(%edi) - movl %ebp,(%edi) - addl $84,%esp - popl %edi - popl %esi - popl %ebx - popl %ebp - ret .globl _gcm_gmult_4bit_mmx .private_extern _gcm_gmult_4bit_mmx .align 4 @@ -213,10 +11,10 @@ L_gcm_gmult_4bit_mmx_begin: pushl %edi movl 20(%esp),%edi movl 24(%esp),%esi - call L005pic_point -L005pic_point: + call L000pic_point +L000pic_point: popl %eax - leal Lrem_4bit-L005pic_point(%eax),%eax + leal Lrem_4bit-L000pic_point(%eax),%eax movzbl 15(%edi),%ebx xorl %ecx,%ecx movl %ebx,%edx @@ -227,9 +25,9 @@ L005pic_point: movq 8(%esi,%ecx,1),%mm0 movq (%esi,%ecx,1),%mm1 movd %mm0,%ebx - jmp L006mmx_loop + jmp L001mmx_loop .align 4,0x90 -L006mmx_loop: +L001mmx_loop: psrlq $4,%mm0 andl $15,%ebx movq %mm1,%mm2 @@ -243,7 +41,7 @@ L006mmx_loop: pxor (%esi,%edx,1),%mm1 movl %ecx,%edx pxor %mm2,%mm0 - js L007mmx_break + js L002mmx_break shlb $4,%cl andl $15,%ebx psrlq $4,%mm0 @@ -256,9 +54,9 @@ L006mmx_loop: movd %mm0,%ebx pxor (%esi,%ecx,1),%mm1 pxor %mm2,%mm0 - jmp L006mmx_loop + jmp L001mmx_loop .align 4,0x90 -L007mmx_break: +L002mmx_break: shlb $4,%cl andl $15,%ebx psrlq $4,%mm0 @@ -314,10 +112,10 @@ L_gcm_ghash_4bit_mmx_begin: movl 28(%esp),%ecx movl 32(%esp),%edx movl %esp,%ebp - call L008pic_point -L008pic_point: + call L003pic_point +L003pic_point: popl %esi - leal Lrem_8bit-L008pic_point(%esi),%esi + leal Lrem_8bit-L003pic_point(%esi),%esi subl $544,%esp andl $-64,%esp subl $16,%esp @@ -556,7 +354,7 @@ L008pic_point: movl 8(%eax),%ebx movl 12(%eax),%edx .align 4,0x90 -L009outer: +L004outer: xorl 12(%ecx),%edx xorl 8(%ecx),%ebx pxor (%ecx),%mm6 @@ -891,7 +689,7 @@ L009outer: pshufw $27,%mm6,%mm6 bswap %ebx cmpl 552(%esp),%ecx - jne L009outer + jne L004outer movl 544(%esp),%eax movl %edx,12(%eax) movl %ebx,8(%eax) @@ -910,10 +708,10 @@ _gcm_init_clmul: L_gcm_init_clmul_begin: movl 4(%esp),%edx movl 8(%esp),%eax - call L010pic -L010pic: + call L005pic +L005pic: popl %ecx - leal Lbswap-L010pic(%ecx),%ecx + leal Lbswap-L005pic(%ecx),%ecx movdqu (%eax),%xmm2 pshufd $78,%xmm2,%xmm2 pshufd $255,%xmm2,%xmm4 @@ -978,10 +776,10 @@ _gcm_gmult_clmul: L_gcm_gmult_clmul_begin: movl 4(%esp),%eax movl 8(%esp),%edx - call L011pic -L011pic: + call L006pic +L006pic: popl %ecx - leal Lbswap-L011pic(%ecx),%ecx + leal Lbswap-L006pic(%ecx),%ecx movdqu (%eax),%xmm0 movdqa (%ecx),%xmm5 movups (%edx),%xmm2 @@ -1036,16 +834,16 @@ L_gcm_ghash_clmul_begin: movl 24(%esp),%edx movl 28(%esp),%esi movl 32(%esp),%ebx - call L012pic -L012pic: + call L007pic +L007pic: popl %ecx - leal Lbswap-L012pic(%ecx),%ecx + leal Lbswap-L007pic(%ecx),%ecx movdqu (%eax),%xmm0 movdqa (%ecx),%xmm5 movdqu (%edx),%xmm2 .byte 102,15,56,0,197 subl $16,%ebx - jz L013odd_tail + jz L008odd_tail movdqu (%esi),%xmm3 movdqu 16(%esi),%xmm6 .byte 102,15,56,0,221 @@ -1062,10 +860,10 @@ L012pic: movups 16(%edx),%xmm2 nop subl $32,%ebx - jbe L014even_tail - jmp L015mod_loop + jbe L009even_tail + jmp L010mod_loop .align 5,0x90 -L015mod_loop: +L010mod_loop: pshufd $78,%xmm0,%xmm4 movdqa %xmm0,%xmm1 pxor %xmm0,%xmm4 @@ -1120,8 
+918,8 @@ L015mod_loop: .byte 102,15,58,68,221,0 leal 32(%esi),%esi subl $32,%ebx - ja L015mod_loop -L014even_tail: + ja L010mod_loop +L009even_tail: pshufd $78,%xmm0,%xmm4 movdqa %xmm0,%xmm1 pxor %xmm0,%xmm4 @@ -1160,9 +958,9 @@ L014even_tail: psrlq $1,%xmm0 pxor %xmm1,%xmm0 testl %ebx,%ebx - jnz L016done + jnz L011done movups (%edx),%xmm2 -L013odd_tail: +L008odd_tail: movdqu (%esi),%xmm3 .byte 102,15,56,0,221 pxor %xmm3,%xmm0 @@ -1201,7 +999,7 @@ L013odd_tail: pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 -L016done: +L011done: .byte 102,15,56,0,197 movdqu %xmm0,(%eax) popl %edi diff --git a/packager/third_party/boringssl/mac-x86/crypto/md5/md5-586.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/md5-586.S similarity index 99% rename from packager/third_party/boringssl/mac-x86/crypto/md5/md5-586.S rename to packager/third_party/boringssl/mac-x86/crypto/fipsmodule/md5-586.S index 6830b16410..795e42e5c8 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/md5/md5-586.S +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/md5-586.S @@ -1,5 +1,4 @@ #if defined(__i386__) -.file "src/crypto/md5/asm/md5-586.S" .text .globl _md5_block_asm_data_order .private_extern _md5_block_asm_data_order diff --git a/packager/third_party/boringssl/mac-x86/crypto/sha/sha1-586.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/sha1-586.S similarity index 99% rename from packager/third_party/boringssl/mac-x86/crypto/sha/sha1-586.S rename to packager/third_party/boringssl/mac-x86/crypto/fipsmodule/sha1-586.S index 72a7205dea..efb6f52e32 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/sha/sha1-586.S +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/sha1-586.S @@ -1,5 +1,4 @@ #if defined(__i386__) -.file "sha1-586.S" .text .globl _sha1_block_data_order .private_extern _sha1_block_data_order diff --git a/packager/third_party/boringssl/mac-x86/crypto/sha/sha256-586.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/sha256-586.S similarity index 99% rename from packager/third_party/boringssl/mac-x86/crypto/sha/sha256-586.S rename to packager/third_party/boringssl/mac-x86/crypto/fipsmodule/sha256-586.S index 841854f7a9..7f15397e15 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/sha/sha256-586.S +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/sha256-586.S @@ -1,5 +1,4 @@ #if defined(__i386__) -.file "sha512-586.S" .text .globl _sha256_block_data_order .private_extern _sha256_block_data_order diff --git a/packager/third_party/boringssl/mac-x86/crypto/sha/sha512-586.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/sha512-586.S similarity index 99% rename from packager/third_party/boringssl/mac-x86/crypto/sha/sha512-586.S rename to packager/third_party/boringssl/mac-x86/crypto/fipsmodule/sha512-586.S index 3066100e6f..f65cb1086a 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/sha/sha512-586.S +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/sha512-586.S @@ -1,5 +1,4 @@ #if defined(__i386__) -.file "sha512-586.S" .text .globl _sha512_block_data_order .private_extern _sha512_block_data_order diff --git a/packager/third_party/boringssl/mac-x86/crypto/aes/vpaes-x86.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/vpaes-x86.S similarity index 99% rename from packager/third_party/boringssl/mac-x86/crypto/aes/vpaes-x86.S rename to packager/third_party/boringssl/mac-x86/crypto/fipsmodule/vpaes-x86.S index 8b85709dee..f49e9f0a81 100644 --- 
a/packager/third_party/boringssl/mac-x86/crypto/aes/vpaes-x86.S +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/vpaes-x86.S @@ -1,5 +1,4 @@ #if defined(__i386__) -.file "vpaes-x86.S" .text .align 6,0x90 L_vpaes_consts: diff --git a/packager/third_party/boringssl/mac-x86/crypto/bn/x86-mont.S b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/x86-mont.S similarity index 86% rename from packager/third_party/boringssl/mac-x86/crypto/bn/x86-mont.S rename to packager/third_party/boringssl/mac-x86/crypto/fipsmodule/x86-mont.S index 234034b0a0..e7353ae252 100644 --- a/packager/third_party/boringssl/mac-x86/crypto/bn/x86-mont.S +++ b/packager/third_party/boringssl/mac-x86/crypto/fipsmodule/x86-mont.S @@ -1,5 +1,4 @@ #if defined(__i386__) -.file "src/crypto/bn/asm/x86-mont.S" .text .globl _bn_mul_mont .private_extern _bn_mul_mont @@ -16,39 +15,54 @@ L_bn_mul_mont_begin: jl L000just_leave leal 20(%esp),%esi leal 24(%esp),%edx - movl %esp,%ebp addl $2,%edi negl %edi - leal -32(%esp,%edi,4),%esp + leal -32(%esp,%edi,4),%ebp negl %edi - movl %esp,%eax + movl %ebp,%eax subl %edx,%eax andl $2047,%eax - subl %eax,%esp - xorl %esp,%edx + subl %eax,%ebp + xorl %ebp,%edx andl $2048,%edx xorl $2048,%edx - subl %edx,%esp - andl $-64,%esp + subl %edx,%ebp + andl $-64,%ebp + movl %esp,%eax + subl %ebp,%eax + andl $-4096,%eax + movl %esp,%edx + leal (%ebp,%eax,1),%esp + movl (%esp),%eax + cmpl %ebp,%esp + ja L001page_walk + jmp L002page_walk_done +.align 4,0x90 +L001page_walk: + leal -4096(%esp),%esp + movl (%esp),%eax + cmpl %ebp,%esp + ja L001page_walk +L002page_walk_done: movl (%esi),%eax movl 4(%esi),%ebx movl 8(%esi),%ecx - movl 12(%esi),%edx + movl 12(%esi),%ebp movl 16(%esi),%esi movl (%esi),%esi movl %eax,4(%esp) movl %ebx,8(%esp) movl %ecx,12(%esp) - movl %edx,16(%esp) + movl %ebp,16(%esp) movl %esi,20(%esp) leal -3(%edi),%ebx - movl %ebp,24(%esp) - call L001PIC_me_up -L001PIC_me_up: + movl %edx,24(%esp) + call L003PIC_me_up +L003PIC_me_up: popl %eax - movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L001PIC_me_up(%eax),%eax + movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L003PIC_me_up(%eax),%eax btl $26,(%eax) - jnc L002non_sse2 + jnc L004non_sse2 movl $-1,%eax movd %eax,%mm7 movl 8(%esp),%esi @@ -72,7 +86,7 @@ L001PIC_me_up: psrlq $32,%mm3 incl %ecx .align 4,0x90 -L0031st: +L0051st: pmuludq %mm4,%mm0 pmuludq %mm5,%mm1 paddq %mm0,%mm2 @@ -87,7 +101,7 @@ L0031st: psrlq $32,%mm3 leal 1(%ecx),%ecx cmpl %ebx,%ecx - jl L0031st + jl L0051st pmuludq %mm4,%mm0 pmuludq %mm5,%mm1 paddq %mm0,%mm2 @@ -101,7 +115,7 @@ L0031st: paddq %mm2,%mm3 movq %mm3,32(%esp,%ebx,4) incl %edx -L004outer: +L006outer: xorl %ecx,%ecx movd (%edi,%edx,4),%mm4 movd (%esi),%mm5 @@ -123,7 +137,7 @@ L004outer: paddq %mm6,%mm2 incl %ecx decl %ebx -L005inner: +L007inner: pmuludq %mm4,%mm0 pmuludq %mm5,%mm1 paddq %mm0,%mm2 @@ -140,7 +154,7 @@ L005inner: paddq %mm6,%mm2 decl %ebx leal 1(%ecx),%ecx - jnz L005inner + jnz L007inner movl %ecx,%ebx pmuludq %mm4,%mm0 pmuludq %mm5,%mm1 @@ -158,11 +172,11 @@ L005inner: movq %mm3,32(%esp,%ebx,4) leal 1(%edx),%edx cmpl %ebx,%edx - jle L004outer + jle L006outer emms - jmp L006common_tail + jmp L008common_tail .align 4,0x90 -L002non_sse2: +L004non_sse2: movl 8(%esp),%esi leal 1(%ebx),%ebp movl 12(%esp),%edi @@ -173,12 +187,12 @@ L002non_sse2: leal 4(%edi,%ebx,4),%eax orl %edx,%ebp movl (%edi),%edi - jz L007bn_sqr_mont + jz L009bn_sqr_mont movl %eax,28(%esp) movl (%esi),%eax xorl %edx,%edx .align 4,0x90 -L008mull: +L010mull: movl %edx,%ebp mull %edi addl %eax,%ebp @@ -187,7 +201,7 @@ L008mull: movl 
(%esi,%ecx,4),%eax cmpl %ebx,%ecx movl %ebp,28(%esp,%ecx,4) - jl L008mull + jl L010mull movl %edx,%ebp mull %edi movl 20(%esp),%edi @@ -205,9 +219,9 @@ L008mull: movl 4(%esi),%eax adcl $0,%edx incl %ecx - jmp L0092ndmadd + jmp L0112ndmadd .align 4,0x90 -L0101stmadd: +L0121stmadd: movl %edx,%ebp mull %edi addl 32(%esp,%ecx,4),%ebp @@ -218,7 +232,7 @@ L0101stmadd: adcl $0,%edx cmpl %ebx,%ecx movl %ebp,28(%esp,%ecx,4) - jl L0101stmadd + jl L0121stmadd movl %edx,%ebp mull %edi addl 32(%esp,%ebx,4),%eax @@ -241,7 +255,7 @@ L0101stmadd: adcl $0,%edx movl $1,%ecx .align 4,0x90 -L0092ndmadd: +L0112ndmadd: movl %edx,%ebp mull %edi addl 32(%esp,%ecx,4),%ebp @@ -252,7 +266,7 @@ L0092ndmadd: adcl $0,%edx cmpl %ebx,%ecx movl %ebp,24(%esp,%ecx,4) - jl L0092ndmadd + jl L0112ndmadd movl %edx,%ebp mull %edi addl 32(%esp,%ebx,4),%ebp @@ -268,16 +282,16 @@ L0092ndmadd: movl %edx,32(%esp,%ebx,4) cmpl 28(%esp),%ecx movl %eax,36(%esp,%ebx,4) - je L006common_tail + je L008common_tail movl (%ecx),%edi movl 8(%esp),%esi movl %ecx,12(%esp) xorl %ecx,%ecx xorl %edx,%edx movl (%esi),%eax - jmp L0101stmadd + jmp L0121stmadd .align 4,0x90 -L007bn_sqr_mont: +L009bn_sqr_mont: movl %ebx,(%esp) movl %ecx,12(%esp) movl %edi,%eax @@ -288,7 +302,7 @@ L007bn_sqr_mont: andl $1,%ebx incl %ecx .align 4,0x90 -L011sqr: +L013sqr: movl (%esi,%ecx,4),%eax movl %edx,%ebp mull %edi @@ -300,7 +314,7 @@ L011sqr: cmpl (%esp),%ecx movl %eax,%ebx movl %ebp,28(%esp,%ecx,4) - jl L011sqr + jl L013sqr movl (%esi,%ecx,4),%eax movl %edx,%ebp mull %edi @@ -324,7 +338,7 @@ L011sqr: movl 4(%esi),%eax movl $1,%ecx .align 4,0x90 -L0123rdmadd: +L0143rdmadd: movl %edx,%ebp mull %edi addl 32(%esp,%ecx,4),%ebp @@ -343,7 +357,7 @@ L0123rdmadd: adcl $0,%edx cmpl %ebx,%ecx movl %ebp,24(%esp,%ecx,4) - jl L0123rdmadd + jl L0143rdmadd movl %edx,%ebp mull %edi addl 32(%esp,%ebx,4),%ebp @@ -359,7 +373,7 @@ L0123rdmadd: movl %edx,32(%esp,%ebx,4) cmpl %ebx,%ecx movl %eax,36(%esp,%ebx,4) - je L006common_tail + je L008common_tail movl 4(%esi,%ecx,4),%edi leal 1(%ecx),%ecx movl %edi,%eax @@ -371,12 +385,12 @@ L0123rdmadd: xorl %ebp,%ebp cmpl %ebx,%ecx leal 1(%ecx),%ecx - je L013sqrlast + je L015sqrlast movl %edx,%ebx shrl $1,%edx andl $1,%ebx .align 4,0x90 -L014sqradd: +L016sqradd: movl (%esi,%ecx,4),%eax movl %edx,%ebp mull %edi @@ -392,13 +406,13 @@ L014sqradd: cmpl (%esp),%ecx movl %ebp,28(%esp,%ecx,4) movl %eax,%ebx - jle L014sqradd + jle L016sqradd movl %edx,%ebp addl %edx,%edx shrl $31,%ebp addl %ebx,%edx adcl $0,%ebp -L013sqrlast: +L015sqrlast: movl 20(%esp),%edi movl 16(%esp),%esi imull 32(%esp),%edi @@ -413,9 +427,9 @@ L013sqrlast: adcl $0,%edx movl $1,%ecx movl 4(%esi),%eax - jmp L0123rdmadd + jmp L0143rdmadd .align 4,0x90 -L006common_tail: +L008common_tail: movl 16(%esp),%ebp movl 4(%esp),%edi leal 32(%esp),%esi @@ -423,25 +437,26 @@ L006common_tail: movl %ebx,%ecx xorl %edx,%edx .align 4,0x90 -L015sub: +L017sub: sbbl (%ebp,%edx,4),%eax movl %eax,(%edi,%edx,4) decl %ecx movl 4(%esi,%edx,4),%eax leal 1(%edx),%edx - jge L015sub + jge L017sub sbbl $0,%eax + andl %eax,%esi + notl %eax + movl %edi,%ebp + andl %eax,%ebp + orl %ebp,%esi .align 4,0x90 -L016copy: - movl (%esi,%ebx,4),%edx - movl (%edi,%ebx,4),%ebp - xorl %ebp,%edx - andl %eax,%edx - xorl %ebp,%edx - movl %ecx,(%esi,%ebx,4) - movl %edx,(%edi,%ebx,4) +L018copy: + movl (%esi,%ebx,4),%eax + movl %eax,(%edi,%ebx,4) + movl %ecx,32(%esp,%ebx,4) decl %ebx - jge L016copy + jge L018copy movl 24(%esp),%esp movl $1,%eax L000just_leave: diff --git a/packager/third_party/boringssl/mac-x86/crypto/rc4/rc4-586.S 
b/packager/third_party/boringssl/mac-x86/crypto/rc4/rc4-586.S deleted file mode 100644 index dcddc58388..0000000000 --- a/packager/third_party/boringssl/mac-x86/crypto/rc4/rc4-586.S +++ /dev/null @@ -1,350 +0,0 @@ -#if defined(__i386__) -.file "rc4-586.S" -.text -.globl _asm_RC4 -.private_extern _asm_RC4 -.align 4 -_asm_RC4: -L_asm_RC4_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 20(%esp),%edi - movl 24(%esp),%edx - movl 28(%esp),%esi - movl 32(%esp),%ebp - xorl %eax,%eax - xorl %ebx,%ebx - cmpl $0,%edx - je L000abort - movb (%edi),%al - movb 4(%edi),%bl - addl $8,%edi - leal (%esi,%edx,1),%ecx - subl %esi,%ebp - movl %ecx,24(%esp) - incb %al - cmpl $-1,256(%edi) - je L001RC4_CHAR - movl (%edi,%eax,4),%ecx - andl $-4,%edx - jz L002loop1 - movl %ebp,32(%esp) - testl $-8,%edx - jz L003go4loop4 - call L004PIC_me_up -L004PIC_me_up: - popl %ebp - movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L004PIC_me_up(%ebp),%ebp - btl $26,(%ebp) - jnc L003go4loop4 - movl 32(%esp),%ebp - andl $-8,%edx - leal -8(%esi,%edx,1),%edx - movl %edx,-4(%edi) - addb %cl,%bl - movl (%edi,%ebx,4),%edx - movl %ecx,(%edi,%ebx,4) - movl %edx,(%edi,%eax,4) - incl %eax - addl %ecx,%edx - movzbl %al,%eax - movzbl %dl,%edx - movq (%esi),%mm0 - movl (%edi,%eax,4),%ecx - movd (%edi,%edx,4),%mm2 - jmp L005loop_mmx_enter -.align 4,0x90 -L006loop_mmx: - addb %cl,%bl - psllq $56,%mm1 - movl (%edi,%ebx,4),%edx - movl %ecx,(%edi,%ebx,4) - movl %edx,(%edi,%eax,4) - incl %eax - addl %ecx,%edx - movzbl %al,%eax - movzbl %dl,%edx - pxor %mm1,%mm2 - movq (%esi),%mm0 - movq %mm2,-8(%ebp,%esi,1) - movl (%edi,%eax,4),%ecx - movd (%edi,%edx,4),%mm2 -L005loop_mmx_enter: - addb %cl,%bl - movl (%edi,%ebx,4),%edx - movl %ecx,(%edi,%ebx,4) - movl %edx,(%edi,%eax,4) - incl %eax - addl %ecx,%edx - movzbl %al,%eax - movzbl %dl,%edx - pxor %mm0,%mm2 - movl (%edi,%eax,4),%ecx - movd (%edi,%edx,4),%mm1 - addb %cl,%bl - psllq $8,%mm1 - movl (%edi,%ebx,4),%edx - movl %ecx,(%edi,%ebx,4) - movl %edx,(%edi,%eax,4) - incl %eax - addl %ecx,%edx - movzbl %al,%eax - movzbl %dl,%edx - pxor %mm1,%mm2 - movl (%edi,%eax,4),%ecx - movd (%edi,%edx,4),%mm1 - addb %cl,%bl - psllq $16,%mm1 - movl (%edi,%ebx,4),%edx - movl %ecx,(%edi,%ebx,4) - movl %edx,(%edi,%eax,4) - incl %eax - addl %ecx,%edx - movzbl %al,%eax - movzbl %dl,%edx - pxor %mm1,%mm2 - movl (%edi,%eax,4),%ecx - movd (%edi,%edx,4),%mm1 - addb %cl,%bl - psllq $24,%mm1 - movl (%edi,%ebx,4),%edx - movl %ecx,(%edi,%ebx,4) - movl %edx,(%edi,%eax,4) - incl %eax - addl %ecx,%edx - movzbl %al,%eax - movzbl %dl,%edx - pxor %mm1,%mm2 - movl (%edi,%eax,4),%ecx - movd (%edi,%edx,4),%mm1 - addb %cl,%bl - psllq $32,%mm1 - movl (%edi,%ebx,4),%edx - movl %ecx,(%edi,%ebx,4) - movl %edx,(%edi,%eax,4) - incl %eax - addl %ecx,%edx - movzbl %al,%eax - movzbl %dl,%edx - pxor %mm1,%mm2 - movl (%edi,%eax,4),%ecx - movd (%edi,%edx,4),%mm1 - addb %cl,%bl - psllq $40,%mm1 - movl (%edi,%ebx,4),%edx - movl %ecx,(%edi,%ebx,4) - movl %edx,(%edi,%eax,4) - incl %eax - addl %ecx,%edx - movzbl %al,%eax - movzbl %dl,%edx - pxor %mm1,%mm2 - movl (%edi,%eax,4),%ecx - movd (%edi,%edx,4),%mm1 - addb %cl,%bl - psllq $48,%mm1 - movl (%edi,%ebx,4),%edx - movl %ecx,(%edi,%ebx,4) - movl %edx,(%edi,%eax,4) - incl %eax - addl %ecx,%edx - movzbl %al,%eax - movzbl %dl,%edx - pxor %mm1,%mm2 - movl (%edi,%eax,4),%ecx - movd (%edi,%edx,4),%mm1 - movl %ebx,%edx - xorl %ebx,%ebx - movb %dl,%bl - cmpl -4(%edi),%esi - leal 8(%esi),%esi - jb L006loop_mmx - psllq $56,%mm1 - pxor %mm1,%mm2 - movq %mm2,-8(%ebp,%esi,1) - emms - cmpl 24(%esp),%esi - je 
L007done - jmp L002loop1 -.align 4,0x90 -L003go4loop4: - leal -4(%esi,%edx,1),%edx - movl %edx,28(%esp) -L008loop4: - addb %cl,%bl - movl (%edi,%ebx,4),%edx - movl %ecx,(%edi,%ebx,4) - movl %edx,(%edi,%eax,4) - addl %ecx,%edx - incb %al - andl $255,%edx - movl (%edi,%eax,4),%ecx - movl (%edi,%edx,4),%ebp - addb %cl,%bl - movl (%edi,%ebx,4),%edx - movl %ecx,(%edi,%ebx,4) - movl %edx,(%edi,%eax,4) - addl %ecx,%edx - incb %al - andl $255,%edx - rorl $8,%ebp - movl (%edi,%eax,4),%ecx - orl (%edi,%edx,4),%ebp - addb %cl,%bl - movl (%edi,%ebx,4),%edx - movl %ecx,(%edi,%ebx,4) - movl %edx,(%edi,%eax,4) - addl %ecx,%edx - incb %al - andl $255,%edx - rorl $8,%ebp - movl (%edi,%eax,4),%ecx - orl (%edi,%edx,4),%ebp - addb %cl,%bl - movl (%edi,%ebx,4),%edx - movl %ecx,(%edi,%ebx,4) - movl %edx,(%edi,%eax,4) - addl %ecx,%edx - incb %al - andl $255,%edx - rorl $8,%ebp - movl 32(%esp),%ecx - orl (%edi,%edx,4),%ebp - rorl $8,%ebp - xorl (%esi),%ebp - cmpl 28(%esp),%esi - movl %ebp,(%ecx,%esi,1) - leal 4(%esi),%esi - movl (%edi,%eax,4),%ecx - jb L008loop4 - cmpl 24(%esp),%esi - je L007done - movl 32(%esp),%ebp -.align 4,0x90 -L002loop1: - addb %cl,%bl - movl (%edi,%ebx,4),%edx - movl %ecx,(%edi,%ebx,4) - movl %edx,(%edi,%eax,4) - addl %ecx,%edx - incb %al - andl $255,%edx - movl (%edi,%edx,4),%edx - xorb (%esi),%dl - leal 1(%esi),%esi - movl (%edi,%eax,4),%ecx - cmpl 24(%esp),%esi - movb %dl,-1(%ebp,%esi,1) - jb L002loop1 - jmp L007done -.align 4,0x90 -L001RC4_CHAR: - movzbl (%edi,%eax,1),%ecx -L009cloop1: - addb %cl,%bl - movzbl (%edi,%ebx,1),%edx - movb %cl,(%edi,%ebx,1) - movb %dl,(%edi,%eax,1) - addb %cl,%dl - movzbl (%edi,%edx,1),%edx - addb $1,%al - xorb (%esi),%dl - leal 1(%esi),%esi - movzbl (%edi,%eax,1),%ecx - cmpl 24(%esp),%esi - movb %dl,-1(%ebp,%esi,1) - jb L009cloop1 -L007done: - decb %al - movl %ebx,-4(%edi) - movb %al,-8(%edi) -L000abort: - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.globl _asm_RC4_set_key -.private_extern _asm_RC4_set_key -.align 4 -_asm_RC4_set_key: -L_asm_RC4_set_key_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 20(%esp),%edi - movl 24(%esp),%ebp - movl 28(%esp),%esi - call L010PIC_me_up -L010PIC_me_up: - popl %edx - movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L010PIC_me_up(%edx),%edx - leal 8(%edi),%edi - leal (%esi,%ebp,1),%esi - negl %ebp - xorl %eax,%eax - movl %ebp,-4(%edi) - btl $20,(%edx) - jc L011c1stloop -.align 4,0x90 -L012w1stloop: - movl %eax,(%edi,%eax,4) - addb $1,%al - jnc L012w1stloop - xorl %ecx,%ecx - xorl %edx,%edx -.align 4,0x90 -L013w2ndloop: - movl (%edi,%ecx,4),%eax - addb (%esi,%ebp,1),%dl - addb %al,%dl - addl $1,%ebp - movl (%edi,%edx,4),%ebx - jnz L014wnowrap - movl -4(%edi),%ebp -L014wnowrap: - movl %eax,(%edi,%edx,4) - movl %ebx,(%edi,%ecx,4) - addb $1,%cl - jnc L013w2ndloop - jmp L015exit -.align 4,0x90 -L011c1stloop: - movb %al,(%edi,%eax,1) - addb $1,%al - jnc L011c1stloop - xorl %ecx,%ecx - xorl %edx,%edx - xorl %ebx,%ebx -.align 4,0x90 -L016c2ndloop: - movb (%edi,%ecx,1),%al - addb (%esi,%ebp,1),%dl - addb %al,%dl - addl $1,%ebp - movb (%edi,%edx,1),%bl - jnz L017cnowrap - movl -4(%edi),%ebp -L017cnowrap: - movb %al,(%edi,%edx,1) - movb %bl,(%edi,%ecx,1) - addb $1,%cl - jnc L016c2ndloop - movl $-1,256(%edi) -L015exit: - xorl %eax,%eax - movl %eax,-8(%edi) - movl %eax,-4(%edi) - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.section __IMPORT,__pointers,non_lazy_symbol_pointers -L_OPENSSL_ia32cap_P$non_lazy_ptr: -.indirect_symbol _OPENSSL_ia32cap_P -.long 0 -#endif diff --git 
a/packager/third_party/boringssl/mac-x86_64/crypto/bn/rsaz-avx2.S b/packager/third_party/boringssl/mac-x86_64/crypto/bn/rsaz-avx2.S deleted file mode 100644 index 8ba2019a1c..0000000000 --- a/packager/third_party/boringssl/mac-x86_64/crypto/bn/rsaz-avx2.S +++ /dev/null @@ -1,34 +0,0 @@ -#if defined(__x86_64__) -.text - -.globl _rsaz_avx2_eligible -.private_extern _rsaz_avx2_eligible - -_rsaz_avx2_eligible: - xorl %eax,%eax - .byte 0xf3,0xc3 - - -.globl _rsaz_1024_sqr_avx2 -.private_extern _rsaz_1024_sqr_avx2 -.globl _rsaz_1024_mul_avx2 -.private_extern _rsaz_1024_mul_avx2 -.globl _rsaz_1024_norm2red_avx2 -.private_extern _rsaz_1024_norm2red_avx2 -.globl _rsaz_1024_red2norm_avx2 -.private_extern _rsaz_1024_red2norm_avx2 -.globl _rsaz_1024_scatter5_avx2 -.private_extern _rsaz_1024_scatter5_avx2 -.globl _rsaz_1024_gather5_avx2 -.private_extern _rsaz_1024_gather5_avx2 - -_rsaz_1024_sqr_avx2: -_rsaz_1024_mul_avx2: -_rsaz_1024_norm2red_avx2: -_rsaz_1024_red2norm_avx2: -_rsaz_1024_scatter5_avx2: -_rsaz_1024_gather5_avx2: -.byte 0x0f,0x0b - .byte 0xf3,0xc3 - -#endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/bn/rsaz-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/bn/rsaz-x86_64.S deleted file mode 100644 index 337276f9cb..0000000000 --- a/packager/third_party/boringssl/mac-x86_64/crypto/bn/rsaz-x86_64.S +++ /dev/null @@ -1,1228 +0,0 @@ -#if defined(__x86_64__) -.text - - - -.globl _rsaz_512_sqr -.private_extern _rsaz_512_sqr - -.p2align 5 -_rsaz_512_sqr: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - - subq $128+24,%rsp -L$sqr_body: - movq %rdx,%rbp - movq (%rsi),%rdx - movq 8(%rsi),%rax - movq %rcx,128(%rsp) - jmp L$oop_sqr - -.p2align 5 -L$oop_sqr: - movl %r8d,128+8(%rsp) - - movq %rdx,%rbx - mulq %rdx - movq %rax,%r8 - movq 16(%rsi),%rax - movq %rdx,%r9 - - mulq %rbx - addq %rax,%r9 - movq 24(%rsi),%rax - movq %rdx,%r10 - adcq $0,%r10 - - mulq %rbx - addq %rax,%r10 - movq 32(%rsi),%rax - movq %rdx,%r11 - adcq $0,%r11 - - mulq %rbx - addq %rax,%r11 - movq 40(%rsi),%rax - movq %rdx,%r12 - adcq $0,%r12 - - mulq %rbx - addq %rax,%r12 - movq 48(%rsi),%rax - movq %rdx,%r13 - adcq $0,%r13 - - mulq %rbx - addq %rax,%r13 - movq 56(%rsi),%rax - movq %rdx,%r14 - adcq $0,%r14 - - mulq %rbx - addq %rax,%r14 - movq %rbx,%rax - movq %rdx,%r15 - adcq $0,%r15 - - addq %r8,%r8 - movq %r9,%rcx - adcq %r9,%r9 - - mulq %rax - movq %rax,(%rsp) - addq %rdx,%r8 - adcq $0,%r9 - - movq %r8,8(%rsp) - shrq $63,%rcx - - - movq 8(%rsi),%r8 - movq 16(%rsi),%rax - mulq %r8 - addq %rax,%r10 - movq 24(%rsi),%rax - movq %rdx,%rbx - adcq $0,%rbx - - mulq %r8 - addq %rax,%r11 - movq 32(%rsi),%rax - adcq $0,%rdx - addq %rbx,%r11 - movq %rdx,%rbx - adcq $0,%rbx - - mulq %r8 - addq %rax,%r12 - movq 40(%rsi),%rax - adcq $0,%rdx - addq %rbx,%r12 - movq %rdx,%rbx - adcq $0,%rbx - - mulq %r8 - addq %rax,%r13 - movq 48(%rsi),%rax - adcq $0,%rdx - addq %rbx,%r13 - movq %rdx,%rbx - adcq $0,%rbx - - mulq %r8 - addq %rax,%r14 - movq 56(%rsi),%rax - adcq $0,%rdx - addq %rbx,%r14 - movq %rdx,%rbx - adcq $0,%rbx - - mulq %r8 - addq %rax,%r15 - movq %r8,%rax - adcq $0,%rdx - addq %rbx,%r15 - movq %rdx,%r8 - movq %r10,%rdx - adcq $0,%r8 - - addq %rdx,%rdx - leaq (%rcx,%r10,2),%r10 - movq %r11,%rbx - adcq %r11,%r11 - - mulq %rax - addq %rax,%r9 - adcq %rdx,%r10 - adcq $0,%r11 - - movq %r9,16(%rsp) - movq %r10,24(%rsp) - shrq $63,%rbx - - - movq 16(%rsi),%r9 - movq 24(%rsi),%rax - mulq %r9 - addq %rax,%r12 - movq 32(%rsi),%rax - movq %rdx,%rcx - adcq $0,%rcx - - mulq %r9 - addq 
%rax,%r13 - movq 40(%rsi),%rax - adcq $0,%rdx - addq %rcx,%r13 - movq %rdx,%rcx - adcq $0,%rcx - - mulq %r9 - addq %rax,%r14 - movq 48(%rsi),%rax - adcq $0,%rdx - addq %rcx,%r14 - movq %rdx,%rcx - adcq $0,%rcx - - mulq %r9 - movq %r12,%r10 - leaq (%rbx,%r12,2),%r12 - addq %rax,%r15 - movq 56(%rsi),%rax - adcq $0,%rdx - addq %rcx,%r15 - movq %rdx,%rcx - adcq $0,%rcx - - mulq %r9 - shrq $63,%r10 - addq %rax,%r8 - movq %r9,%rax - adcq $0,%rdx - addq %rcx,%r8 - movq %rdx,%r9 - adcq $0,%r9 - - movq %r13,%rcx - leaq (%r10,%r13,2),%r13 - - mulq %rax - addq %rax,%r11 - adcq %rdx,%r12 - adcq $0,%r13 - - movq %r11,32(%rsp) - movq %r12,40(%rsp) - shrq $63,%rcx - - - movq 24(%rsi),%r10 - movq 32(%rsi),%rax - mulq %r10 - addq %rax,%r14 - movq 40(%rsi),%rax - movq %rdx,%rbx - adcq $0,%rbx - - mulq %r10 - addq %rax,%r15 - movq 48(%rsi),%rax - adcq $0,%rdx - addq %rbx,%r15 - movq %rdx,%rbx - adcq $0,%rbx - - mulq %r10 - movq %r14,%r12 - leaq (%rcx,%r14,2),%r14 - addq %rax,%r8 - movq 56(%rsi),%rax - adcq $0,%rdx - addq %rbx,%r8 - movq %rdx,%rbx - adcq $0,%rbx - - mulq %r10 - shrq $63,%r12 - addq %rax,%r9 - movq %r10,%rax - adcq $0,%rdx - addq %rbx,%r9 - movq %rdx,%r10 - adcq $0,%r10 - - movq %r15,%rbx - leaq (%r12,%r15,2),%r15 - - mulq %rax - addq %rax,%r13 - adcq %rdx,%r14 - adcq $0,%r15 - - movq %r13,48(%rsp) - movq %r14,56(%rsp) - shrq $63,%rbx - - - movq 32(%rsi),%r11 - movq 40(%rsi),%rax - mulq %r11 - addq %rax,%r8 - movq 48(%rsi),%rax - movq %rdx,%rcx - adcq $0,%rcx - - mulq %r11 - addq %rax,%r9 - movq 56(%rsi),%rax - adcq $0,%rdx - movq %r8,%r12 - leaq (%rbx,%r8,2),%r8 - addq %rcx,%r9 - movq %rdx,%rcx - adcq $0,%rcx - - mulq %r11 - shrq $63,%r12 - addq %rax,%r10 - movq %r11,%rax - adcq $0,%rdx - addq %rcx,%r10 - movq %rdx,%r11 - adcq $0,%r11 - - movq %r9,%rcx - leaq (%r12,%r9,2),%r9 - - mulq %rax - addq %rax,%r15 - adcq %rdx,%r8 - adcq $0,%r9 - - movq %r15,64(%rsp) - movq %r8,72(%rsp) - shrq $63,%rcx - - - movq 40(%rsi),%r12 - movq 48(%rsi),%rax - mulq %r12 - addq %rax,%r10 - movq 56(%rsi),%rax - movq %rdx,%rbx - adcq $0,%rbx - - mulq %r12 - addq %rax,%r11 - movq %r12,%rax - movq %r10,%r15 - leaq (%rcx,%r10,2),%r10 - adcq $0,%rdx - shrq $63,%r15 - addq %rbx,%r11 - movq %rdx,%r12 - adcq $0,%r12 - - movq %r11,%rbx - leaq (%r15,%r11,2),%r11 - - mulq %rax - addq %rax,%r9 - adcq %rdx,%r10 - adcq $0,%r11 - - movq %r9,80(%rsp) - movq %r10,88(%rsp) - - - movq 48(%rsi),%r13 - movq 56(%rsi),%rax - mulq %r13 - addq %rax,%r12 - movq %r13,%rax - movq %rdx,%r13 - adcq $0,%r13 - - xorq %r14,%r14 - shlq $1,%rbx - adcq %r12,%r12 - adcq %r13,%r13 - adcq %r14,%r14 - - mulq %rax - addq %rax,%r11 - adcq %rdx,%r12 - adcq $0,%r13 - - movq %r11,96(%rsp) - movq %r12,104(%rsp) - - - movq 56(%rsi),%rax - mulq %rax - addq %rax,%r13 - adcq $0,%rdx - - addq %rdx,%r14 - - movq %r13,112(%rsp) - movq %r14,120(%rsp) - - movq (%rsp),%r8 - movq 8(%rsp),%r9 - movq 16(%rsp),%r10 - movq 24(%rsp),%r11 - movq 32(%rsp),%r12 - movq 40(%rsp),%r13 - movq 48(%rsp),%r14 - movq 56(%rsp),%r15 - - call __rsaz_512_reduce - - addq 64(%rsp),%r8 - adcq 72(%rsp),%r9 - adcq 80(%rsp),%r10 - adcq 88(%rsp),%r11 - adcq 96(%rsp),%r12 - adcq 104(%rsp),%r13 - adcq 112(%rsp),%r14 - adcq 120(%rsp),%r15 - sbbq %rcx,%rcx - - call __rsaz_512_subtract - - movq %r8,%rdx - movq %r9,%rax - movl 128+8(%rsp),%r8d - movq %rdi,%rsi - - decl %r8d - jnz L$oop_sqr - - leaq 128+24+48(%rsp),%rax - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -L$sqr_epilogue: - .byte 0xf3,0xc3 - 
-.globl _rsaz_512_mul -.private_extern _rsaz_512_mul - -.p2align 5 -_rsaz_512_mul: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - - subq $128+24,%rsp -L$mul_body: -.byte 102,72,15,110,199 -.byte 102,72,15,110,201 - movq %r8,128(%rsp) - movq (%rdx),%rbx - movq %rdx,%rbp - call __rsaz_512_mul - -.byte 102,72,15,126,199 -.byte 102,72,15,126,205 - - movq (%rsp),%r8 - movq 8(%rsp),%r9 - movq 16(%rsp),%r10 - movq 24(%rsp),%r11 - movq 32(%rsp),%r12 - movq 40(%rsp),%r13 - movq 48(%rsp),%r14 - movq 56(%rsp),%r15 - - call __rsaz_512_reduce - addq 64(%rsp),%r8 - adcq 72(%rsp),%r9 - adcq 80(%rsp),%r10 - adcq 88(%rsp),%r11 - adcq 96(%rsp),%r12 - adcq 104(%rsp),%r13 - adcq 112(%rsp),%r14 - adcq 120(%rsp),%r15 - sbbq %rcx,%rcx - - call __rsaz_512_subtract - - leaq 128+24+48(%rsp),%rax - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -L$mul_epilogue: - .byte 0xf3,0xc3 - -.globl _rsaz_512_mul_gather4 -.private_extern _rsaz_512_mul_gather4 - -.p2align 5 -_rsaz_512_mul_gather4: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - - subq $152,%rsp -L$mul_gather4_body: - movd %r9d,%xmm8 - movdqa L$inc+16(%rip),%xmm1 - movdqa L$inc(%rip),%xmm0 - - pshufd $0,%xmm8,%xmm8 - movdqa %xmm1,%xmm7 - movdqa %xmm1,%xmm2 - paddd %xmm0,%xmm1 - pcmpeqd %xmm8,%xmm0 - movdqa %xmm7,%xmm3 - paddd %xmm1,%xmm2 - pcmpeqd %xmm8,%xmm1 - movdqa %xmm7,%xmm4 - paddd %xmm2,%xmm3 - pcmpeqd %xmm8,%xmm2 - movdqa %xmm7,%xmm5 - paddd %xmm3,%xmm4 - pcmpeqd %xmm8,%xmm3 - movdqa %xmm7,%xmm6 - paddd %xmm4,%xmm5 - pcmpeqd %xmm8,%xmm4 - paddd %xmm5,%xmm6 - pcmpeqd %xmm8,%xmm5 - paddd %xmm6,%xmm7 - pcmpeqd %xmm8,%xmm6 - pcmpeqd %xmm8,%xmm7 - - movdqa 0(%rdx),%xmm8 - movdqa 16(%rdx),%xmm9 - movdqa 32(%rdx),%xmm10 - movdqa 48(%rdx),%xmm11 - pand %xmm0,%xmm8 - movdqa 64(%rdx),%xmm12 - pand %xmm1,%xmm9 - movdqa 80(%rdx),%xmm13 - pand %xmm2,%xmm10 - movdqa 96(%rdx),%xmm14 - pand %xmm3,%xmm11 - movdqa 112(%rdx),%xmm15 - leaq 128(%rdx),%rbp - pand %xmm4,%xmm12 - pand %xmm5,%xmm13 - pand %xmm6,%xmm14 - pand %xmm7,%xmm15 - por %xmm10,%xmm8 - por %xmm11,%xmm9 - por %xmm12,%xmm8 - por %xmm13,%xmm9 - por %xmm14,%xmm8 - por %xmm15,%xmm9 - - por %xmm9,%xmm8 - pshufd $0x4e,%xmm8,%xmm9 - por %xmm9,%xmm8 -.byte 102,76,15,126,195 - - movq %r8,128(%rsp) - movq %rdi,128+8(%rsp) - movq %rcx,128+16(%rsp) - - movq (%rsi),%rax - movq 8(%rsi),%rcx - mulq %rbx - movq %rax,(%rsp) - movq %rcx,%rax - movq %rdx,%r8 - - mulq %rbx - addq %rax,%r8 - movq 16(%rsi),%rax - movq %rdx,%r9 - adcq $0,%r9 - - mulq %rbx - addq %rax,%r9 - movq 24(%rsi),%rax - movq %rdx,%r10 - adcq $0,%r10 - - mulq %rbx - addq %rax,%r10 - movq 32(%rsi),%rax - movq %rdx,%r11 - adcq $0,%r11 - - mulq %rbx - addq %rax,%r11 - movq 40(%rsi),%rax - movq %rdx,%r12 - adcq $0,%r12 - - mulq %rbx - addq %rax,%r12 - movq 48(%rsi),%rax - movq %rdx,%r13 - adcq $0,%r13 - - mulq %rbx - addq %rax,%r13 - movq 56(%rsi),%rax - movq %rdx,%r14 - adcq $0,%r14 - - mulq %rbx - addq %rax,%r14 - movq (%rsi),%rax - movq %rdx,%r15 - adcq $0,%r15 - - leaq 8(%rsp),%rdi - movl $7,%ecx - jmp L$oop_mul_gather - -.p2align 5 -L$oop_mul_gather: - movdqa 0(%rbp),%xmm8 - movdqa 16(%rbp),%xmm9 - movdqa 32(%rbp),%xmm10 - movdqa 48(%rbp),%xmm11 - pand %xmm0,%xmm8 - movdqa 64(%rbp),%xmm12 - pand %xmm1,%xmm9 - movdqa 80(%rbp),%xmm13 - pand %xmm2,%xmm10 - movdqa 96(%rbp),%xmm14 - pand %xmm3,%xmm11 - movdqa 112(%rbp),%xmm15 - leaq 128(%rbp),%rbp - pand %xmm4,%xmm12 - pand %xmm5,%xmm13 - pand 
%xmm6,%xmm14 - pand %xmm7,%xmm15 - por %xmm10,%xmm8 - por %xmm11,%xmm9 - por %xmm12,%xmm8 - por %xmm13,%xmm9 - por %xmm14,%xmm8 - por %xmm15,%xmm9 - - por %xmm9,%xmm8 - pshufd $0x4e,%xmm8,%xmm9 - por %xmm9,%xmm8 -.byte 102,76,15,126,195 - - mulq %rbx - addq %rax,%r8 - movq 8(%rsi),%rax - movq %r8,(%rdi) - movq %rdx,%r8 - adcq $0,%r8 - - mulq %rbx - addq %rax,%r9 - movq 16(%rsi),%rax - adcq $0,%rdx - addq %r9,%r8 - movq %rdx,%r9 - adcq $0,%r9 - - mulq %rbx - addq %rax,%r10 - movq 24(%rsi),%rax - adcq $0,%rdx - addq %r10,%r9 - movq %rdx,%r10 - adcq $0,%r10 - - mulq %rbx - addq %rax,%r11 - movq 32(%rsi),%rax - adcq $0,%rdx - addq %r11,%r10 - movq %rdx,%r11 - adcq $0,%r11 - - mulq %rbx - addq %rax,%r12 - movq 40(%rsi),%rax - adcq $0,%rdx - addq %r12,%r11 - movq %rdx,%r12 - adcq $0,%r12 - - mulq %rbx - addq %rax,%r13 - movq 48(%rsi),%rax - adcq $0,%rdx - addq %r13,%r12 - movq %rdx,%r13 - adcq $0,%r13 - - mulq %rbx - addq %rax,%r14 - movq 56(%rsi),%rax - adcq $0,%rdx - addq %r14,%r13 - movq %rdx,%r14 - adcq $0,%r14 - - mulq %rbx - addq %rax,%r15 - movq (%rsi),%rax - adcq $0,%rdx - addq %r15,%r14 - movq %rdx,%r15 - adcq $0,%r15 - - leaq 8(%rdi),%rdi - - decl %ecx - jnz L$oop_mul_gather - - movq %r8,(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - movq %r14,48(%rdi) - movq %r15,56(%rdi) - - movq 128+8(%rsp),%rdi - movq 128+16(%rsp),%rbp - - movq (%rsp),%r8 - movq 8(%rsp),%r9 - movq 16(%rsp),%r10 - movq 24(%rsp),%r11 - movq 32(%rsp),%r12 - movq 40(%rsp),%r13 - movq 48(%rsp),%r14 - movq 56(%rsp),%r15 - - call __rsaz_512_reduce - addq 64(%rsp),%r8 - adcq 72(%rsp),%r9 - adcq 80(%rsp),%r10 - adcq 88(%rsp),%r11 - adcq 96(%rsp),%r12 - adcq 104(%rsp),%r13 - adcq 112(%rsp),%r14 - adcq 120(%rsp),%r15 - sbbq %rcx,%rcx - - call __rsaz_512_subtract - - leaq 128+24+48(%rsp),%rax - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -L$mul_gather4_epilogue: - .byte 0xf3,0xc3 - -.globl _rsaz_512_mul_scatter4 -.private_extern _rsaz_512_mul_scatter4 - -.p2align 5 -_rsaz_512_mul_scatter4: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - - movl %r9d,%r9d - subq $128+24,%rsp -L$mul_scatter4_body: - leaq (%r8,%r9,8),%r8 -.byte 102,72,15,110,199 -.byte 102,72,15,110,202 -.byte 102,73,15,110,208 - movq %rcx,128(%rsp) - - movq %rdi,%rbp - movq (%rdi),%rbx - call __rsaz_512_mul - -.byte 102,72,15,126,199 -.byte 102,72,15,126,205 - - movq (%rsp),%r8 - movq 8(%rsp),%r9 - movq 16(%rsp),%r10 - movq 24(%rsp),%r11 - movq 32(%rsp),%r12 - movq 40(%rsp),%r13 - movq 48(%rsp),%r14 - movq 56(%rsp),%r15 - - call __rsaz_512_reduce - addq 64(%rsp),%r8 - adcq 72(%rsp),%r9 - adcq 80(%rsp),%r10 - adcq 88(%rsp),%r11 - adcq 96(%rsp),%r12 - adcq 104(%rsp),%r13 - adcq 112(%rsp),%r14 - adcq 120(%rsp),%r15 -.byte 102,72,15,126,214 - sbbq %rcx,%rcx - - call __rsaz_512_subtract - - movq %r8,0(%rsi) - movq %r9,128(%rsi) - movq %r10,256(%rsi) - movq %r11,384(%rsi) - movq %r12,512(%rsi) - movq %r13,640(%rsi) - movq %r14,768(%rsi) - movq %r15,896(%rsi) - - leaq 128+24+48(%rsp),%rax - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -L$mul_scatter4_epilogue: - .byte 0xf3,0xc3 - -.globl _rsaz_512_mul_by_one -.private_extern _rsaz_512_mul_by_one - -.p2align 5 -_rsaz_512_mul_by_one: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - - subq 
$128+24,%rsp -L$mul_by_one_body: - movq %rdx,%rbp - movq %rcx,128(%rsp) - - movq (%rsi),%r8 - pxor %xmm0,%xmm0 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq 32(%rsi),%r12 - movq 40(%rsi),%r13 - movq 48(%rsi),%r14 - movq 56(%rsi),%r15 - - movdqa %xmm0,(%rsp) - movdqa %xmm0,16(%rsp) - movdqa %xmm0,32(%rsp) - movdqa %xmm0,48(%rsp) - movdqa %xmm0,64(%rsp) - movdqa %xmm0,80(%rsp) - movdqa %xmm0,96(%rsp) - call __rsaz_512_reduce - movq %r8,(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - movq %r14,48(%rdi) - movq %r15,56(%rdi) - - leaq 128+24+48(%rsp),%rax - movq -48(%rax),%r15 - movq -40(%rax),%r14 - movq -32(%rax),%r13 - movq -24(%rax),%r12 - movq -16(%rax),%rbp - movq -8(%rax),%rbx - leaq (%rax),%rsp -L$mul_by_one_epilogue: - .byte 0xf3,0xc3 - - -.p2align 5 -__rsaz_512_reduce: - movq %r8,%rbx - imulq 128+8(%rsp),%rbx - movq 0(%rbp),%rax - movl $8,%ecx - jmp L$reduction_loop - -.p2align 5 -L$reduction_loop: - mulq %rbx - movq 8(%rbp),%rax - negq %r8 - movq %rdx,%r8 - adcq $0,%r8 - - mulq %rbx - addq %rax,%r9 - movq 16(%rbp),%rax - adcq $0,%rdx - addq %r9,%r8 - movq %rdx,%r9 - adcq $0,%r9 - - mulq %rbx - addq %rax,%r10 - movq 24(%rbp),%rax - adcq $0,%rdx - addq %r10,%r9 - movq %rdx,%r10 - adcq $0,%r10 - - mulq %rbx - addq %rax,%r11 - movq 32(%rbp),%rax - adcq $0,%rdx - addq %r11,%r10 - movq 128+8(%rsp),%rsi - - - adcq $0,%rdx - movq %rdx,%r11 - - mulq %rbx - addq %rax,%r12 - movq 40(%rbp),%rax - adcq $0,%rdx - imulq %r8,%rsi - addq %r12,%r11 - movq %rdx,%r12 - adcq $0,%r12 - - mulq %rbx - addq %rax,%r13 - movq 48(%rbp),%rax - adcq $0,%rdx - addq %r13,%r12 - movq %rdx,%r13 - adcq $0,%r13 - - mulq %rbx - addq %rax,%r14 - movq 56(%rbp),%rax - adcq $0,%rdx - addq %r14,%r13 - movq %rdx,%r14 - adcq $0,%r14 - - mulq %rbx - movq %rsi,%rbx - addq %rax,%r15 - movq 0(%rbp),%rax - adcq $0,%rdx - addq %r15,%r14 - movq %rdx,%r15 - adcq $0,%r15 - - decl %ecx - jne L$reduction_loop - - .byte 0xf3,0xc3 - - -.p2align 5 -__rsaz_512_subtract: - movq %r8,(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - movq %r14,48(%rdi) - movq %r15,56(%rdi) - - movq 0(%rbp),%r8 - movq 8(%rbp),%r9 - negq %r8 - notq %r9 - andq %rcx,%r8 - movq 16(%rbp),%r10 - andq %rcx,%r9 - notq %r10 - movq 24(%rbp),%r11 - andq %rcx,%r10 - notq %r11 - movq 32(%rbp),%r12 - andq %rcx,%r11 - notq %r12 - movq 40(%rbp),%r13 - andq %rcx,%r12 - notq %r13 - movq 48(%rbp),%r14 - andq %rcx,%r13 - notq %r14 - movq 56(%rbp),%r15 - andq %rcx,%r14 - notq %r15 - andq %rcx,%r15 - - addq (%rdi),%r8 - adcq 8(%rdi),%r9 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - adcq 48(%rdi),%r14 - adcq 56(%rdi),%r15 - - movq %r8,(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - movq %r14,48(%rdi) - movq %r15,56(%rdi) - - .byte 0xf3,0xc3 - - -.p2align 5 -__rsaz_512_mul: - leaq 8(%rsp),%rdi - - movq (%rsi),%rax - mulq %rbx - movq %rax,(%rdi) - movq 8(%rsi),%rax - movq %rdx,%r8 - - mulq %rbx - addq %rax,%r8 - movq 16(%rsi),%rax - movq %rdx,%r9 - adcq $0,%r9 - - mulq %rbx - addq %rax,%r9 - movq 24(%rsi),%rax - movq %rdx,%r10 - adcq $0,%r10 - - mulq %rbx - addq %rax,%r10 - movq 32(%rsi),%rax - movq %rdx,%r11 - adcq $0,%r11 - - mulq %rbx - addq %rax,%r11 - movq 40(%rsi),%rax - movq %rdx,%r12 - adcq $0,%r12 - - mulq %rbx - addq %rax,%r12 - movq 48(%rsi),%rax - movq %rdx,%r13 - adcq $0,%r13 - - mulq %rbx - addq %rax,%r13 - movq 56(%rsi),%rax - 
movq %rdx,%r14 - adcq $0,%r14 - - mulq %rbx - addq %rax,%r14 - movq (%rsi),%rax - movq %rdx,%r15 - adcq $0,%r15 - - leaq 8(%rbp),%rbp - leaq 8(%rdi),%rdi - - movl $7,%ecx - jmp L$oop_mul - -.p2align 5 -L$oop_mul: - movq (%rbp),%rbx - mulq %rbx - addq %rax,%r8 - movq 8(%rsi),%rax - movq %r8,(%rdi) - movq %rdx,%r8 - adcq $0,%r8 - - mulq %rbx - addq %rax,%r9 - movq 16(%rsi),%rax - adcq $0,%rdx - addq %r9,%r8 - movq %rdx,%r9 - adcq $0,%r9 - - mulq %rbx - addq %rax,%r10 - movq 24(%rsi),%rax - adcq $0,%rdx - addq %r10,%r9 - movq %rdx,%r10 - adcq $0,%r10 - - mulq %rbx - addq %rax,%r11 - movq 32(%rsi),%rax - adcq $0,%rdx - addq %r11,%r10 - movq %rdx,%r11 - adcq $0,%r11 - - mulq %rbx - addq %rax,%r12 - movq 40(%rsi),%rax - adcq $0,%rdx - addq %r12,%r11 - movq %rdx,%r12 - adcq $0,%r12 - - mulq %rbx - addq %rax,%r13 - movq 48(%rsi),%rax - adcq $0,%rdx - addq %r13,%r12 - movq %rdx,%r13 - adcq $0,%r13 - - mulq %rbx - addq %rax,%r14 - movq 56(%rsi),%rax - adcq $0,%rdx - addq %r14,%r13 - movq %rdx,%r14 - leaq 8(%rbp),%rbp - adcq $0,%r14 - - mulq %rbx - addq %rax,%r15 - movq (%rsi),%rax - adcq $0,%rdx - addq %r15,%r14 - movq %rdx,%r15 - adcq $0,%r15 - - leaq 8(%rdi),%rdi - - decl %ecx - jnz L$oop_mul - - movq %r8,(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - movq %r14,48(%rdi) - movq %r15,56(%rdi) - - .byte 0xf3,0xc3 - -.globl _rsaz_512_scatter4 -.private_extern _rsaz_512_scatter4 - -.p2align 4 -_rsaz_512_scatter4: - leaq (%rdi,%rdx,8),%rdi - movl $8,%r9d - jmp L$oop_scatter -.p2align 4 -L$oop_scatter: - movq (%rsi),%rax - leaq 8(%rsi),%rsi - movq %rax,(%rdi) - leaq 128(%rdi),%rdi - decl %r9d - jnz L$oop_scatter - .byte 0xf3,0xc3 - - -.globl _rsaz_512_gather4 -.private_extern _rsaz_512_gather4 - -.p2align 4 -_rsaz_512_gather4: - movd %edx,%xmm8 - movdqa L$inc+16(%rip),%xmm1 - movdqa L$inc(%rip),%xmm0 - - pshufd $0,%xmm8,%xmm8 - movdqa %xmm1,%xmm7 - movdqa %xmm1,%xmm2 - paddd %xmm0,%xmm1 - pcmpeqd %xmm8,%xmm0 - movdqa %xmm7,%xmm3 - paddd %xmm1,%xmm2 - pcmpeqd %xmm8,%xmm1 - movdqa %xmm7,%xmm4 - paddd %xmm2,%xmm3 - pcmpeqd %xmm8,%xmm2 - movdqa %xmm7,%xmm5 - paddd %xmm3,%xmm4 - pcmpeqd %xmm8,%xmm3 - movdqa %xmm7,%xmm6 - paddd %xmm4,%xmm5 - pcmpeqd %xmm8,%xmm4 - paddd %xmm5,%xmm6 - pcmpeqd %xmm8,%xmm5 - paddd %xmm6,%xmm7 - pcmpeqd %xmm8,%xmm6 - pcmpeqd %xmm8,%xmm7 - movl $8,%r9d - jmp L$oop_gather -.p2align 4 -L$oop_gather: - movdqa 0(%rsi),%xmm8 - movdqa 16(%rsi),%xmm9 - movdqa 32(%rsi),%xmm10 - movdqa 48(%rsi),%xmm11 - pand %xmm0,%xmm8 - movdqa 64(%rsi),%xmm12 - pand %xmm1,%xmm9 - movdqa 80(%rsi),%xmm13 - pand %xmm2,%xmm10 - movdqa 96(%rsi),%xmm14 - pand %xmm3,%xmm11 - movdqa 112(%rsi),%xmm15 - leaq 128(%rsi),%rsi - pand %xmm4,%xmm12 - pand %xmm5,%xmm13 - pand %xmm6,%xmm14 - pand %xmm7,%xmm15 - por %xmm10,%xmm8 - por %xmm11,%xmm9 - por %xmm12,%xmm8 - por %xmm13,%xmm9 - por %xmm14,%xmm8 - por %xmm15,%xmm9 - - por %xmm9,%xmm8 - pshufd $0x4e,%xmm8,%xmm9 - por %xmm9,%xmm8 - movq %xmm8,(%rdi) - leaq 8(%rdi),%rdi - decl %r9d - jnz L$oop_gather - .byte 0xf3,0xc3 -L$SEH_end_rsaz_512_gather4: - - -.p2align 6 -L$inc: -.long 0,0, 1,1 -.long 2,2, 2,2 -#endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/chacha/chacha-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/chacha/chacha-x86_64.S index c3554c8d13..30edc7b5e2 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/chacha/chacha-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/chacha/chacha-x86_64.S @@ -1,4 +1,4 @@ -#if defined(__x86_64__) +#if 
defined(__x86_64__) && !defined(OPENSSL_NO_ASM) .text @@ -22,6 +22,15 @@ L$rot24: .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe L$sigma: .byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0 +.p2align 6 +L$zeroz: +.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 +L$fourz: +.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 +L$incz: +.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 +L$sixteen: +.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .globl _ChaCha20_ctr32 .private_extern _ChaCha20_ctr32 @@ -41,6 +50,7 @@ _ChaCha20_ctr32: pushq %r14 pushq %r15 subq $64+24,%rsp +L$ctr32_body: movdqu (%rcx),%xmm1 @@ -278,13 +288,14 @@ L$oop_tail: jnz L$oop_tail L$done: - addq $64+24,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx + leaq 64+24+48(%rsp),%rsi + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp L$no_data: .byte 0xf3,0xc3 @@ -292,18 +303,12 @@ L$no_data: .p2align 5 ChaCha20_ssse3: L$ChaCha20_ssse3: + movq %rsp,%r9 cmpq $128,%rdx ja L$ChaCha20_4x L$do_sse3_after_all: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - - subq $64+24,%rsp + subq $64+8,%rsp movdqa L$sigma(%rip),%xmm0 movdqu (%rcx),%xmm1 movdqu 16(%rcx),%xmm2 @@ -315,7 +320,7 @@ L$do_sse3_after_all: movdqa %xmm1,16(%rsp) movdqa %xmm2,32(%rsp) movdqa %xmm3,48(%rsp) - movl $10,%ebp + movq $10,%r8 jmp L$oop_ssse3 .p2align 5 @@ -325,7 +330,7 @@ L$oop_outer_ssse3: movdqa 16(%rsp),%xmm1 movdqa 32(%rsp),%xmm2 paddd 48(%rsp),%xmm3 - movl $10,%ebp + movq $10,%r8 movdqa %xmm3,48(%rsp) jmp L$oop_ssse3 @@ -374,7 +379,7 @@ L$oop_ssse3: pshufd $78,%xmm2,%xmm2 pshufd $147,%xmm1,%xmm1 pshufd $57,%xmm3,%xmm3 - decl %ebp + decq %r8 jnz L$oop_ssse3 paddd 0(%rsp),%xmm0 paddd 16(%rsp),%xmm1 @@ -411,31 +416,27 @@ L$tail_ssse3: movdqa %xmm1,16(%rsp) movdqa %xmm2,32(%rsp) movdqa %xmm3,48(%rsp) - xorq %rbx,%rbx + xorq %r8,%r8 L$oop_tail_ssse3: - movzbl (%rsi,%rbx,1),%eax - movzbl (%rsp,%rbx,1),%ecx - leaq 1(%rbx),%rbx + movzbl (%rsi,%r8,1),%eax + movzbl (%rsp,%r8,1),%ecx + leaq 1(%r8),%r8 xorl %ecx,%eax - movb %al,-1(%rdi,%rbx,1) + movb %al,-1(%rdi,%r8,1) decq %rdx jnz L$oop_tail_ssse3 L$done_ssse3: - addq $64+24,%rsp - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx + leaq (%r9),%rsp +L$ssse3_epilogue: .byte 0xf3,0xc3 .p2align 5 ChaCha20_4x: L$ChaCha20_4x: + movq %rsp,%r9 movq %r10,%r11 shrq $32,%r10 testq $32,%r10 @@ -448,8 +449,7 @@ L$ChaCha20_4x: je L$do_sse3_after_all L$proceed4x: - leaq -120(%rsp),%r11 - subq $0x148+0,%rsp + subq $0x140+8,%rsp movdqa L$sigma(%rip),%xmm11 movdqu (%rcx),%xmm15 movdqu 16(%rcx),%xmm7 @@ -976,18 +976,18 @@ L$oop_tail4x: jnz L$oop_tail4x L$done4x: - addq $0x148+0,%rsp + leaq (%r9),%rsp +L$4x_epilogue: .byte 0xf3,0xc3 .p2align 5 ChaCha20_8x: L$ChaCha20_8x: - movq %rsp,%r10 + movq %rsp,%r9 subq $0x280+8,%rsp andq $-32,%rsp vzeroupper - movq %r10,640(%rsp) @@ -1578,7 +1578,8 @@ L$oop_tail8x: L$done8x: vzeroall - movq 640(%rsp),%rsp + leaq (%r9),%rsp +L$8x_epilogue: .byte 0xf3,0xc3 #endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S new file mode 100644 index 0000000000..c8a5262c8d --- /dev/null +++ 
b/packager/third_party/boringssl/mac-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S @@ -0,0 +1,3056 @@ +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +.data + +.p2align 4 +one: +.quad 1,0 +two: +.quad 2,0 +three: +.quad 3,0 +four: +.quad 4,0 +five: +.quad 5,0 +six: +.quad 6,0 +seven: +.quad 7,0 +eight: +.quad 8,0 + +OR_MASK: +.long 0x00000000,0x00000000,0x00000000,0x80000000 +poly: +.quad 0x1, 0xc200000000000000 +mask: +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d +con1: +.long 1,1,1,1 +con2: +.long 0x1b,0x1b,0x1b,0x1b +con3: +.byte -1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7 +and_mask: +.long 0,0xffffffff, 0xffffffff, 0xffffffff +.text + +.p2align 4 +GFMUL: + + vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 + vpclmulqdq $0x11,%xmm1,%xmm0,%xmm5 + vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 + vpclmulqdq $0x01,%xmm1,%xmm0,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $8,%xmm3,%xmm4 + vpsrldq $8,%xmm3,%xmm3 + vpxor %xmm4,%xmm2,%xmm2 + vpxor %xmm3,%xmm5,%xmm5 + + vpclmulqdq $0x10,poly(%rip),%xmm2,%xmm3 + vpshufd $78,%xmm2,%xmm4 + vpxor %xmm4,%xmm3,%xmm2 + + vpclmulqdq $0x10,poly(%rip),%xmm2,%xmm3 + vpshufd $78,%xmm2,%xmm4 + vpxor %xmm4,%xmm3,%xmm2 + + vpxor %xmm5,%xmm2,%xmm0 + .byte 0xf3,0xc3 + + +.globl _aesgcmsiv_htable_init +.private_extern _aesgcmsiv_htable_init + +.p2align 4 +_aesgcmsiv_htable_init: + + vmovdqa (%rsi),%xmm0 + vmovdqa %xmm0,%xmm1 + vmovdqa %xmm0,(%rdi) + call GFMUL + vmovdqa %xmm0,16(%rdi) + call GFMUL + vmovdqa %xmm0,32(%rdi) + call GFMUL + vmovdqa %xmm0,48(%rdi) + call GFMUL + vmovdqa %xmm0,64(%rdi) + call GFMUL + vmovdqa %xmm0,80(%rdi) + call GFMUL + vmovdqa %xmm0,96(%rdi) + call GFMUL + vmovdqa %xmm0,112(%rdi) + .byte 0xf3,0xc3 + + +.globl _aesgcmsiv_htable6_init +.private_extern _aesgcmsiv_htable6_init + +.p2align 4 +_aesgcmsiv_htable6_init: + + vmovdqa (%rsi),%xmm0 + vmovdqa %xmm0,%xmm1 + vmovdqa %xmm0,(%rdi) + call GFMUL + vmovdqa %xmm0,16(%rdi) + call GFMUL + vmovdqa %xmm0,32(%rdi) + call GFMUL + vmovdqa %xmm0,48(%rdi) + call GFMUL + vmovdqa %xmm0,64(%rdi) + call GFMUL + vmovdqa %xmm0,80(%rdi) + .byte 0xf3,0xc3 + + +.globl _aesgcmsiv_htable_polyval +.private_extern _aesgcmsiv_htable_polyval + +.p2align 4 +_aesgcmsiv_htable_polyval: + + testq %rdx,%rdx + jnz L$htable_polyval_start + .byte 0xf3,0xc3 + +L$htable_polyval_start: + vzeroall + + + + movq %rdx,%r11 + andq $127,%r11 + + jz L$htable_polyval_no_prefix + + vpxor %xmm9,%xmm9,%xmm9 + vmovdqa (%rcx),%xmm1 + subq %r11,%rdx + + subq $16,%r11 + + + vmovdqu (%rsi),%xmm0 + vpxor %xmm1,%xmm0,%xmm0 + + vpclmulqdq $0x01,(%rdi,%r11,1),%xmm0,%xmm5 + vpclmulqdq $0x00,(%rdi,%r11,1),%xmm0,%xmm3 + vpclmulqdq $0x11,(%rdi,%r11,1),%xmm0,%xmm4 + vpclmulqdq $0x10,(%rdi,%r11,1),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + leaq 16(%rsi),%rsi + testq %r11,%r11 + jnz L$htable_polyval_prefix_loop + jmp L$htable_polyval_prefix_complete + + +.p2align 6 +L$htable_polyval_prefix_loop: + subq $16,%r11 + + vmovdqu (%rsi),%xmm0 + + vpclmulqdq $0x00,(%rdi,%r11,1),%xmm0,%xmm6 + vpxor %xmm6,%xmm3,%xmm3 + vpclmulqdq $0x11,(%rdi,%r11,1),%xmm0,%xmm6 + vpxor %xmm6,%xmm4,%xmm4 + vpclmulqdq $0x01,(%rdi,%r11,1),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x10,(%rdi,%r11,1),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + testq %r11,%r11 + + leaq 16(%rsi),%rsi + + jnz L$htable_polyval_prefix_loop + +L$htable_polyval_prefix_complete: + vpsrldq $8,%xmm5,%xmm6 + vpslldq $8,%xmm5,%xmm5 + + vpxor %xmm6,%xmm4,%xmm9 + vpxor %xmm5,%xmm3,%xmm1 + + jmp L$htable_polyval_main_loop + +L$htable_polyval_no_prefix: + + + + + vpxor %xmm1,%xmm1,%xmm1 + vmovdqa (%rcx),%xmm9 + +.p2align 6 
+L$htable_polyval_main_loop: + subq $0x80,%rdx + jb L$htable_polyval_out + + vmovdqu 112(%rsi),%xmm0 + + vpclmulqdq $0x01,(%rdi),%xmm0,%xmm5 + vpclmulqdq $0x00,(%rdi),%xmm0,%xmm3 + vpclmulqdq $0x11,(%rdi),%xmm0,%xmm4 + vpclmulqdq $0x10,(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + + vmovdqu 96(%rsi),%xmm0 + vpclmulqdq $0x01,16(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x00,16(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm3,%xmm3 + vpclmulqdq $0x11,16(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm4,%xmm4 + vpclmulqdq $0x10,16(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + + + vmovdqu 80(%rsi),%xmm0 + + vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm7 + vpalignr $8,%xmm1,%xmm1,%xmm1 + + vpclmulqdq $0x01,32(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x00,32(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm3,%xmm3 + vpclmulqdq $0x11,32(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm4,%xmm4 + vpclmulqdq $0x10,32(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + + vpxor %xmm7,%xmm1,%xmm1 + + vmovdqu 64(%rsi),%xmm0 + + vpclmulqdq $0x01,48(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x00,48(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm3,%xmm3 + vpclmulqdq $0x11,48(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm4,%xmm4 + vpclmulqdq $0x10,48(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + + vmovdqu 48(%rsi),%xmm0 + + vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm7 + vpalignr $8,%xmm1,%xmm1,%xmm1 + + vpclmulqdq $0x01,64(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x00,64(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm3,%xmm3 + vpclmulqdq $0x11,64(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm4,%xmm4 + vpclmulqdq $0x10,64(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + + vpxor %xmm7,%xmm1,%xmm1 + + vmovdqu 32(%rsi),%xmm0 + + vpclmulqdq $0x01,80(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x00,80(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm3,%xmm3 + vpclmulqdq $0x11,80(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm4,%xmm4 + vpclmulqdq $0x10,80(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + + vpxor %xmm9,%xmm1,%xmm1 + + vmovdqu 16(%rsi),%xmm0 + + vpclmulqdq $0x01,96(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x00,96(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm3,%xmm3 + vpclmulqdq $0x11,96(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm4,%xmm4 + vpclmulqdq $0x10,96(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + + vmovdqu 0(%rsi),%xmm0 + vpxor %xmm1,%xmm0,%xmm0 + + vpclmulqdq $0x01,112(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x00,112(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm3,%xmm3 + vpclmulqdq $0x11,112(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm4,%xmm4 + vpclmulqdq $0x10,112(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + + vpsrldq $8,%xmm5,%xmm6 + vpslldq $8,%xmm5,%xmm5 + + vpxor %xmm6,%xmm4,%xmm9 + vpxor %xmm5,%xmm3,%xmm1 + + leaq 128(%rsi),%rsi + jmp L$htable_polyval_main_loop + + + +L$htable_polyval_out: + vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm6 + vpalignr $8,%xmm1,%xmm1,%xmm1 + vpxor %xmm6,%xmm1,%xmm1 + + vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm6 + vpalignr $8,%xmm1,%xmm1,%xmm1 + vpxor %xmm6,%xmm1,%xmm1 + vpxor %xmm9,%xmm1,%xmm1 + + vmovdqu %xmm1,(%rcx) + vzeroupper + .byte 0xf3,0xc3 + + +.globl _aesgcmsiv_polyval_horner +.private_extern _aesgcmsiv_polyval_horner + +.p2align 4 +_aesgcmsiv_polyval_horner: + + testq %rcx,%rcx + jnz L$polyval_horner_start + .byte 0xf3,0xc3 + +L$polyval_horner_start: + + + + xorq %r10,%r10 + shlq $4,%rcx + + vmovdqa (%rsi),%xmm1 + vmovdqa (%rdi),%xmm0 + +L$polyval_horner_loop: + vpxor (%rdx,%r10,1),%xmm0,%xmm0 + call GFMUL + + addq $16,%r10 + cmpq %r10,%rcx + jne L$polyval_horner_loop + + + vmovdqa %xmm0,(%rdi) 
+ .byte 0xf3,0xc3 + + +.globl _aes128gcmsiv_aes_ks +.private_extern _aes128gcmsiv_aes_ks + +.p2align 4 +_aes128gcmsiv_aes_ks: + + vmovdqu (%rdi),%xmm1 + vmovdqa %xmm1,(%rsi) + + vmovdqa con1(%rip),%xmm0 + vmovdqa mask(%rip),%xmm15 + + movq $8,%rax + +L$ks128_loop: + addq $16,%rsi + subq $1,%rax + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpslldq $4,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpslldq $4,%xmm3,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpslldq $4,%xmm3,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm1,(%rsi) + jne L$ks128_loop + + vmovdqa con2(%rip),%xmm0 + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpslldq $4,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpslldq $4,%xmm3,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpslldq $4,%xmm3,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm1,16(%rsi) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslldq $4,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpslldq $4,%xmm3,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpslldq $4,%xmm3,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm1,32(%rsi) + .byte 0xf3,0xc3 + + +.globl _aes256gcmsiv_aes_ks +.private_extern _aes256gcmsiv_aes_ks + +.p2align 4 +_aes256gcmsiv_aes_ks: + + vmovdqu (%rdi),%xmm1 + vmovdqu 16(%rdi),%xmm3 + vmovdqa %xmm1,(%rsi) + vmovdqa %xmm3,16(%rsi) + vmovdqa con1(%rip),%xmm0 + vmovdqa mask(%rip),%xmm15 + vpxor %xmm14,%xmm14,%xmm14 + movq $6,%rax + +L$ks256_loop: + addq $32,%rsi + subq $1,%rax + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm1,(%rsi) + vpshufd $0xff,%xmm1,%xmm2 + vaesenclast %xmm14,%xmm2,%xmm2 + vpsllq $32,%xmm3,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpshufb con3(%rip),%xmm3,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpxor %xmm2,%xmm3,%xmm3 + vmovdqa %xmm3,16(%rsi) + jne L$ks256_loop + + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpsllq $32,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm1,32(%rsi) + .byte 0xf3,0xc3 + +.globl _aes128gcmsiv_aes_ks_enc_x1 +.private_extern _aes128gcmsiv_aes_ks_enc_x1 + +.p2align 4 +_aes128gcmsiv_aes_ks_enc_x1: + + vmovdqa (%rcx),%xmm1 + vmovdqa 0(%rdi),%xmm4 + + vmovdqa %xmm1,(%rdx) + vpxor %xmm1,%xmm4,%xmm4 + + vmovdqa con1(%rip),%xmm0 + vmovdqa mask(%rip),%xmm15 + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,16(%rdx) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,32(%rdx) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,48(%rdx) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb 
con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,64(%rdx) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,80(%rdx) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,96(%rdx) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,112(%rdx) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,128(%rdx) + + + vmovdqa con2(%rip),%xmm0 + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,144(%rdx) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenclast %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,160(%rdx) + + + vmovdqa %xmm4,0(%rsi) + .byte 0xf3,0xc3 + + +.globl _aes128gcmsiv_kdf +.private_extern _aes128gcmsiv_kdf + +.p2align 4 +_aes128gcmsiv_kdf: + + + + + + vmovdqa (%rdx),%xmm1 + vmovdqa 0(%rdi),%xmm9 + vmovdqa and_mask(%rip),%xmm12 + vmovdqa one(%rip),%xmm13 + vpshufd $0x90,%xmm9,%xmm9 + vpand %xmm12,%xmm9,%xmm9 + vpaddd %xmm13,%xmm9,%xmm10 + vpaddd %xmm13,%xmm10,%xmm11 + vpaddd %xmm13,%xmm11,%xmm12 + + vpxor %xmm1,%xmm9,%xmm9 + vpxor %xmm1,%xmm10,%xmm10 + vpxor %xmm1,%xmm11,%xmm11 + vpxor %xmm1,%xmm12,%xmm12 + + vmovdqa 16(%rdx),%xmm1 + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + + vmovdqa 32(%rdx),%xmm2 + vaesenc %xmm2,%xmm9,%xmm9 + vaesenc %xmm2,%xmm10,%xmm10 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + + vmovdqa 48(%rdx),%xmm1 + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + + vmovdqa 64(%rdx),%xmm2 + vaesenc %xmm2,%xmm9,%xmm9 + vaesenc %xmm2,%xmm10,%xmm10 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + + vmovdqa 80(%rdx),%xmm1 + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + + vmovdqa 96(%rdx),%xmm2 + vaesenc %xmm2,%xmm9,%xmm9 + vaesenc %xmm2,%xmm10,%xmm10 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + + vmovdqa 112(%rdx),%xmm1 + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + + vmovdqa 128(%rdx),%xmm2 + vaesenc %xmm2,%xmm9,%xmm9 + vaesenc %xmm2,%xmm10,%xmm10 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + + vmovdqa 144(%rdx),%xmm1 + vaesenc %xmm1,%xmm9,%xmm9 + 
vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + + vmovdqa 160(%rdx),%xmm2 + vaesenclast %xmm2,%xmm9,%xmm9 + vaesenclast %xmm2,%xmm10,%xmm10 + vaesenclast %xmm2,%xmm11,%xmm11 + vaesenclast %xmm2,%xmm12,%xmm12 + + + vmovdqa %xmm9,0(%rsi) + vmovdqa %xmm10,16(%rsi) + vmovdqa %xmm11,32(%rsi) + vmovdqa %xmm12,48(%rsi) + .byte 0xf3,0xc3 + + +.globl _aes128gcmsiv_enc_msg_x4 +.private_extern _aes128gcmsiv_enc_msg_x4 + +.p2align 4 +_aes128gcmsiv_enc_msg_x4: + + testq %r8,%r8 + jnz L$128_enc_msg_x4_start + .byte 0xf3,0xc3 + +L$128_enc_msg_x4_start: + pushq %r12 + + pushq %r13 + + + shrq $4,%r8 + movq %r8,%r10 + shlq $62,%r10 + shrq $62,%r10 + + + vmovdqa (%rdx),%xmm15 + vpor OR_MASK(%rip),%xmm15,%xmm15 + + vmovdqu four(%rip),%xmm4 + vmovdqa %xmm15,%xmm0 + vpaddd one(%rip),%xmm15,%xmm1 + vpaddd two(%rip),%xmm15,%xmm2 + vpaddd three(%rip),%xmm15,%xmm3 + + shrq $2,%r8 + je L$128_enc_msg_x4_check_remainder + + subq $64,%rsi + subq $64,%rdi + +L$128_enc_msg_x4_loop1: + addq $64,%rsi + addq $64,%rdi + + vmovdqa %xmm0,%xmm5 + vmovdqa %xmm1,%xmm6 + vmovdqa %xmm2,%xmm7 + vmovdqa %xmm3,%xmm8 + + vpxor (%rcx),%xmm5,%xmm5 + vpxor (%rcx),%xmm6,%xmm6 + vpxor (%rcx),%xmm7,%xmm7 + vpxor (%rcx),%xmm8,%xmm8 + + vmovdqu 16(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vpaddd %xmm4,%xmm0,%xmm0 + vmovdqu 32(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vpaddd %xmm4,%xmm1,%xmm1 + vmovdqu 48(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vpaddd %xmm4,%xmm2,%xmm2 + vmovdqu 64(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vpaddd %xmm4,%xmm3,%xmm3 + + vmovdqu 80(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 96(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 112(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 128(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 144(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 160(%rcx),%xmm12 + vaesenclast %xmm12,%xmm5,%xmm5 + vaesenclast %xmm12,%xmm6,%xmm6 + vaesenclast %xmm12,%xmm7,%xmm7 + vaesenclast %xmm12,%xmm8,%xmm8 + + + + vpxor 0(%rdi),%xmm5,%xmm5 + vpxor 16(%rdi),%xmm6,%xmm6 + vpxor 32(%rdi),%xmm7,%xmm7 + vpxor 48(%rdi),%xmm8,%xmm8 + + subq $1,%r8 + + vmovdqu %xmm5,0(%rsi) + vmovdqu %xmm6,16(%rsi) + vmovdqu %xmm7,32(%rsi) + vmovdqu %xmm8,48(%rsi) + + jne L$128_enc_msg_x4_loop1 + + addq $64,%rsi + addq $64,%rdi + +L$128_enc_msg_x4_check_remainder: + cmpq $0,%r10 + je L$128_enc_msg_x4_out + +L$128_enc_msg_x4_loop2: + + + vmovdqa %xmm0,%xmm5 + vpaddd one(%rip),%xmm0,%xmm0 + + vpxor (%rcx),%xmm5,%xmm5 + vaesenc 16(%rcx),%xmm5,%xmm5 + vaesenc 32(%rcx),%xmm5,%xmm5 + vaesenc 48(%rcx),%xmm5,%xmm5 + vaesenc 64(%rcx),%xmm5,%xmm5 + vaesenc 80(%rcx),%xmm5,%xmm5 + vaesenc 96(%rcx),%xmm5,%xmm5 + vaesenc 112(%rcx),%xmm5,%xmm5 + vaesenc 128(%rcx),%xmm5,%xmm5 
+ vaesenc 144(%rcx),%xmm5,%xmm5 + vaesenclast 160(%rcx),%xmm5,%xmm5 + + + vpxor (%rdi),%xmm5,%xmm5 + vmovdqu %xmm5,(%rsi) + + addq $16,%rdi + addq $16,%rsi + + subq $1,%r10 + jne L$128_enc_msg_x4_loop2 + +L$128_enc_msg_x4_out: + popq %r13 + + popq %r12 + + .byte 0xf3,0xc3 + + +.globl _aes128gcmsiv_enc_msg_x8 +.private_extern _aes128gcmsiv_enc_msg_x8 + +.p2align 4 +_aes128gcmsiv_enc_msg_x8: + + testq %r8,%r8 + jnz L$128_enc_msg_x8_start + .byte 0xf3,0xc3 + +L$128_enc_msg_x8_start: + pushq %r12 + + pushq %r13 + + pushq %rbp + + movq %rsp,%rbp + + + + subq $128,%rsp + andq $-64,%rsp + + shrq $4,%r8 + movq %r8,%r10 + shlq $61,%r10 + shrq $61,%r10 + + + vmovdqu (%rdx),%xmm1 + vpor OR_MASK(%rip),%xmm1,%xmm1 + + + vpaddd seven(%rip),%xmm1,%xmm0 + vmovdqu %xmm0,(%rsp) + vpaddd one(%rip),%xmm1,%xmm9 + vpaddd two(%rip),%xmm1,%xmm10 + vpaddd three(%rip),%xmm1,%xmm11 + vpaddd four(%rip),%xmm1,%xmm12 + vpaddd five(%rip),%xmm1,%xmm13 + vpaddd six(%rip),%xmm1,%xmm14 + vmovdqa %xmm1,%xmm0 + + shrq $3,%r8 + je L$128_enc_msg_x8_check_remainder + + subq $128,%rsi + subq $128,%rdi + +L$128_enc_msg_x8_loop1: + addq $128,%rsi + addq $128,%rdi + + vmovdqa %xmm0,%xmm1 + vmovdqa %xmm9,%xmm2 + vmovdqa %xmm10,%xmm3 + vmovdqa %xmm11,%xmm4 + vmovdqa %xmm12,%xmm5 + vmovdqa %xmm13,%xmm6 + vmovdqa %xmm14,%xmm7 + + vmovdqu (%rsp),%xmm8 + + vpxor (%rcx),%xmm1,%xmm1 + vpxor (%rcx),%xmm2,%xmm2 + vpxor (%rcx),%xmm3,%xmm3 + vpxor (%rcx),%xmm4,%xmm4 + vpxor (%rcx),%xmm5,%xmm5 + vpxor (%rcx),%xmm6,%xmm6 + vpxor (%rcx),%xmm7,%xmm7 + vpxor (%rcx),%xmm8,%xmm8 + + vmovdqu 16(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vmovdqu (%rsp),%xmm14 + vpaddd eight(%rip),%xmm14,%xmm14 + vmovdqu %xmm14,(%rsp) + vmovdqu 32(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpsubd one(%rip),%xmm14,%xmm14 + vmovdqu 48(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm0,%xmm0 + vmovdqu 64(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm9,%xmm9 + vmovdqu 80(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm10,%xmm10 + vmovdqu 96(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm11,%xmm11 + vmovdqu 112(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc 
%xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm12,%xmm12 + vmovdqu 128(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm13,%xmm13 + vmovdqu 144(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vmovdqu 160(%rcx),%xmm15 + vaesenclast %xmm15,%xmm1,%xmm1 + vaesenclast %xmm15,%xmm2,%xmm2 + vaesenclast %xmm15,%xmm3,%xmm3 + vaesenclast %xmm15,%xmm4,%xmm4 + vaesenclast %xmm15,%xmm5,%xmm5 + vaesenclast %xmm15,%xmm6,%xmm6 + vaesenclast %xmm15,%xmm7,%xmm7 + vaesenclast %xmm15,%xmm8,%xmm8 + + + + vpxor 0(%rdi),%xmm1,%xmm1 + vpxor 16(%rdi),%xmm2,%xmm2 + vpxor 32(%rdi),%xmm3,%xmm3 + vpxor 48(%rdi),%xmm4,%xmm4 + vpxor 64(%rdi),%xmm5,%xmm5 + vpxor 80(%rdi),%xmm6,%xmm6 + vpxor 96(%rdi),%xmm7,%xmm7 + vpxor 112(%rdi),%xmm8,%xmm8 + + decq %r8 + + vmovdqu %xmm1,0(%rsi) + vmovdqu %xmm2,16(%rsi) + vmovdqu %xmm3,32(%rsi) + vmovdqu %xmm4,48(%rsi) + vmovdqu %xmm5,64(%rsi) + vmovdqu %xmm6,80(%rsi) + vmovdqu %xmm7,96(%rsi) + vmovdqu %xmm8,112(%rsi) + + jne L$128_enc_msg_x8_loop1 + + addq $128,%rsi + addq $128,%rdi + +L$128_enc_msg_x8_check_remainder: + cmpq $0,%r10 + je L$128_enc_msg_x8_out + +L$128_enc_msg_x8_loop2: + + + vmovdqa %xmm0,%xmm1 + vpaddd one(%rip),%xmm0,%xmm0 + + vpxor (%rcx),%xmm1,%xmm1 + vaesenc 16(%rcx),%xmm1,%xmm1 + vaesenc 32(%rcx),%xmm1,%xmm1 + vaesenc 48(%rcx),%xmm1,%xmm1 + vaesenc 64(%rcx),%xmm1,%xmm1 + vaesenc 80(%rcx),%xmm1,%xmm1 + vaesenc 96(%rcx),%xmm1,%xmm1 + vaesenc 112(%rcx),%xmm1,%xmm1 + vaesenc 128(%rcx),%xmm1,%xmm1 + vaesenc 144(%rcx),%xmm1,%xmm1 + vaesenclast 160(%rcx),%xmm1,%xmm1 + + + vpxor (%rdi),%xmm1,%xmm1 + + vmovdqu %xmm1,(%rsi) + + addq $16,%rdi + addq $16,%rsi + + decq %r10 + jne L$128_enc_msg_x8_loop2 + +L$128_enc_msg_x8_out: + movq %rbp,%rsp + + popq %rbp + + popq %r13 + + popq %r12 + + .byte 0xf3,0xc3 + + +.globl _aes128gcmsiv_dec +.private_extern _aes128gcmsiv_dec + +.p2align 4 +_aes128gcmsiv_dec: + + testq $~15,%r9 + jnz L$128_dec_start + .byte 0xf3,0xc3 + +L$128_dec_start: + vzeroupper + vmovdqa (%rdx),%xmm0 + movq %rdx,%rax + + leaq 32(%rax),%rax + leaq 32(%rcx),%rcx + + + vmovdqu (%rdi,%r9,1),%xmm15 + vpor OR_MASK(%rip),%xmm15,%xmm15 + andq $~15,%r9 + + + cmpq $96,%r9 + jb L$128_dec_loop2 + + + subq $96,%r9 + vmovdqa %xmm15,%xmm7 + vpaddd one(%rip),%xmm7,%xmm8 + vpaddd two(%rip),%xmm7,%xmm9 + vpaddd one(%rip),%xmm9,%xmm10 + vpaddd two(%rip),%xmm9,%xmm11 + vpaddd one(%rip),%xmm11,%xmm12 + vpaddd two(%rip),%xmm11,%xmm15 + + vpxor (%r8),%xmm7,%xmm7 + vpxor (%r8),%xmm8,%xmm8 + vpxor (%r8),%xmm9,%xmm9 + vpxor (%r8),%xmm10,%xmm10 + vpxor (%r8),%xmm11,%xmm11 + vpxor (%r8),%xmm12,%xmm12 + + vmovdqu 16(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 32(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 48(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc 
%xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 64(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 80(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 96(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 112(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 128(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 144(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 160(%r8),%xmm4 + vaesenclast %xmm4,%xmm7,%xmm7 + vaesenclast %xmm4,%xmm8,%xmm8 + vaesenclast %xmm4,%xmm9,%xmm9 + vaesenclast %xmm4,%xmm10,%xmm10 + vaesenclast %xmm4,%xmm11,%xmm11 + vaesenclast %xmm4,%xmm12,%xmm12 + + + vpxor 0(%rdi),%xmm7,%xmm7 + vpxor 16(%rdi),%xmm8,%xmm8 + vpxor 32(%rdi),%xmm9,%xmm9 + vpxor 48(%rdi),%xmm10,%xmm10 + vpxor 64(%rdi),%xmm11,%xmm11 + vpxor 80(%rdi),%xmm12,%xmm12 + + vmovdqu %xmm7,0(%rsi) + vmovdqu %xmm8,16(%rsi) + vmovdqu %xmm9,32(%rsi) + vmovdqu %xmm10,48(%rsi) + vmovdqu %xmm11,64(%rsi) + vmovdqu %xmm12,80(%rsi) + + addq $96,%rdi + addq $96,%rsi + jmp L$128_dec_loop1 + + +.p2align 6 +L$128_dec_loop1: + cmpq $96,%r9 + jb L$128_dec_finish_96 + subq $96,%r9 + + vmovdqa %xmm12,%xmm6 + vmovdqa %xmm11,16-32(%rax) + vmovdqa %xmm10,32-32(%rax) + vmovdqa %xmm9,48-32(%rax) + vmovdqa %xmm8,64-32(%rax) + vmovdqa %xmm7,80-32(%rax) + + vmovdqa %xmm15,%xmm7 + vpaddd one(%rip),%xmm7,%xmm8 + vpaddd two(%rip),%xmm7,%xmm9 + vpaddd one(%rip),%xmm9,%xmm10 + vpaddd two(%rip),%xmm9,%xmm11 + vpaddd one(%rip),%xmm11,%xmm12 + vpaddd two(%rip),%xmm11,%xmm15 + + vmovdqa (%r8),%xmm4 + vpxor %xmm4,%xmm7,%xmm7 + vpxor %xmm4,%xmm8,%xmm8 + vpxor %xmm4,%xmm9,%xmm9 + vpxor %xmm4,%xmm10,%xmm10 + vpxor %xmm4,%xmm11,%xmm11 + vpxor %xmm4,%xmm12,%xmm12 + + vmovdqu 0-32(%rcx),%xmm4 + vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2 + vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3 + vpclmulqdq $0x01,%xmm4,%xmm6,%xmm1 + vpclmulqdq $0x10,%xmm4,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 16(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu -16(%rax),%xmm6 + vmovdqu -16(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 32(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 0(%rax),%xmm6 + vmovdqu 0(%rcx),%xmm13 + + vpclmulqdq 
$0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 48(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 16(%rax),%xmm6 + vmovdqu 16(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 64(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 32(%rax),%xmm6 + vmovdqu 32(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 80(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 96(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 112(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + + vmovdqa 80-32(%rax),%xmm6 + vpxor %xmm0,%xmm6,%xmm6 + vmovdqu 80-32(%rcx),%xmm5 + + vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 128(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + + vpsrldq $8,%xmm1,%xmm4 + vpxor %xmm4,%xmm2,%xmm5 + vpslldq $8,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm0 + + vmovdqa poly(%rip),%xmm3 + + vmovdqu 144(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 160(%r8),%xmm6 + vpalignr $8,%xmm0,%xmm0,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm2,%xmm0 + + vpxor 0(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm7,%xmm7 + vpxor 16(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm8,%xmm8 + vpxor 32(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm9,%xmm9 + vpxor 48(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm10,%xmm10 + vpxor 64(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm11,%xmm11 + vpxor 80(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm12,%xmm12 + + vpalignr $8,%xmm0,%xmm0,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm2,%xmm0 + + vmovdqu %xmm7,0(%rsi) + vmovdqu %xmm8,16(%rsi) + vmovdqu %xmm9,32(%rsi) + vmovdqu %xmm10,48(%rsi) + vmovdqu %xmm11,64(%rsi) + vmovdqu %xmm12,80(%rsi) + + vpxor %xmm5,%xmm0,%xmm0 + + leaq 96(%rdi),%rdi + leaq 
96(%rsi),%rsi + jmp L$128_dec_loop1 + +L$128_dec_finish_96: + vmovdqa %xmm12,%xmm6 + vmovdqa %xmm11,16-32(%rax) + vmovdqa %xmm10,32-32(%rax) + vmovdqa %xmm9,48-32(%rax) + vmovdqa %xmm8,64-32(%rax) + vmovdqa %xmm7,80-32(%rax) + + vmovdqu 0-32(%rcx),%xmm4 + vpclmulqdq $0x10,%xmm4,%xmm6,%xmm1 + vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2 + vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3 + vpclmulqdq $0x01,%xmm4,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu -16(%rax),%xmm6 + vmovdqu -16(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 0(%rax),%xmm6 + vmovdqu 0(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 16(%rax),%xmm6 + vmovdqu 16(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 32(%rax),%xmm6 + vmovdqu 32(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 80-32(%rax),%xmm6 + vpxor %xmm0,%xmm6,%xmm6 + vmovdqu 80-32(%rcx),%xmm5 + vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vpsrldq $8,%xmm1,%xmm4 + vpxor %xmm4,%xmm2,%xmm5 + vpslldq $8,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm0 + + vmovdqa poly(%rip),%xmm3 + + vpalignr $8,%xmm0,%xmm0,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm2,%xmm0 + + vpalignr $8,%xmm0,%xmm0,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm2,%xmm0 + + vpxor %xmm5,%xmm0,%xmm0 + +L$128_dec_loop2: + + + + cmpq $16,%r9 + jb L$128_dec_out + subq $16,%r9 + + vmovdqa %xmm15,%xmm2 + vpaddd one(%rip),%xmm15,%xmm15 + + vpxor 0(%r8),%xmm2,%xmm2 + vaesenc 16(%r8),%xmm2,%xmm2 + vaesenc 32(%r8),%xmm2,%xmm2 + vaesenc 48(%r8),%xmm2,%xmm2 + vaesenc 64(%r8),%xmm2,%xmm2 + vaesenc 80(%r8),%xmm2,%xmm2 + vaesenc 96(%r8),%xmm2,%xmm2 + vaesenc 112(%r8),%xmm2,%xmm2 + vaesenc 128(%r8),%xmm2,%xmm2 + vaesenc 144(%r8),%xmm2,%xmm2 + vaesenclast 160(%r8),%xmm2,%xmm2 + vpxor (%rdi),%xmm2,%xmm2 + vmovdqu %xmm2,(%rsi) + addq $16,%rdi + addq $16,%rsi + + vpxor %xmm2,%xmm0,%xmm0 + vmovdqa -32(%rcx),%xmm1 + call GFMUL + + jmp L$128_dec_loop2 + +L$128_dec_out: + vmovdqu %xmm0,(%rdx) + .byte 0xf3,0xc3 + + +.globl _aes128gcmsiv_ecb_enc_block +.private_extern _aes128gcmsiv_ecb_enc_block + +.p2align 4 +_aes128gcmsiv_ecb_enc_block: + + vmovdqa (%rdi),%xmm1 + + vpxor (%rdx),%xmm1,%xmm1 + vaesenc 16(%rdx),%xmm1,%xmm1 + vaesenc 32(%rdx),%xmm1,%xmm1 + vaesenc 48(%rdx),%xmm1,%xmm1 + vaesenc 64(%rdx),%xmm1,%xmm1 + vaesenc 80(%rdx),%xmm1,%xmm1 + vaesenc 96(%rdx),%xmm1,%xmm1 + vaesenc 112(%rdx),%xmm1,%xmm1 + vaesenc 128(%rdx),%xmm1,%xmm1 + vaesenc 144(%rdx),%xmm1,%xmm1 + vaesenclast 
160(%rdx),%xmm1,%xmm1 + + vmovdqa %xmm1,(%rsi) + + .byte 0xf3,0xc3 + + +.globl _aes256gcmsiv_aes_ks_enc_x1 +.private_extern _aes256gcmsiv_aes_ks_enc_x1 + +.p2align 4 +_aes256gcmsiv_aes_ks_enc_x1: + + vmovdqa con1(%rip),%xmm0 + vmovdqa mask(%rip),%xmm15 + vmovdqa (%rdi),%xmm8 + vmovdqa (%rcx),%xmm1 + vmovdqa 16(%rcx),%xmm3 + vpxor %xmm1,%xmm8,%xmm8 + vaesenc %xmm3,%xmm8,%xmm8 + vmovdqu %xmm1,(%rdx) + vmovdqu %xmm3,16(%rdx) + vpxor %xmm14,%xmm14,%xmm14 + + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpslldq $4,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vaesenc %xmm1,%xmm8,%xmm8 + vmovdqu %xmm1,32(%rdx) + + vpshufd $0xff,%xmm1,%xmm2 + vaesenclast %xmm14,%xmm2,%xmm2 + vpslldq $4,%xmm3,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpxor %xmm2,%xmm3,%xmm3 + vaesenc %xmm3,%xmm8,%xmm8 + vmovdqu %xmm3,48(%rdx) + + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpslldq $4,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vaesenc %xmm1,%xmm8,%xmm8 + vmovdqu %xmm1,64(%rdx) + + vpshufd $0xff,%xmm1,%xmm2 + vaesenclast %xmm14,%xmm2,%xmm2 + vpslldq $4,%xmm3,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpxor %xmm2,%xmm3,%xmm3 + vaesenc %xmm3,%xmm8,%xmm8 + vmovdqu %xmm3,80(%rdx) + + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpslldq $4,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vaesenc %xmm1,%xmm8,%xmm8 + vmovdqu %xmm1,96(%rdx) + + vpshufd $0xff,%xmm1,%xmm2 + vaesenclast %xmm14,%xmm2,%xmm2 + vpslldq $4,%xmm3,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpxor %xmm2,%xmm3,%xmm3 + vaesenc %xmm3,%xmm8,%xmm8 + vmovdqu %xmm3,112(%rdx) + + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpslldq $4,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vaesenc %xmm1,%xmm8,%xmm8 + vmovdqu %xmm1,128(%rdx) + + vpshufd $0xff,%xmm1,%xmm2 + vaesenclast %xmm14,%xmm2,%xmm2 + vpslldq $4,%xmm3,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpxor %xmm2,%xmm3,%xmm3 + vaesenc %xmm3,%xmm8,%xmm8 + vmovdqu %xmm3,144(%rdx) + + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpslldq $4,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vaesenc %xmm1,%xmm8,%xmm8 + vmovdqu %xmm1,160(%rdx) + + vpshufd $0xff,%xmm1,%xmm2 + vaesenclast %xmm14,%xmm2,%xmm2 + vpslldq $4,%xmm3,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpxor %xmm2,%xmm3,%xmm3 + vaesenc %xmm3,%xmm8,%xmm8 + vmovdqu %xmm3,176(%rdx) + + vpshufb 
%xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpslldq $4,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vaesenc %xmm1,%xmm8,%xmm8 + vmovdqu %xmm1,192(%rdx) + + vpshufd $0xff,%xmm1,%xmm2 + vaesenclast %xmm14,%xmm2,%xmm2 + vpslldq $4,%xmm3,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpxor %xmm2,%xmm3,%xmm3 + vaesenc %xmm3,%xmm8,%xmm8 + vmovdqu %xmm3,208(%rdx) + + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslldq $4,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vaesenclast %xmm1,%xmm8,%xmm8 + vmovdqu %xmm1,224(%rdx) + + vmovdqa %xmm8,(%rsi) + .byte 0xf3,0xc3 + + +.globl _aes256gcmsiv_ecb_enc_block +.private_extern _aes256gcmsiv_ecb_enc_block + +.p2align 4 +_aes256gcmsiv_ecb_enc_block: + + vmovdqa (%rdi),%xmm1 + vpxor (%rdx),%xmm1,%xmm1 + vaesenc 16(%rdx),%xmm1,%xmm1 + vaesenc 32(%rdx),%xmm1,%xmm1 + vaesenc 48(%rdx),%xmm1,%xmm1 + vaesenc 64(%rdx),%xmm1,%xmm1 + vaesenc 80(%rdx),%xmm1,%xmm1 + vaesenc 96(%rdx),%xmm1,%xmm1 + vaesenc 112(%rdx),%xmm1,%xmm1 + vaesenc 128(%rdx),%xmm1,%xmm1 + vaesenc 144(%rdx),%xmm1,%xmm1 + vaesenc 160(%rdx),%xmm1,%xmm1 + vaesenc 176(%rdx),%xmm1,%xmm1 + vaesenc 192(%rdx),%xmm1,%xmm1 + vaesenc 208(%rdx),%xmm1,%xmm1 + vaesenclast 224(%rdx),%xmm1,%xmm1 + vmovdqa %xmm1,(%rsi) + .byte 0xf3,0xc3 + + +.globl _aes256gcmsiv_enc_msg_x4 +.private_extern _aes256gcmsiv_enc_msg_x4 + +.p2align 4 +_aes256gcmsiv_enc_msg_x4: + + testq %r8,%r8 + jnz L$256_enc_msg_x4_start + .byte 0xf3,0xc3 + +L$256_enc_msg_x4_start: + movq %r8,%r10 + shrq $4,%r8 + shlq $60,%r10 + jz L$256_enc_msg_x4_start2 + addq $1,%r8 + +L$256_enc_msg_x4_start2: + movq %r8,%r10 + shlq $62,%r10 + shrq $62,%r10 + + + vmovdqa (%rdx),%xmm15 + vpor OR_MASK(%rip),%xmm15,%xmm15 + + vmovdqa four(%rip),%xmm4 + vmovdqa %xmm15,%xmm0 + vpaddd one(%rip),%xmm15,%xmm1 + vpaddd two(%rip),%xmm15,%xmm2 + vpaddd three(%rip),%xmm15,%xmm3 + + shrq $2,%r8 + je L$256_enc_msg_x4_check_remainder + + subq $64,%rsi + subq $64,%rdi + +L$256_enc_msg_x4_loop1: + addq $64,%rsi + addq $64,%rdi + + vmovdqa %xmm0,%xmm5 + vmovdqa %xmm1,%xmm6 + vmovdqa %xmm2,%xmm7 + vmovdqa %xmm3,%xmm8 + + vpxor (%rcx),%xmm5,%xmm5 + vpxor (%rcx),%xmm6,%xmm6 + vpxor (%rcx),%xmm7,%xmm7 + vpxor (%rcx),%xmm8,%xmm8 + + vmovdqu 16(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vpaddd %xmm4,%xmm0,%xmm0 + vmovdqu 32(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vpaddd %xmm4,%xmm1,%xmm1 + vmovdqu 48(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vpaddd %xmm4,%xmm2,%xmm2 + vmovdqu 64(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vpaddd %xmm4,%xmm3,%xmm3 + + vmovdqu 80(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 96(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 112(%rcx),%xmm12 
+ vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 128(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 144(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 160(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 176(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 192(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 208(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 224(%rcx),%xmm12 + vaesenclast %xmm12,%xmm5,%xmm5 + vaesenclast %xmm12,%xmm6,%xmm6 + vaesenclast %xmm12,%xmm7,%xmm7 + vaesenclast %xmm12,%xmm8,%xmm8 + + + + vpxor 0(%rdi),%xmm5,%xmm5 + vpxor 16(%rdi),%xmm6,%xmm6 + vpxor 32(%rdi),%xmm7,%xmm7 + vpxor 48(%rdi),%xmm8,%xmm8 + + subq $1,%r8 + + vmovdqu %xmm5,0(%rsi) + vmovdqu %xmm6,16(%rsi) + vmovdqu %xmm7,32(%rsi) + vmovdqu %xmm8,48(%rsi) + + jne L$256_enc_msg_x4_loop1 + + addq $64,%rsi + addq $64,%rdi + +L$256_enc_msg_x4_check_remainder: + cmpq $0,%r10 + je L$256_enc_msg_x4_out + +L$256_enc_msg_x4_loop2: + + + + vmovdqa %xmm0,%xmm5 + vpaddd one(%rip),%xmm0,%xmm0 + vpxor (%rcx),%xmm5,%xmm5 + vaesenc 16(%rcx),%xmm5,%xmm5 + vaesenc 32(%rcx),%xmm5,%xmm5 + vaesenc 48(%rcx),%xmm5,%xmm5 + vaesenc 64(%rcx),%xmm5,%xmm5 + vaesenc 80(%rcx),%xmm5,%xmm5 + vaesenc 96(%rcx),%xmm5,%xmm5 + vaesenc 112(%rcx),%xmm5,%xmm5 + vaesenc 128(%rcx),%xmm5,%xmm5 + vaesenc 144(%rcx),%xmm5,%xmm5 + vaesenc 160(%rcx),%xmm5,%xmm5 + vaesenc 176(%rcx),%xmm5,%xmm5 + vaesenc 192(%rcx),%xmm5,%xmm5 + vaesenc 208(%rcx),%xmm5,%xmm5 + vaesenclast 224(%rcx),%xmm5,%xmm5 + + + vpxor (%rdi),%xmm5,%xmm5 + + vmovdqu %xmm5,(%rsi) + + addq $16,%rdi + addq $16,%rsi + + subq $1,%r10 + jne L$256_enc_msg_x4_loop2 + +L$256_enc_msg_x4_out: + .byte 0xf3,0xc3 + + +.globl _aes256gcmsiv_enc_msg_x8 +.private_extern _aes256gcmsiv_enc_msg_x8 + +.p2align 4 +_aes256gcmsiv_enc_msg_x8: + + testq %r8,%r8 + jnz L$256_enc_msg_x8_start + .byte 0xf3,0xc3 + +L$256_enc_msg_x8_start: + + movq %rsp,%r11 + subq $16,%r11 + andq $-64,%r11 + + movq %r8,%r10 + shrq $4,%r8 + shlq $60,%r10 + jz L$256_enc_msg_x8_start2 + addq $1,%r8 + +L$256_enc_msg_x8_start2: + movq %r8,%r10 + shlq $61,%r10 + shrq $61,%r10 + + + vmovdqa (%rdx),%xmm1 + vpor OR_MASK(%rip),%xmm1,%xmm1 + + + vpaddd seven(%rip),%xmm1,%xmm0 + vmovdqa %xmm0,(%r11) + vpaddd one(%rip),%xmm1,%xmm9 + vpaddd two(%rip),%xmm1,%xmm10 + vpaddd three(%rip),%xmm1,%xmm11 + vpaddd four(%rip),%xmm1,%xmm12 + vpaddd five(%rip),%xmm1,%xmm13 + vpaddd six(%rip),%xmm1,%xmm14 + vmovdqa %xmm1,%xmm0 + + shrq $3,%r8 + jz L$256_enc_msg_x8_check_remainder + + subq $128,%rsi + subq $128,%rdi + +L$256_enc_msg_x8_loop1: + addq $128,%rsi + addq $128,%rdi + + vmovdqa %xmm0,%xmm1 + vmovdqa %xmm9,%xmm2 + vmovdqa %xmm10,%xmm3 + vmovdqa %xmm11,%xmm4 + vmovdqa %xmm12,%xmm5 + vmovdqa %xmm13,%xmm6 + vmovdqa %xmm14,%xmm7 + + vmovdqa (%r11),%xmm8 + + vpxor (%rcx),%xmm1,%xmm1 + vpxor (%rcx),%xmm2,%xmm2 + vpxor (%rcx),%xmm3,%xmm3 + vpxor (%rcx),%xmm4,%xmm4 + vpxor (%rcx),%xmm5,%xmm5 + vpxor (%rcx),%xmm6,%xmm6 + vpxor 
(%rcx),%xmm7,%xmm7 + vpxor (%rcx),%xmm8,%xmm8 + + vmovdqu 16(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vmovdqa (%r11),%xmm14 + vpaddd eight(%rip),%xmm14,%xmm14 + vmovdqa %xmm14,(%r11) + vmovdqu 32(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpsubd one(%rip),%xmm14,%xmm14 + vmovdqu 48(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm0,%xmm0 + vmovdqu 64(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm9,%xmm9 + vmovdqu 80(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm10,%xmm10 + vmovdqu 96(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm11,%xmm11 + vmovdqu 112(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm12,%xmm12 + vmovdqu 128(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm13,%xmm13 + vmovdqu 144(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vmovdqu 160(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vmovdqu 176(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vmovdqu 192(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vmovdqu 208(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + 
vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vmovdqu 224(%rcx),%xmm15 + vaesenclast %xmm15,%xmm1,%xmm1 + vaesenclast %xmm15,%xmm2,%xmm2 + vaesenclast %xmm15,%xmm3,%xmm3 + vaesenclast %xmm15,%xmm4,%xmm4 + vaesenclast %xmm15,%xmm5,%xmm5 + vaesenclast %xmm15,%xmm6,%xmm6 + vaesenclast %xmm15,%xmm7,%xmm7 + vaesenclast %xmm15,%xmm8,%xmm8 + + + + vpxor 0(%rdi),%xmm1,%xmm1 + vpxor 16(%rdi),%xmm2,%xmm2 + vpxor 32(%rdi),%xmm3,%xmm3 + vpxor 48(%rdi),%xmm4,%xmm4 + vpxor 64(%rdi),%xmm5,%xmm5 + vpxor 80(%rdi),%xmm6,%xmm6 + vpxor 96(%rdi),%xmm7,%xmm7 + vpxor 112(%rdi),%xmm8,%xmm8 + + subq $1,%r8 + + vmovdqu %xmm1,0(%rsi) + vmovdqu %xmm2,16(%rsi) + vmovdqu %xmm3,32(%rsi) + vmovdqu %xmm4,48(%rsi) + vmovdqu %xmm5,64(%rsi) + vmovdqu %xmm6,80(%rsi) + vmovdqu %xmm7,96(%rsi) + vmovdqu %xmm8,112(%rsi) + + jne L$256_enc_msg_x8_loop1 + + addq $128,%rsi + addq $128,%rdi + +L$256_enc_msg_x8_check_remainder: + cmpq $0,%r10 + je L$256_enc_msg_x8_out + +L$256_enc_msg_x8_loop2: + + + vmovdqa %xmm0,%xmm1 + vpaddd one(%rip),%xmm0,%xmm0 + + vpxor (%rcx),%xmm1,%xmm1 + vaesenc 16(%rcx),%xmm1,%xmm1 + vaesenc 32(%rcx),%xmm1,%xmm1 + vaesenc 48(%rcx),%xmm1,%xmm1 + vaesenc 64(%rcx),%xmm1,%xmm1 + vaesenc 80(%rcx),%xmm1,%xmm1 + vaesenc 96(%rcx),%xmm1,%xmm1 + vaesenc 112(%rcx),%xmm1,%xmm1 + vaesenc 128(%rcx),%xmm1,%xmm1 + vaesenc 144(%rcx),%xmm1,%xmm1 + vaesenc 160(%rcx),%xmm1,%xmm1 + vaesenc 176(%rcx),%xmm1,%xmm1 + vaesenc 192(%rcx),%xmm1,%xmm1 + vaesenc 208(%rcx),%xmm1,%xmm1 + vaesenclast 224(%rcx),%xmm1,%xmm1 + + + vpxor (%rdi),%xmm1,%xmm1 + + vmovdqu %xmm1,(%rsi) + + addq $16,%rdi + addq $16,%rsi + subq $1,%r10 + jnz L$256_enc_msg_x8_loop2 + +L$256_enc_msg_x8_out: + .byte 0xf3,0xc3 + + + +.globl _aes256gcmsiv_dec +.private_extern _aes256gcmsiv_dec + +.p2align 4 +_aes256gcmsiv_dec: + + testq $~15,%r9 + jnz L$256_dec_start + .byte 0xf3,0xc3 + +L$256_dec_start: + vzeroupper + vmovdqa (%rdx),%xmm0 + movq %rdx,%rax + + leaq 32(%rax),%rax + leaq 32(%rcx),%rcx + + + vmovdqu (%rdi,%r9,1),%xmm15 + vpor OR_MASK(%rip),%xmm15,%xmm15 + andq $~15,%r9 + + + cmpq $96,%r9 + jb L$256_dec_loop2 + + + subq $96,%r9 + vmovdqa %xmm15,%xmm7 + vpaddd one(%rip),%xmm7,%xmm8 + vpaddd two(%rip),%xmm7,%xmm9 + vpaddd one(%rip),%xmm9,%xmm10 + vpaddd two(%rip),%xmm9,%xmm11 + vpaddd one(%rip),%xmm11,%xmm12 + vpaddd two(%rip),%xmm11,%xmm15 + + vpxor (%r8),%xmm7,%xmm7 + vpxor (%r8),%xmm8,%xmm8 + vpxor (%r8),%xmm9,%xmm9 + vpxor (%r8),%xmm10,%xmm10 + vpxor (%r8),%xmm11,%xmm11 + vpxor (%r8),%xmm12,%xmm12 + + vmovdqu 16(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 32(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 48(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 64(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 80(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc 
%xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 96(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 112(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 128(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 144(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 160(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 176(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 192(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 208(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 224(%r8),%xmm4 + vaesenclast %xmm4,%xmm7,%xmm7 + vaesenclast %xmm4,%xmm8,%xmm8 + vaesenclast %xmm4,%xmm9,%xmm9 + vaesenclast %xmm4,%xmm10,%xmm10 + vaesenclast %xmm4,%xmm11,%xmm11 + vaesenclast %xmm4,%xmm12,%xmm12 + + + vpxor 0(%rdi),%xmm7,%xmm7 + vpxor 16(%rdi),%xmm8,%xmm8 + vpxor 32(%rdi),%xmm9,%xmm9 + vpxor 48(%rdi),%xmm10,%xmm10 + vpxor 64(%rdi),%xmm11,%xmm11 + vpxor 80(%rdi),%xmm12,%xmm12 + + vmovdqu %xmm7,0(%rsi) + vmovdqu %xmm8,16(%rsi) + vmovdqu %xmm9,32(%rsi) + vmovdqu %xmm10,48(%rsi) + vmovdqu %xmm11,64(%rsi) + vmovdqu %xmm12,80(%rsi) + + addq $96,%rdi + addq $96,%rsi + jmp L$256_dec_loop1 + + +.p2align 6 +L$256_dec_loop1: + cmpq $96,%r9 + jb L$256_dec_finish_96 + subq $96,%r9 + + vmovdqa %xmm12,%xmm6 + vmovdqa %xmm11,16-32(%rax) + vmovdqa %xmm10,32-32(%rax) + vmovdqa %xmm9,48-32(%rax) + vmovdqa %xmm8,64-32(%rax) + vmovdqa %xmm7,80-32(%rax) + + vmovdqa %xmm15,%xmm7 + vpaddd one(%rip),%xmm7,%xmm8 + vpaddd two(%rip),%xmm7,%xmm9 + vpaddd one(%rip),%xmm9,%xmm10 + vpaddd two(%rip),%xmm9,%xmm11 + vpaddd one(%rip),%xmm11,%xmm12 + vpaddd two(%rip),%xmm11,%xmm15 + + vmovdqa (%r8),%xmm4 + vpxor %xmm4,%xmm7,%xmm7 + vpxor %xmm4,%xmm8,%xmm8 + vpxor %xmm4,%xmm9,%xmm9 + vpxor %xmm4,%xmm10,%xmm10 + vpxor %xmm4,%xmm11,%xmm11 + vpxor %xmm4,%xmm12,%xmm12 + + vmovdqu 0-32(%rcx),%xmm4 + vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2 + vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3 + vpclmulqdq $0x01,%xmm4,%xmm6,%xmm1 + vpclmulqdq $0x10,%xmm4,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 16(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu -16(%rax),%xmm6 + vmovdqu -16(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + 
vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 32(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 0(%rax),%xmm6 + vmovdqu 0(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 48(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 16(%rax),%xmm6 + vmovdqu 16(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 64(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 32(%rax),%xmm6 + vmovdqu 32(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 80(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 96(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 112(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + + vmovdqa 80-32(%rax),%xmm6 + vpxor %xmm0,%xmm6,%xmm6 + vmovdqu 80-32(%rcx),%xmm5 + + vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 128(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + + vpsrldq $8,%xmm1,%xmm4 + vpxor %xmm4,%xmm2,%xmm5 + vpslldq $8,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm0 + + vmovdqa poly(%rip),%xmm3 + + vmovdqu 144(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 160(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 176(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + 
vmovdqu 192(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 208(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 224(%r8),%xmm6 + vpalignr $8,%xmm0,%xmm0,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm2,%xmm0 + + vpxor 0(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm7,%xmm7 + vpxor 16(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm8,%xmm8 + vpxor 32(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm9,%xmm9 + vpxor 48(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm10,%xmm10 + vpxor 64(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm11,%xmm11 + vpxor 80(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm12,%xmm12 + + vpalignr $8,%xmm0,%xmm0,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm2,%xmm0 + + vmovdqu %xmm7,0(%rsi) + vmovdqu %xmm8,16(%rsi) + vmovdqu %xmm9,32(%rsi) + vmovdqu %xmm10,48(%rsi) + vmovdqu %xmm11,64(%rsi) + vmovdqu %xmm12,80(%rsi) + + vpxor %xmm5,%xmm0,%xmm0 + + leaq 96(%rdi),%rdi + leaq 96(%rsi),%rsi + jmp L$256_dec_loop1 + +L$256_dec_finish_96: + vmovdqa %xmm12,%xmm6 + vmovdqa %xmm11,16-32(%rax) + vmovdqa %xmm10,32-32(%rax) + vmovdqa %xmm9,48-32(%rax) + vmovdqa %xmm8,64-32(%rax) + vmovdqa %xmm7,80-32(%rax) + + vmovdqu 0-32(%rcx),%xmm4 + vpclmulqdq $0x10,%xmm4,%xmm6,%xmm1 + vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2 + vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3 + vpclmulqdq $0x01,%xmm4,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu -16(%rax),%xmm6 + vmovdqu -16(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 0(%rax),%xmm6 + vmovdqu 0(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 16(%rax),%xmm6 + vmovdqu 16(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 32(%rax),%xmm6 + vmovdqu 32(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 80-32(%rax),%xmm6 + vpxor %xmm0,%xmm6,%xmm6 + vmovdqu 80-32(%rcx),%xmm5 + vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vpsrldq $8,%xmm1,%xmm4 + vpxor %xmm4,%xmm2,%xmm5 + vpslldq $8,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm0 + + vmovdqa poly(%rip),%xmm3 + + vpalignr $8,%xmm0,%xmm0,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm2,%xmm0 + + vpalignr $8,%xmm0,%xmm0,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm2,%xmm0 + + vpxor 
%xmm5,%xmm0,%xmm0 + +L$256_dec_loop2: + + + + cmpq $16,%r9 + jb L$256_dec_out + subq $16,%r9 + + vmovdqa %xmm15,%xmm2 + vpaddd one(%rip),%xmm15,%xmm15 + + vpxor 0(%r8),%xmm2,%xmm2 + vaesenc 16(%r8),%xmm2,%xmm2 + vaesenc 32(%r8),%xmm2,%xmm2 + vaesenc 48(%r8),%xmm2,%xmm2 + vaesenc 64(%r8),%xmm2,%xmm2 + vaesenc 80(%r8),%xmm2,%xmm2 + vaesenc 96(%r8),%xmm2,%xmm2 + vaesenc 112(%r8),%xmm2,%xmm2 + vaesenc 128(%r8),%xmm2,%xmm2 + vaesenc 144(%r8),%xmm2,%xmm2 + vaesenc 160(%r8),%xmm2,%xmm2 + vaesenc 176(%r8),%xmm2,%xmm2 + vaesenc 192(%r8),%xmm2,%xmm2 + vaesenc 208(%r8),%xmm2,%xmm2 + vaesenclast 224(%r8),%xmm2,%xmm2 + vpxor (%rdi),%xmm2,%xmm2 + vmovdqu %xmm2,(%rsi) + addq $16,%rdi + addq $16,%rsi + + vpxor %xmm2,%xmm0,%xmm0 + vmovdqa -32(%rcx),%xmm1 + call GFMUL + + jmp L$256_dec_loop2 + +L$256_dec_out: + vmovdqu %xmm0,(%rdx) + .byte 0xf3,0xc3 + + +.globl _aes256gcmsiv_kdf +.private_extern _aes256gcmsiv_kdf + +.p2align 4 +_aes256gcmsiv_kdf: + + + + + + vmovdqa (%rdx),%xmm1 + vmovdqa 0(%rdi),%xmm4 + vmovdqa and_mask(%rip),%xmm11 + vmovdqa one(%rip),%xmm8 + vpshufd $0x90,%xmm4,%xmm4 + vpand %xmm11,%xmm4,%xmm4 + vpaddd %xmm8,%xmm4,%xmm6 + vpaddd %xmm8,%xmm6,%xmm7 + vpaddd %xmm8,%xmm7,%xmm11 + vpaddd %xmm8,%xmm11,%xmm12 + vpaddd %xmm8,%xmm12,%xmm13 + + vpxor %xmm1,%xmm4,%xmm4 + vpxor %xmm1,%xmm6,%xmm6 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm1,%xmm11,%xmm11 + vpxor %xmm1,%xmm12,%xmm12 + vpxor %xmm1,%xmm13,%xmm13 + + vmovdqa 16(%rdx),%xmm1 + vaesenc %xmm1,%xmm4,%xmm4 + vaesenc %xmm1,%xmm6,%xmm6 + vaesenc %xmm1,%xmm7,%xmm7 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + + vmovdqa 32(%rdx),%xmm2 + vaesenc %xmm2,%xmm4,%xmm4 + vaesenc %xmm2,%xmm6,%xmm6 + vaesenc %xmm2,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + vaesenc %xmm2,%xmm13,%xmm13 + + vmovdqa 48(%rdx),%xmm1 + vaesenc %xmm1,%xmm4,%xmm4 + vaesenc %xmm1,%xmm6,%xmm6 + vaesenc %xmm1,%xmm7,%xmm7 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + + vmovdqa 64(%rdx),%xmm2 + vaesenc %xmm2,%xmm4,%xmm4 + vaesenc %xmm2,%xmm6,%xmm6 + vaesenc %xmm2,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + vaesenc %xmm2,%xmm13,%xmm13 + + vmovdqa 80(%rdx),%xmm1 + vaesenc %xmm1,%xmm4,%xmm4 + vaesenc %xmm1,%xmm6,%xmm6 + vaesenc %xmm1,%xmm7,%xmm7 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + + vmovdqa 96(%rdx),%xmm2 + vaesenc %xmm2,%xmm4,%xmm4 + vaesenc %xmm2,%xmm6,%xmm6 + vaesenc %xmm2,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + vaesenc %xmm2,%xmm13,%xmm13 + + vmovdqa 112(%rdx),%xmm1 + vaesenc %xmm1,%xmm4,%xmm4 + vaesenc %xmm1,%xmm6,%xmm6 + vaesenc %xmm1,%xmm7,%xmm7 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + + vmovdqa 128(%rdx),%xmm2 + vaesenc %xmm2,%xmm4,%xmm4 + vaesenc %xmm2,%xmm6,%xmm6 + vaesenc %xmm2,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + vaesenc %xmm2,%xmm13,%xmm13 + + vmovdqa 144(%rdx),%xmm1 + vaesenc %xmm1,%xmm4,%xmm4 + vaesenc %xmm1,%xmm6,%xmm6 + vaesenc %xmm1,%xmm7,%xmm7 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + + vmovdqa 160(%rdx),%xmm2 + vaesenc %xmm2,%xmm4,%xmm4 + vaesenc %xmm2,%xmm6,%xmm6 + vaesenc %xmm2,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + vaesenc %xmm2,%xmm13,%xmm13 + + vmovdqa 176(%rdx),%xmm1 + vaesenc %xmm1,%xmm4,%xmm4 + vaesenc %xmm1,%xmm6,%xmm6 + vaesenc %xmm1,%xmm7,%xmm7 + vaesenc 
%xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + + vmovdqa 192(%rdx),%xmm2 + vaesenc %xmm2,%xmm4,%xmm4 + vaesenc %xmm2,%xmm6,%xmm6 + vaesenc %xmm2,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + vaesenc %xmm2,%xmm13,%xmm13 + + vmovdqa 208(%rdx),%xmm1 + vaesenc %xmm1,%xmm4,%xmm4 + vaesenc %xmm1,%xmm6,%xmm6 + vaesenc %xmm1,%xmm7,%xmm7 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + + vmovdqa 224(%rdx),%xmm2 + vaesenclast %xmm2,%xmm4,%xmm4 + vaesenclast %xmm2,%xmm6,%xmm6 + vaesenclast %xmm2,%xmm7,%xmm7 + vaesenclast %xmm2,%xmm11,%xmm11 + vaesenclast %xmm2,%xmm12,%xmm12 + vaesenclast %xmm2,%xmm13,%xmm13 + + + vmovdqa %xmm4,0(%rsi) + vmovdqa %xmm6,16(%rsi) + vmovdqa %xmm7,32(%rsi) + vmovdqa %xmm11,48(%rsi) + vmovdqa %xmm12,64(%rsi) + vmovdqa %xmm13,80(%rsi) + .byte 0xf3,0xc3 + + +#endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S new file mode 100644 index 0000000000..c90447ac45 --- /dev/null +++ b/packager/third_party/boringssl/mac-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S @@ -0,0 +1,8973 @@ +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +.text + + +chacha20_poly1305_constants: + +.p2align 6 +.chacha20_consts: +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +.rol8: +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +.rol16: +.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 +.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 +.avx2_init: +.long 0,0,0,0 +.sse_inc: +.long 1,0,0,0 +.avx2_inc: +.long 2,0,0,0,2,0,0,0 +.clamp: +.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC +.quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF +.p2align 4 +.and_masks: +.byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff + + +.p2align 6 +poly_hash_ad_internal: + + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r12,%r12 + cmpq $13,%r8 + jne hash_ad_loop +poly_fast_tls_ad: + + movq (%rcx),%r10 + movq 5(%rcx),%r11 + shrq $24,%r11 
+ movq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + .byte 0xf3,0xc3 +hash_ad_loop: + + cmpq $16,%r8 + jb hash_ad_tail + addq 0(%rcx),%r10 + adcq 8+0(%rcx),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rcx),%rcx + subq $16,%r8 + jmp hash_ad_loop +hash_ad_tail: + cmpq $0,%r8 + je 1f + + xorq %r13,%r13 + xorq %r14,%r14 + xorq %r15,%r15 + addq %r8,%rcx +hash_ad_tail_loop: + shldq $8,%r13,%r14 + shlq $8,%r13 + movzbq -1(%rcx),%r15 + xorq %r15,%r13 + decq %rcx + decq %r8 + jne hash_ad_tail_loop + + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + +1: + .byte 0xf3,0xc3 + + + +.globl _chacha20_poly1305_open +.private_extern _chacha20_poly1305_open + +.p2align 6 +_chacha20_poly1305_open: + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + + + pushq %r9 + + subq $288 + 32,%rsp + + + + + + + + leaq 32(%rsp),%rbp + andq $-32,%rbp + movq %rdx,8+32(%rbp) + movq %r8,0+32(%rbp) + movq %rdx,%rbx + + movl _OPENSSL_ia32cap_P+8(%rip),%eax + andl $288,%eax + xorl $288,%eax + jz chacha20_poly1305_open_avx2 + +1: + cmpq $128,%rbx + jbe open_sse_128 + + movdqa .chacha20_consts(%rip),%xmm0 + movdqu 0(%r9),%xmm4 + movdqu 16(%r9),%xmm8 + movdqu 32(%r9),%xmm12 + movdqa %xmm12,%xmm7 + + movdqa %xmm4,48(%rbp) + movdqa %xmm8,64(%rbp) + movdqa %xmm12,96(%rbp) + movq $10,%r10 +1: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 
102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + + decq %r10 + jne 1b + + paddd .chacha20_consts(%rip),%xmm0 + paddd 48(%rbp),%xmm4 + + pand .clamp(%rip),%xmm0 + movdqa %xmm0,0(%rbp) + movdqa %xmm4,16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal +open_sse_main_loop: + cmpq $256,%rbx + jb 2f + + movdqa .chacha20_consts(%rip),%xmm0 + movdqa 48(%rbp),%xmm4 + movdqa 64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa %xmm0,%xmm3 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa 96(%rbp),%xmm15 + paddd .sse_inc(%rip),%xmm15 + movdqa %xmm15,%xmm14 + paddd .sse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd .sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .sse_inc(%rip),%xmm12 + movdqa %xmm12,96(%rbp) + movdqa %xmm13,112(%rbp) + movdqa %xmm14,128(%rbp) + movdqa %xmm15,144(%rbp) + + + + movq $4,%rcx + movq %rsi,%r8 +1: + movdqa %xmm8,80(%rbp) + movdqa .rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + addq 0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + + leaq 16(%r8),%r8 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movdqa .rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 80(%rbp),%xmm8 + imulq %r12,%r9 + addq %r10,%r15 + 
adcq %rdx,%r9 +.byte 102,15,58,15,255,4 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,12 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + movdqa %xmm8,80(%rbp) + movdqa .rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa .rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 80(%rbp),%xmm8 +.byte 102,15,58,15,255,12 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,4 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + + decq %rcx + jge 1b + addq 0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + cmpq $-6,%rcx + jg 1b + paddd .chacha20_consts(%rip),%xmm3 + paddd 48(%rbp),%xmm7 + paddd 64(%rbp),%xmm11 + paddd 144(%rbp),%xmm15 + paddd .chacha20_consts(%rip),%xmm2 + paddd 48(%rbp),%xmm6 + 
paddd 64(%rbp),%xmm10 + paddd 128(%rbp),%xmm14 + paddd .chacha20_consts(%rip),%xmm1 + paddd 48(%rbp),%xmm5 + paddd 64(%rbp),%xmm9 + paddd 112(%rbp),%xmm13 + paddd .chacha20_consts(%rip),%xmm0 + paddd 48(%rbp),%xmm4 + paddd 64(%rbp),%xmm8 + paddd 96(%rbp),%xmm12 + movdqa %xmm12,80(%rbp) + movdqu 0 + 0(%rsi),%xmm12 + pxor %xmm3,%xmm12 + movdqu %xmm12,0 + 0(%rdi) + movdqu 16 + 0(%rsi),%xmm12 + pxor %xmm7,%xmm12 + movdqu %xmm12,16 + 0(%rdi) + movdqu 32 + 0(%rsi),%xmm12 + pxor %xmm11,%xmm12 + movdqu %xmm12,32 + 0(%rdi) + movdqu 48 + 0(%rsi),%xmm12 + pxor %xmm15,%xmm12 + movdqu %xmm12,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 64(%rdi) + movdqu %xmm6,16 + 64(%rdi) + movdqu %xmm10,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 128(%rdi) + movdqu %xmm5,16 + 128(%rdi) + movdqu %xmm9,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + movdqu 0 + 192(%rsi),%xmm3 + movdqu 16 + 192(%rsi),%xmm7 + movdqu 32 + 192(%rsi),%xmm11 + movdqu 48 + 192(%rsi),%xmm15 + pxor %xmm3,%xmm0 + pxor %xmm7,%xmm4 + pxor %xmm11,%xmm8 + pxor 80(%rbp),%xmm15 + movdqu %xmm0,0 + 192(%rdi) + movdqu %xmm4,16 + 192(%rdi) + movdqu %xmm8,32 + 192(%rdi) + movdqu %xmm15,48 + 192(%rdi) + + leaq 256(%rsi),%rsi + leaq 256(%rdi),%rdi + subq $256,%rbx + jmp open_sse_main_loop +2: + + testq %rbx,%rbx + jz open_sse_finalize + cmpq $64,%rbx + ja 3f + movdqa .chacha20_consts(%rip),%xmm0 + movdqa 48(%rbp),%xmm4 + movdqa 64(%rbp),%xmm8 + movdqa 96(%rbp),%xmm12 + paddd .sse_inc(%rip),%xmm12 + movdqa %xmm12,96(%rbp) + + xorq %r8,%r8 + movq %rbx,%rcx + cmpq $16,%rcx + jb 2f +1: + addq 0(%rsi,%r8), %r10 + adcq 8+0(%rsi,%r8), %r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + subq $16,%rcx +2: + addq $16,%r8 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 
+.byte 102,69,15,58,15,228,4 + + cmpq $16,%rcx + jae 1b + cmpq $160,%r8 + jne 2b + paddd .chacha20_consts(%rip),%xmm0 + paddd 48(%rbp),%xmm4 + paddd 64(%rbp),%xmm8 + paddd 96(%rbp),%xmm12 + + jmp open_sse_tail_64_dec_loop +3: + cmpq $128,%rbx + ja 3f + movdqa .chacha20_consts(%rip),%xmm0 + movdqa 48(%rbp),%xmm4 + movdqa 64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa 96(%rbp),%xmm13 + paddd .sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .sse_inc(%rip),%xmm12 + movdqa %xmm12,96(%rbp) + movdqa %xmm13,112(%rbp) + + movq %rbx,%rcx + andq $-16,%rcx + xorq %r8,%r8 +1: + addq 0(%rsi,%r8), %r10 + adcq 8+0(%rsi,%r8), %r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +2: + addq $16,%r8 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + + cmpq %rcx,%r8 + jb 1b + cmpq $160,%r8 + jne 2b + paddd .chacha20_consts(%rip),%xmm1 + paddd 48(%rbp),%xmm5 + paddd 64(%rbp),%xmm9 + paddd 112(%rbp),%xmm13 + paddd .chacha20_consts(%rip),%xmm0 + paddd 48(%rbp),%xmm4 + paddd 64(%rbp),%xmm8 + paddd 96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor 
%xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 0(%rdi) + movdqu %xmm5,16 + 0(%rdi) + movdqu %xmm9,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + + subq $64,%rbx + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + jmp open_sse_tail_64_dec_loop +3: + cmpq $192,%rbx + ja 3f + movdqa .chacha20_consts(%rip),%xmm0 + movdqa 48(%rbp),%xmm4 + movdqa 64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa 96(%rbp),%xmm14 + paddd .sse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd .sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .sse_inc(%rip),%xmm12 + movdqa %xmm12,96(%rbp) + movdqa %xmm13,112(%rbp) + movdqa %xmm14,128(%rbp) + + movq %rbx,%rcx + movq $160,%r8 + cmpq $160,%rcx + cmovgq %r8,%rcx + andq $-16,%rcx + xorq %r8,%r8 +1: + addq 0(%rsi,%r8), %r10 + adcq 8+0(%rsi,%r8), %r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +2: + addq $16,%r8 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor 
%xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + + cmpq %rcx,%r8 + jb 1b + cmpq $160,%r8 + jne 2b + cmpq $176,%rbx + jb 1f + addq 160(%rsi),%r10 + adcq 8+160(%rsi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + cmpq $192,%rbx + jb 1f + addq 176(%rsi),%r10 + adcq 8+176(%rsi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +1: + paddd .chacha20_consts(%rip),%xmm2 + paddd 48(%rbp),%xmm6 + paddd 64(%rbp),%xmm10 + paddd 128(%rbp),%xmm14 + paddd .chacha20_consts(%rip),%xmm1 + paddd 48(%rbp),%xmm5 + paddd 64(%rbp),%xmm9 + paddd 112(%rbp),%xmm13 + paddd .chacha20_consts(%rip),%xmm0 + paddd 48(%rbp),%xmm4 + paddd 64(%rbp),%xmm8 + paddd 96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 0(%rdi) + movdqu %xmm6,16 + 0(%rdi) + movdqu %xmm10,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 64(%rdi) + movdqu %xmm5,16 + 64(%rdi) + movdqu %xmm9,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + + subq $128,%rbx + leaq 128(%rsi),%rsi + leaq 128(%rdi),%rdi + jmp open_sse_tail_64_dec_loop +3: + + movdqa .chacha20_consts(%rip),%xmm0 + movdqa 48(%rbp),%xmm4 + movdqa 64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa %xmm0,%xmm3 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + 
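# Largest open (decrypt) tail: four fresh ChaCha20 states are rebuilt from the rows cached
# on the stack (key rows at 48(%rbp) and 64(%rbp), per-block counter||nonce at 96..144(%rbp)),
# and the loop that follows interleaves one 16-byte Poly1305 absorption of ciphertext from
# (%rsi) with the ChaCha20 double rounds. The hand-encoded .byte sequences are ordinary SSE
# instructions: 102,15,58,15,... / 102,69,15,58,15,... are palignr, rotating the state rows
# between column and diagonal form, and 102,69,15,56,0,... is pshufb, used for the 16- and
# 8-bit rotations.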
movdqa 96(%rbp),%xmm15 + paddd .sse_inc(%rip),%xmm15 + movdqa %xmm15,%xmm14 + paddd .sse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd .sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .sse_inc(%rip),%xmm12 + movdqa %xmm12,96(%rbp) + movdqa %xmm13,112(%rbp) + movdqa %xmm14,128(%rbp) + movdqa %xmm15,144(%rbp) + + xorq %r8,%r8 +1: + addq 0(%rsi,%r8), %r10 + adcq 8+0(%rsi,%r8), %r11 + adcq $1,%r12 + movdqa %xmm11,80(%rbp) + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm4 + pxor %xmm11,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm4 + pxor %xmm11,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm5 + pxor %xmm11,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm5 + pxor %xmm11,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm6 + pxor %xmm11,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm6 + pxor %xmm11,%xmm6 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + movdqa 80(%rbp),%xmm11 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movdqa %xmm9,80(%rbp) + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb .rol16(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $12,%xmm9 + psrld $20,%xmm7 + pxor %xmm9,%xmm7 + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb .rol8(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $7,%xmm9 + psrld $25,%xmm7 + pxor %xmm9,%xmm7 +.byte 102,15,58,15,255,4 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,12 + movdqa 80(%rbp),%xmm9 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + movdqa %xmm11,80(%rbp) + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm4 + pxor %xmm11,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm4 + pxor %xmm11,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm5 + pxor %xmm11,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm5 + pxor %xmm11,%xmm5 +.byte 
102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm6 + pxor %xmm11,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm6 + pxor %xmm11,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + movdqa 80(%rbp),%xmm11 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + movdqa %xmm9,80(%rbp) + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb .rol16(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $12,%xmm9 + psrld $20,%xmm7 + pxor %xmm9,%xmm7 + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb .rol8(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $7,%xmm9 + psrld $25,%xmm7 + pxor %xmm9,%xmm7 +.byte 102,15,58,15,255,12 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,4 + movdqa 80(%rbp),%xmm9 + + addq $16,%r8 + cmpq $160,%r8 + jb 1b + movq %rbx,%rcx + andq $-16,%rcx +1: + addq 0(%rsi,%r8), %r10 + adcq 8+0(%rsi,%r8), %r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + addq $16,%r8 + cmpq %rcx,%r8 + jb 1b + paddd .chacha20_consts(%rip),%xmm3 + paddd 48(%rbp),%xmm7 + paddd 64(%rbp),%xmm11 + paddd 144(%rbp),%xmm15 + paddd .chacha20_consts(%rip),%xmm2 + paddd 48(%rbp),%xmm6 + paddd 64(%rbp),%xmm10 + paddd 128(%rbp),%xmm14 + paddd .chacha20_consts(%rip),%xmm1 + paddd 48(%rbp),%xmm5 + paddd 64(%rbp),%xmm9 + paddd 112(%rbp),%xmm13 + paddd .chacha20_consts(%rip),%xmm0 + paddd 48(%rbp),%xmm4 + paddd 64(%rbp),%xmm8 + paddd 96(%rbp),%xmm12 + movdqa %xmm12,80(%rbp) + movdqu 0 + 0(%rsi),%xmm12 + pxor %xmm3,%xmm12 + movdqu %xmm12,0 + 0(%rdi) + movdqu 16 + 0(%rsi),%xmm12 + pxor %xmm7,%xmm12 + movdqu %xmm12,16 + 0(%rdi) + movdqu 32 + 0(%rsi),%xmm12 + pxor %xmm11,%xmm12 + movdqu %xmm12,32 + 0(%rdi) + movdqu 48 + 0(%rsi),%xmm12 + pxor %xmm15,%xmm12 + movdqu %xmm12,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 64(%rdi) + movdqu %xmm6,16 + 64(%rdi) + movdqu %xmm10,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 128(%rdi) + movdqu %xmm5,16 + 128(%rdi) + movdqu %xmm9,32 + 
128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + + movdqa 80(%rbp),%xmm12 + subq $192,%rbx + leaq 192(%rsi),%rsi + leaq 192(%rdi),%rdi + + +open_sse_tail_64_dec_loop: + cmpq $16,%rbx + jb 1f + subq $16,%rbx + movdqu (%rsi),%xmm3 + pxor %xmm3,%xmm0 + movdqu %xmm0,(%rdi) + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + movdqa %xmm4,%xmm0 + movdqa %xmm8,%xmm4 + movdqa %xmm12,%xmm8 + jmp open_sse_tail_64_dec_loop +1: + movdqa %xmm0,%xmm1 + + +open_sse_tail_16: + testq %rbx,%rbx + jz open_sse_finalize + + + + pxor %xmm3,%xmm3 + leaq -1(%rsi,%rbx), %rsi + movq %rbx,%r8 +2: + pslldq $1,%xmm3 + pinsrb $0,(%rsi),%xmm3 + subq $1,%rsi + subq $1,%r8 + jnz 2b + +3: +.byte 102,73,15,126,221 + pextrq $1,%xmm3,%r14 + + pxor %xmm1,%xmm3 + + +2: + pextrb $0,%xmm3,(%rdi) + psrldq $1,%xmm3 + addq $1,%rdi + subq $1,%rbx + jne 2b + + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + +open_sse_finalize: + addq 32(%rbp),%r10 + adcq 8+32(%rbp),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movq %r10,%r13 + movq %r11,%r14 + movq %r12,%r15 + subq $-5,%r10 + sbbq $-1,%r11 + sbbq $3,%r12 + cmovcq %r13,%r10 + cmovcq %r14,%r11 + cmovcq %r15,%r12 + + addq 0+16(%rbp),%r10 + adcq 8+16(%rbp),%r11 + + addq $288 + 32,%rsp + + popq %r9 + + movq %r10,(%r9) + movq %r11,8(%r9) + + popq %r15 + + popq %r14 + + popq %r13 + + popq %r12 + + popq %rbx + + popq %rbp + + .byte 0xf3,0xc3 + + +open_sse_128: + movdqu .chacha20_consts(%rip),%xmm0 + movdqa %xmm0,%xmm1 + movdqa %xmm0,%xmm2 + movdqu 0(%r9),%xmm4 + movdqa %xmm4,%xmm5 + movdqa %xmm4,%xmm6 + movdqu 16(%r9),%xmm8 + movdqa %xmm8,%xmm9 + movdqa %xmm8,%xmm10 + movdqu 32(%r9),%xmm12 + movdqa %xmm12,%xmm13 + paddd .sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm14 + paddd .sse_inc(%rip),%xmm14 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa %xmm13,%xmm15 + movq $10,%r10 +1: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb 
.rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + + decq %r10 + jnz 1b + paddd .chacha20_consts(%rip),%xmm0 + paddd .chacha20_consts(%rip),%xmm1 + paddd .chacha20_consts(%rip),%xmm2 + paddd %xmm7,%xmm4 + paddd %xmm7,%xmm5 + paddd %xmm7,%xmm6 + paddd %xmm11,%xmm9 + paddd %xmm11,%xmm10 + paddd %xmm15,%xmm13 + paddd .sse_inc(%rip),%xmm15 + paddd %xmm15,%xmm14 + + pand .clamp(%rip),%xmm0 + movdqa %xmm0,0(%rbp) + movdqa %xmm4,16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal +1: + cmpq $16,%rbx + jb open_sse_tail_16 + subq $16,%rbx + addq 0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq $1,%r12 + + + movdqu 0(%rsi),%xmm3 + pxor %xmm3,%xmm1 + movdqu %xmm1,0(%rdi) + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movdqa %xmm5,%xmm1 + movdqa 
%xmm9,%xmm5 + movdqa %xmm13,%xmm9 + movdqa %xmm2,%xmm13 + movdqa %xmm6,%xmm2 + movdqa %xmm10,%xmm6 + movdqa %xmm14,%xmm10 + jmp 1b + jmp open_sse_tail_16 + + + + + + +.globl _chacha20_poly1305_seal +.private_extern _chacha20_poly1305_seal + +.p2align 6 +_chacha20_poly1305_seal: + + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + + + pushq %r9 + + subq $288 + 32,%rsp + + + + + + + + leaq 32(%rsp),%rbp + andq $-32,%rbp + movq 56(%r9),%rbx + addq %rdx,%rbx + movq %rbx,8+32(%rbp) + movq %r8,0+32(%rbp) + movq %rdx,%rbx + + movl _OPENSSL_ia32cap_P+8(%rip),%eax + andl $288,%eax + xorl $288,%eax + jz chacha20_poly1305_seal_avx2 + + cmpq $128,%rbx + jbe seal_sse_128 + + movdqa .chacha20_consts(%rip),%xmm0 + movdqu 0(%r9),%xmm4 + movdqu 16(%r9),%xmm8 + movdqu 32(%r9),%xmm12 + movdqa %xmm0,%xmm1 + movdqa %xmm0,%xmm2 + movdqa %xmm0,%xmm3 + movdqa %xmm4,%xmm5 + movdqa %xmm4,%xmm6 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm9 + movdqa %xmm8,%xmm10 + movdqa %xmm8,%xmm11 + movdqa %xmm12,%xmm15 + paddd .sse_inc(%rip),%xmm12 + movdqa %xmm12,%xmm14 + paddd .sse_inc(%rip),%xmm12 + movdqa %xmm12,%xmm13 + paddd .sse_inc(%rip),%xmm12 + + movdqa %xmm4,48(%rbp) + movdqa %xmm8,64(%rbp) + movdqa %xmm12,96(%rbp) + movdqa %xmm13,112(%rbp) + movdqa %xmm14,128(%rbp) + movdqa %xmm15,144(%rbp) + movq $10,%r10 +1: + movdqa %xmm8,80(%rbp) + movdqa .rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa .rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 80(%rbp),%xmm8 +.byte 102,15,58,15,255,4 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,12 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + movdqa %xmm8,80(%rbp) + movdqa .rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor 
%xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa .rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 80(%rbp),%xmm8 +.byte 102,15,58,15,255,12 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,4 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + + decq %r10 + jnz 1b + paddd .chacha20_consts(%rip),%xmm3 + paddd 48(%rbp),%xmm7 + paddd 64(%rbp),%xmm11 + paddd 144(%rbp),%xmm15 + paddd .chacha20_consts(%rip),%xmm2 + paddd 48(%rbp),%xmm6 + paddd 64(%rbp),%xmm10 + paddd 128(%rbp),%xmm14 + paddd .chacha20_consts(%rip),%xmm1 + paddd 48(%rbp),%xmm5 + paddd 64(%rbp),%xmm9 + paddd 112(%rbp),%xmm13 + paddd .chacha20_consts(%rip),%xmm0 + paddd 48(%rbp),%xmm4 + paddd 64(%rbp),%xmm8 + paddd 96(%rbp),%xmm12 + + + pand .clamp(%rip),%xmm3 + movdqa %xmm3,0(%rbp) + movdqa %xmm7,16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 0(%rdi) + movdqu %xmm6,16 + 0(%rdi) + movdqu %xmm10,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 64(%rdi) + movdqu %xmm5,16 + 64(%rdi) + movdqu %xmm9,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + + cmpq $192,%rbx + ja 1f + movq $128,%rcx + subq $128,%rbx + leaq 128(%rsi),%rsi + jmp seal_sse_128_seal_hash +1: + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm0 + pxor %xmm7,%xmm4 + pxor %xmm11,%xmm8 + pxor %xmm12,%xmm15 + movdqu %xmm0,0 + 128(%rdi) + movdqu %xmm4,16 + 128(%rdi) + movdqu %xmm8,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + + movq $192,%rcx + subq $192,%rbx + leaq 192(%rsi),%rsi + movq $2,%rcx 
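# Seal dispatch after the first batch: in the main loop Poly1305 runs over ciphertext that
# has already been written at (%rdi), so authentication lags keystream generation by one
# batch, and rcx/r8 count the hash-and-round iterations of that loop. Remaining lengths of
# at most 64, 128 or 192 bytes take the dedicated tail paths below; anything larger enters
# the 256-byte-per-iteration main loop at 1:.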
+ movq $8,%r8 + cmpq $64,%rbx + jbe seal_sse_tail_64 + cmpq $128,%rbx + jbe seal_sse_tail_128 + cmpq $192,%rbx + jbe seal_sse_tail_192 + +1: + movdqa .chacha20_consts(%rip),%xmm0 + movdqa 48(%rbp),%xmm4 + movdqa 64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa %xmm0,%xmm3 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa 96(%rbp),%xmm15 + paddd .sse_inc(%rip),%xmm15 + movdqa %xmm15,%xmm14 + paddd .sse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd .sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .sse_inc(%rip),%xmm12 + movdqa %xmm12,96(%rbp) + movdqa %xmm13,112(%rbp) + movdqa %xmm14,128(%rbp) + movdqa %xmm15,144(%rbp) + +2: + movdqa %xmm8,80(%rbp) + movdqa .rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movdqa .rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 80(%rbp),%xmm8 + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 +.byte 102,15,58,15,255,4 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,12 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + movdqa %xmm8,80(%rbp) + movdqa .rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + 
adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa .rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 +.byte 102,69,15,56,0,248 +.byte 102,69,15,56,0,240 +.byte 102,69,15,56,0,232 +.byte 102,69,15,56,0,224 + movdqa 80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 80(%rbp),%xmm8 +.byte 102,15,58,15,255,12 +.byte 102,69,15,58,15,219,8 +.byte 102,69,15,58,15,255,4 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + + leaq 16(%rdi),%rdi + decq %r8 + jge 2b + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi + decq %rcx + jg 2b + paddd .chacha20_consts(%rip),%xmm3 + paddd 48(%rbp),%xmm7 + paddd 64(%rbp),%xmm11 + paddd 144(%rbp),%xmm15 + paddd .chacha20_consts(%rip),%xmm2 + paddd 48(%rbp),%xmm6 + paddd 64(%rbp),%xmm10 + paddd 128(%rbp),%xmm14 + paddd .chacha20_consts(%rip),%xmm1 + paddd 48(%rbp),%xmm5 + paddd 64(%rbp),%xmm9 + paddd 112(%rbp),%xmm13 + paddd .chacha20_consts(%rip),%xmm0 + paddd 48(%rbp),%xmm4 + paddd 64(%rbp),%xmm8 + paddd 96(%rbp),%xmm12 + + movdqa %xmm14,80(%rbp) + movdqa %xmm14,80(%rbp) + movdqu 0 + 0(%rsi),%xmm14 + pxor %xmm3,%xmm14 + movdqu %xmm14,0 + 0(%rdi) + movdqu 16 + 0(%rsi),%xmm14 + pxor %xmm7,%xmm14 + movdqu %xmm14,16 + 0(%rdi) + movdqu 32 + 0(%rsi),%xmm14 + pxor %xmm11,%xmm14 + movdqu %xmm14,32 + 0(%rdi) + movdqu 48 + 0(%rsi),%xmm14 + pxor %xmm15,%xmm14 + movdqu %xmm14,48 + 0(%rdi) + + movdqa 80(%rbp),%xmm14 + movdqu 0 + 
64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 64(%rdi) + movdqu %xmm6,16 + 64(%rdi) + movdqu %xmm10,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 128(%rdi) + movdqu %xmm5,16 + 128(%rdi) + movdqu %xmm9,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + + cmpq $256,%rbx + ja 3f + + movq $192,%rcx + subq $192,%rbx + leaq 192(%rsi),%rsi + jmp seal_sse_128_seal_hash +3: + movdqu 0 + 192(%rsi),%xmm3 + movdqu 16 + 192(%rsi),%xmm7 + movdqu 32 + 192(%rsi),%xmm11 + movdqu 48 + 192(%rsi),%xmm15 + pxor %xmm3,%xmm0 + pxor %xmm7,%xmm4 + pxor %xmm11,%xmm8 + pxor %xmm12,%xmm15 + movdqu %xmm0,0 + 192(%rdi) + movdqu %xmm4,16 + 192(%rdi) + movdqu %xmm8,32 + 192(%rdi) + movdqu %xmm15,48 + 192(%rdi) + + leaq 256(%rsi),%rsi + subq $256,%rbx + movq $6,%rcx + movq $4,%r8 + cmpq $192,%rbx + jg 1b + movq %rbx,%rcx + testq %rbx,%rbx + je seal_sse_128_seal_hash + movq $6,%rcx + cmpq $64,%rbx + jg 3f + +seal_sse_tail_64: + movdqa .chacha20_consts(%rip),%xmm0 + movdqa 48(%rbp),%xmm4 + movdqa 64(%rbp),%xmm8 + movdqa 96(%rbp),%xmm12 + paddd .sse_inc(%rip),%xmm12 + movdqa %xmm12,96(%rbp) + +1: + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +2: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq 
%r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi + decq %rcx + jg 1b + decq %r8 + jge 2b + paddd .chacha20_consts(%rip),%xmm0 + paddd 48(%rbp),%xmm4 + paddd 64(%rbp),%xmm8 + paddd 96(%rbp),%xmm12 + + jmp seal_sse_128_seal +3: + cmpq $128,%rbx + jg 3f + +seal_sse_tail_128: + movdqa .chacha20_consts(%rip),%xmm0 + movdqa 48(%rbp),%xmm4 + movdqa 64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa 96(%rbp),%xmm13 + paddd .sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .sse_inc(%rip),%xmm12 + movdqa %xmm12,96(%rbp) + movdqa %xmm13,112(%rbp) + +1: + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +2: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + 
pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + + leaq 16(%rdi),%rdi + decq %rcx + jg 1b + decq %r8 + jge 2b + paddd .chacha20_consts(%rip),%xmm1 + paddd 48(%rbp),%xmm5 + paddd 64(%rbp),%xmm9 + paddd 112(%rbp),%xmm13 + paddd .chacha20_consts(%rip),%xmm0 + paddd 48(%rbp),%xmm4 + paddd 64(%rbp),%xmm8 + paddd 96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 0(%rdi) + movdqu %xmm5,16 + 0(%rdi) + movdqu %xmm9,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + + movq $64,%rcx + subq $64,%rbx + leaq 64(%rsi),%rsi + jmp seal_sse_128_seal_hash +3: + +seal_sse_tail_192: + movdqa .chacha20_consts(%rip),%xmm0 + movdqa 48(%rbp),%xmm4 + movdqa 64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa 96(%rbp),%xmm14 + paddd .sse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd .sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .sse_inc(%rip),%xmm12 + movdqa %xmm12,96(%rbp) + movdqa %xmm13,112(%rbp) + movdqa %xmm14,128(%rbp) + +1: + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +2: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 
+ pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + + leaq 16(%rdi),%rdi + decq %rcx + jg 1b + decq %r8 + jge 2b + paddd .chacha20_consts(%rip),%xmm2 + paddd 48(%rbp),%xmm6 + paddd 64(%rbp),%xmm10 + paddd 128(%rbp),%xmm14 + paddd .chacha20_consts(%rip),%xmm1 + paddd 48(%rbp),%xmm5 + paddd 64(%rbp),%xmm9 + paddd 112(%rbp),%xmm13 + paddd .chacha20_consts(%rip),%xmm0 + paddd 48(%rbp),%xmm4 + paddd 64(%rbp),%xmm8 + paddd 96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 0(%rdi) + movdqu %xmm6,16 + 0(%rdi) + movdqu %xmm10,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 64(%rdi) + movdqu %xmm5,16 + 64(%rdi) + movdqu %xmm9,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + + movq $128,%rcx + subq $128,%rbx + leaq 128(%rsi),%rsi + +seal_sse_128_seal_hash: + cmpq $16,%rcx + jb seal_sse_128_seal + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + 
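# The scalar sequence around this point is one Poly1305 step: a 16-byte block of
# already-written ciphertext at (%rdi) is added into the accumulator h = r10:r11:r12
# (the adcq $1 sets the 2^128 pad bit), h is multiplied by the clamped key r kept at
# 0(%rbp)/8(%rbp) with 64x64->128 mulq, and the bits of the product at or above 2^130
# are folded back in: since 2^130 = 5 (mod 2^130 - 5), the code adds c and 4*c, where
# c is the high part, via the andq/shrdq/shrq sequence.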
mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + subq $16,%rcx + leaq 16(%rdi),%rdi + jmp seal_sse_128_seal_hash + +seal_sse_128_seal: + cmpq $16,%rbx + jb seal_sse_tail_16 + subq $16,%rbx + + movdqu 0(%rsi),%xmm3 + pxor %xmm3,%xmm0 + movdqu %xmm0,0(%rdi) + + addq 0(%rdi),%r10 + adcq 8(%rdi),%r11 + adcq $1,%r12 + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movdqa %xmm4,%xmm0 + movdqa %xmm8,%xmm4 + movdqa %xmm12,%xmm8 + movdqa %xmm1,%xmm12 + movdqa %xmm5,%xmm1 + movdqa %xmm9,%xmm5 + movdqa %xmm13,%xmm9 + jmp seal_sse_128_seal + +seal_sse_tail_16: + testq %rbx,%rbx + jz process_blocks_of_extra_in + + movq %rbx,%r8 + movq %rbx,%rcx + leaq -1(%rsi,%rbx), %rsi + pxor %xmm15,%xmm15 +1: + pslldq $1,%xmm15 + pinsrb $0,(%rsi),%xmm15 + leaq -1(%rsi),%rsi + decq %rcx + jne 1b + + + pxor %xmm0,%xmm15 + + + movq %rbx,%rcx + movdqu %xmm15,%xmm0 +2: + pextrb $0,%xmm0,(%rdi) + psrldq $1,%xmm0 + addq $1,%rdi + subq $1,%rcx + jnz 2b + + + + + + + + + movq 288+32(%rsp),%r9 + movq 56(%r9),%r14 + movq 48(%r9),%r13 + testq %r14,%r14 + jz process_partial_block + + movq $16,%r15 + subq %rbx,%r15 + cmpq %r15,%r14 + + jge load_extra_in + movq %r14,%r15 + +load_extra_in: + + + leaq -1(%r13,%r15), %rsi + + + addq %r15,%r13 + subq %r15,%r14 + movq %r13,48(%r9) + movq %r14,56(%r9) + + + + addq %r15,%r8 + + + pxor %xmm11,%xmm11 +3: + pslldq $1,%xmm11 + pinsrb $0,(%rsi),%xmm11 + leaq -1(%rsi),%rsi + subq $1,%r15 + jnz 3b + + + + + movq %rbx,%r15 + +4: + pslldq $1,%xmm11 + subq $1,%r15 + jnz 4b + + + + + leaq .and_masks(%rip),%r15 + shlq $4,%rbx + pand -16(%r15,%rbx), %xmm15 + + + por %xmm11,%xmm15 + + + +.byte 102,77,15,126,253 + pextrq $1,%xmm15,%r14 + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + +process_blocks_of_extra_in: + + movq 288+32(%rsp),%r9 + movq 48(%r9),%rsi + movq 56(%r9),%r8 + movq %r8,%rcx + shrq $4,%r8 + +5: + jz process_extra_in_trailer + addq 0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq 
$1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rsi),%rsi + subq $1,%r8 + jmp 5b + +process_extra_in_trailer: + andq $15,%rcx + movq %rcx,%rbx + jz do_length_block + leaq -1(%rsi,%rcx), %rsi + +6: + pslldq $1,%xmm15 + pinsrb $0,(%rsi),%xmm15 + leaq -1(%rsi),%rsi + subq $1,%rcx + jnz 6b + +process_partial_block: + + leaq .and_masks(%rip),%r15 + shlq $4,%rbx + pand -16(%r15,%rbx), %xmm15 +.byte 102,77,15,126,253 + pextrq $1,%xmm15,%r14 + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + +do_length_block: + addq 32(%rbp),%r10 + adcq 8+32(%rbp),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movq %r10,%r13 + movq %r11,%r14 + movq %r12,%r15 + subq $-5,%r10 + sbbq $-1,%r11 + sbbq $3,%r12 + cmovcq %r13,%r10 + cmovcq %r14,%r11 + cmovcq %r15,%r12 + + addq 0+16(%rbp),%r10 + adcq 8+16(%rbp),%r11 + + addq $288 + 32,%rsp + + popq %r9 + + movq %r10,0(%r9) + movq %r11,8(%r9) + + popq %r15 + + popq %r14 + + popq %r13 + + popq %r12 + + popq %rbx + + popq %rbp + + .byte 0xf3,0xc3 + + +seal_sse_128: + movdqu .chacha20_consts(%rip),%xmm0 + movdqa %xmm0,%xmm1 + movdqa %xmm0,%xmm2 + movdqu 0(%r9),%xmm4 + movdqa %xmm4,%xmm5 + movdqa %xmm4,%xmm6 + movdqu 16(%r9),%xmm8 + movdqa %xmm8,%xmm9 + movdqa %xmm8,%xmm10 + movdqu 32(%r9),%xmm14 + movdqa %xmm14,%xmm12 + paddd .sse_inc(%rip),%xmm12 + movdqa %xmm12,%xmm13 + paddd .sse_inc(%rip),%xmm13 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa %xmm12,%xmm15 + movq $10,%r10 +1: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor 
%xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,4 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,4 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,12 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,4 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 +.byte 102,15,58,15,228,12 +.byte 102,69,15,58,15,192,8 +.byte 102,69,15,58,15,228,4 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 +.byte 102,15,58,15,237,12 +.byte 102,69,15,58,15,201,8 +.byte 102,69,15,58,15,237,4 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 +.byte 102,15,58,15,246,12 +.byte 102,69,15,58,15,210,8 +.byte 102,69,15,58,15,246,4 + + decq %r10 + jnz 1b + paddd .chacha20_consts(%rip),%xmm0 + paddd .chacha20_consts(%rip),%xmm1 + paddd .chacha20_consts(%rip),%xmm2 + paddd %xmm7,%xmm4 + paddd %xmm7,%xmm5 + paddd %xmm7,%xmm6 + paddd %xmm11,%xmm8 + paddd %xmm11,%xmm9 + paddd %xmm15,%xmm12 + paddd .sse_inc(%rip),%xmm15 + paddd %xmm15,%xmm13 + + pand .clamp(%rip),%xmm2 + movdqa %xmm2,0(%rbp) + movdqa %xmm6,16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal + jmp seal_sse_128_seal + + + + +.p2align 6 +chacha20_poly1305_open_avx2: + vzeroupper + vmovdqa .chacha20_consts(%rip),%ymm0 + vbroadcasti128 0(%r9),%ymm4 + vbroadcasti128 16(%r9),%ymm8 + vbroadcasti128 32(%r9),%ymm12 + vpaddd .avx2_init(%rip),%ymm12,%ymm12 + cmpq $192,%rbx + jbe open_avx2_192 + cmpq $320,%rbx + jbe open_avx2_320 + + vmovdqa %ymm4,64(%rbp) + vmovdqa %ymm8,96(%rbp) + vmovdqa %ymm12,160(%rbp) + movq $10,%r10 +1: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd 
%ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + + decq %r10 + jne 1b + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 160(%rbp),%ymm12,%ymm12 + + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand .clamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + + movq %r8,%r8 + call poly_hash_ad_internal + xorq %rcx,%rcx + +1: + addq 0(%rsi,%rcx), %r10 + adcq 8+0(%rsi,%rcx), %r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + addq $16,%rcx + cmpq $64,%rcx + jne 1b + + vpxor 0(%rsi),%ymm0,%ymm0 + vpxor 32(%rsi),%ymm4,%ymm4 + vmovdqu %ymm0,0(%rdi) + vmovdqu %ymm4,32(%rdi) + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + subq $64,%rbx +1: + + cmpq $512,%rbx + jb 3f + vmovdqa .chacha20_consts(%rip),%ymm0 + vmovdqa 64(%rbp),%ymm4 + vmovdqa 96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa .avx2_inc(%rip),%ymm12 + vpaddd 160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,256(%rbp) + vmovdqa %ymm14,224(%rbp) + vmovdqa %ymm13,192(%rbp) + vmovdqa %ymm12,160(%rbp) + + xorq %rcx,%rcx +2: + addq 0*8(%rsi,%rcx), %r10 + adcq 8+0*8(%rsi,%rcx), %r11 + adcq $1,%r12 + vmovdqa %ymm8,128(%rbp) + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + movq 8+0(%rbp),%rdx + mulxq 
%r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + addq %rax,%r15 + adcq %rdx,%r9 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + addq 2*8(%rsi,%rcx), %r10 + adcq 8+2*8(%rsi,%rcx), %r11 + adcq $1,%r12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vmovdqa %ymm8,128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,128(%rbp) + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + addq %rax,%r15 + adcq %rdx,%r9 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld 
$32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + addq 4*8(%rsi,%rcx), %r10 + adcq 8+4*8(%rsi,%rcx), %r11 + adcq $1,%r12 + + leaq 48(%rcx),%rcx + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + addq %rax,%r15 + adcq %rdx,%r9 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + cmpq $60*8,%rcx + jne 2b + vpaddd .chacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 64(%rbp),%ymm7,%ymm7 + vpaddd 96(%rbp),%ymm11,%ymm11 + vpaddd 256(%rbp),%ymm15,%ymm15 + vpaddd .chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 64(%rbp),%ymm6,%ymm6 + vpaddd 96(%rbp),%ymm10,%ymm10 + vpaddd 224(%rbp),%ymm14,%ymm14 + vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 64(%rbp),%ymm5,%ymm5 + vpaddd 96(%rbp),%ymm9,%ymm9 + vpaddd 192(%rbp),%ymm13,%ymm13 + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,128(%rbp) + addq 60*8(%rsi),%r10 + adcq 8+60*8(%rsi),%r11 + adcq $1,%r12 + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 128(%rbp),%ymm0 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq 
%rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu %ymm10,96+128(%rdi) + addq 60*8+16(%rsi),%r10 + adcq 8+60*8+16(%rsi),%r11 + adcq $1,%r12 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm4 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm8 + vpxor 0+384(%rsi),%ymm3,%ymm3 + vpxor 32+384(%rsi),%ymm0,%ymm0 + vpxor 64+384(%rsi),%ymm4,%ymm4 + vpxor 96+384(%rsi),%ymm8,%ymm8 + vmovdqu %ymm3,0+384(%rdi) + vmovdqu %ymm0,32+384(%rdi) + vmovdqu %ymm4,64+384(%rdi) + vmovdqu %ymm8,96+384(%rdi) + + leaq 512(%rsi),%rsi + leaq 512(%rdi),%rdi + subq $512,%rbx + jmp 1b +3: + testq %rbx,%rbx + vzeroupper + je open_sse_finalize +3: + cmpq $128,%rbx + ja 3f + vmovdqa .chacha20_consts(%rip),%ymm0 + vmovdqa 64(%rbp),%ymm4 + vmovdqa 96(%rbp),%ymm8 + vmovdqa .avx2_inc(%rip),%ymm12 + vpaddd 160(%rbp),%ymm12,%ymm12 + vmovdqa %ymm12,160(%rbp) + + xorq %r8,%r8 + movq %rbx,%rcx + andq $-16,%rcx + testq %rcx,%rcx + je 2f +1: + addq 0*8(%rsi,%r8), %r10 + adcq 8+0*8(%rsi,%r8), %r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +2: + addq $16,%r8 + vpaddd 
%ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + + cmpq %rcx,%r8 + jb 1b + cmpq $160,%r8 + jne 2b + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + jmp open_avx2_tail_loop +3: + cmpq $256,%rbx + ja 3f + vmovdqa .chacha20_consts(%rip),%ymm0 + vmovdqa 64(%rbp),%ymm4 + vmovdqa 96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa .avx2_inc(%rip),%ymm12 + vpaddd 160(%rbp),%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,160(%rbp) + vmovdqa %ymm13,192(%rbp) + + movq %rbx,128(%rbp) + movq %rbx,%rcx + subq $128,%rcx + shrq $4,%rcx + movq $10,%r8 + cmpq $10,%rcx + cmovgq %r8,%rcx + movq %rsi,%rbx + xorq %r8,%r8 +1: + addq 0(%rbx),%r10 + adcq 8+0(%rbx),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rbx),%rbx +2: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor 
%ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + + incq %r8 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + cmpq %rcx,%r8 + jb 1b + cmpq $10,%r8 + jne 2b + movq %rbx,%r8 + subq %rsi,%rbx + movq %rbx,%rcx + movq 128(%rbp),%rbx +1: + addq $16,%rcx + cmpq %rbx,%rcx + jg 1f + addq 0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + jmp 1b +1: + vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 64(%rbp),%ymm5,%ymm5 + vpaddd 96(%rbp),%ymm9,%ymm9 + vpaddd 192(%rbp),%ymm13,%ymm13 + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm1,%ymm1 + vpxor 64+0(%rsi),%ymm5,%ymm5 + vpxor 96+0(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm1,32+0(%rdi) + vmovdqu %ymm5,64+0(%rdi) + vmovdqu %ymm9,96+0(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + leaq 128(%rsi),%rsi + leaq 128(%rdi),%rdi + subq $128,%rbx + jmp open_avx2_tail_loop +3: + cmpq $384,%rbx + ja 3f + vmovdqa .chacha20_consts(%rip),%ymm0 
+ vmovdqa 64(%rbp),%ymm4 + vmovdqa 96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa .avx2_inc(%rip),%ymm12 + vpaddd 160(%rbp),%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,160(%rbp) + vmovdqa %ymm13,192(%rbp) + vmovdqa %ymm14,224(%rbp) + + movq %rbx,128(%rbp) + movq %rbx,%rcx + subq $256,%rcx + shrq $4,%rcx + addq $6,%rcx + movq $10,%r8 + cmpq $10,%rcx + cmovgq %r8,%rcx + movq %rsi,%rbx + xorq %r8,%r8 +1: + addq 0(%rbx),%r10 + adcq 8+0(%rbx),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rbx),%rbx +2: + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + addq 0(%rbx),%r10 + adcq 8+0(%rbx),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rbx),%rbx + incq %r8 
+ vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + + cmpq %rcx,%r8 + jb 1b + cmpq $10,%r8 + jne 2b + movq %rbx,%r8 + subq %rsi,%rbx + movq %rbx,%rcx + movq 128(%rbp),%rbx +1: + addq $16,%rcx + cmpq %rbx,%rcx + jg 1f + addq 0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + jmp 1b +1: + vpaddd .chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 64(%rbp),%ymm6,%ymm6 + vpaddd 96(%rbp),%ymm10,%ymm10 + vpaddd 224(%rbp),%ymm14,%ymm14 + vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 64(%rbp),%ymm5,%ymm5 + vpaddd 96(%rbp),%ymm9,%ymm9 + vpaddd 192(%rbp),%ymm13,%ymm13 + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm2,%ymm2 + vpxor 64+0(%rsi),%ymm6,%ymm6 + vpxor 96+0(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm2,32+0(%rdi) + vmovdqu %ymm6,64+0(%rdi) + vmovdqu %ymm10,96+0(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm1,%ymm1 + vpxor 64+128(%rsi),%ymm5,%ymm5 + vpxor 96+128(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+128(%rdi) + 
vmovdqu %ymm1,32+128(%rdi) + vmovdqu %ymm5,64+128(%rdi) + vmovdqu %ymm9,96+128(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + leaq 256(%rsi),%rsi + leaq 256(%rdi),%rdi + subq $256,%rbx + jmp open_avx2_tail_loop +3: + vmovdqa .chacha20_consts(%rip),%ymm0 + vmovdqa 64(%rbp),%ymm4 + vmovdqa 96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa .avx2_inc(%rip),%ymm12 + vpaddd 160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,256(%rbp) + vmovdqa %ymm14,224(%rbp) + vmovdqa %ymm13,192(%rbp) + vmovdqa %ymm12,160(%rbp) + + xorq %rcx,%rcx + movq %rsi,%r8 +1: + addq 0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 +2: + vmovdqa %ymm8,128(%rbp) + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + addq 0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + 
vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,128(%rbp) + addq 16(%r8),%r10 + adcq 8+16(%r8),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%r8),%r8 + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor 
%ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + incq %rcx + cmpq $4,%rcx + jl 1b + cmpq $10,%rcx + jne 2b + movq %rbx,%rcx + subq $384,%rcx + andq $-16,%rcx +1: + testq %rcx,%rcx + je 1f + addq 0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + subq $16,%rcx + jmp 1b +1: + vpaddd .chacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 64(%rbp),%ymm7,%ymm7 + vpaddd 96(%rbp),%ymm11,%ymm11 + vpaddd 256(%rbp),%ymm15,%ymm15 + vpaddd .chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 64(%rbp),%ymm6,%ymm6 + vpaddd 96(%rbp),%ymm10,%ymm10 + vpaddd 224(%rbp),%ymm14,%ymm14 + vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 64(%rbp),%ymm5,%ymm5 + vpaddd 96(%rbp),%ymm9,%ymm9 + vpaddd 192(%rbp),%ymm13,%ymm13 + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,128(%rbp) + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 128(%rbp),%ymm0 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu %ymm10,96+128(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + leaq 384(%rsi),%rsi + leaq 384(%rdi),%rdi + subq $384,%rbx +open_avx2_tail_loop: + cmpq $32,%rbx + jb open_avx2_tail + subq $32,%rbx + vpxor (%rsi),%ymm0,%ymm0 + vmovdqu %ymm0,(%rdi) + leaq 32(%rsi),%rsi + leaq 32(%rdi),%rdi + vmovdqa %ymm4,%ymm0 + vmovdqa 
%ymm8,%ymm4 + vmovdqa %ymm12,%ymm8 + jmp open_avx2_tail_loop +open_avx2_tail: + cmpq $16,%rbx + vmovdqa %xmm0,%xmm1 + jb 1f + subq $16,%rbx + + vpxor (%rsi),%xmm0,%xmm1 + vmovdqu %xmm1,(%rdi) + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + vperm2i128 $0x11,%ymm0,%ymm0,%ymm0 + vmovdqa %xmm0,%xmm1 +1: + vzeroupper + jmp open_sse_tail_16 + +open_avx2_192: + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vpaddd .avx2_inc(%rip),%ymm12,%ymm13 + vmovdqa %ymm12,%ymm11 + vmovdqa %ymm13,%ymm15 + movq $10,%r10 +1: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + + decq %r10 + jne 1b + vpaddd %ymm2,%ymm0,%ymm0 + vpaddd %ymm2,%ymm1,%ymm1 + vpaddd %ymm6,%ymm4,%ymm4 + vpaddd %ymm6,%ymm5,%ymm5 + vpaddd %ymm10,%ymm8,%ymm8 + vpaddd %ymm10,%ymm9,%ymm9 + vpaddd %ymm11,%ymm12,%ymm12 + vpaddd %ymm15,%ymm13,%ymm13 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand .clamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 +open_avx2_short: + movq %r8,%r8 + call poly_hash_ad_internal +open_avx2_hash_and_xor_loop: + cmpq $32,%rbx + jb open_avx2_short_tail_32 + subq $32,%rbx + addq 0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq $1,%r12 + movq 
0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + addq 16(%rsi),%r10 + adcq 8+16(%rsi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + vpxor (%rsi),%ymm0,%ymm0 + vmovdqu %ymm0,(%rdi) + leaq 32(%rsi),%rsi + leaq 32(%rdi),%rdi + + vmovdqa %ymm4,%ymm0 + vmovdqa %ymm8,%ymm4 + vmovdqa %ymm12,%ymm8 + vmovdqa %ymm1,%ymm12 + vmovdqa %ymm5,%ymm1 + vmovdqa %ymm9,%ymm5 + vmovdqa %ymm13,%ymm9 + vmovdqa %ymm2,%ymm13 + vmovdqa %ymm6,%ymm2 + jmp open_avx2_hash_and_xor_loop +open_avx2_short_tail_32: + cmpq $16,%rbx + vmovdqa %xmm0,%xmm1 + jb 1f + subq $16,%rbx + addq 0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + vpxor (%rsi),%xmm0,%xmm3 + vmovdqu %xmm3,(%rdi) + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + vextracti128 $1,%ymm0,%xmm1 +1: + vzeroupper + jmp open_sse_tail_16 + +open_avx2_320: + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vpaddd .avx2_inc(%rip),%ymm12,%ymm13 + vpaddd .avx2_inc(%rip),%ymm13,%ymm14 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa %ymm12,160(%rbp) + vmovdqa %ymm13,192(%rbp) + vmovdqa %ymm14,224(%rbp) + movq $10,%r10 +1: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor 
%ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + decq %r10 + jne 1b + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd .chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd %ymm7,%ymm4,%ymm4 + vpaddd %ymm7,%ymm5,%ymm5 + vpaddd %ymm7,%ymm6,%ymm6 + vpaddd %ymm11,%ymm8,%ymm8 + vpaddd %ymm11,%ymm9,%ymm9 + vpaddd %ymm11,%ymm10,%ymm10 + vpaddd 160(%rbp),%ymm12,%ymm12 + vpaddd 192(%rbp),%ymm13,%ymm13 + vpaddd 224(%rbp),%ymm14,%ymm14 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand .clamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm9 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm13 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm2 + 
vperm2i128 $0x13,%ymm10,%ymm14,%ymm6 + jmp open_avx2_short + + + + +.p2align 6 +chacha20_poly1305_seal_avx2: + vzeroupper + vmovdqa .chacha20_consts(%rip),%ymm0 + vbroadcasti128 0(%r9),%ymm4 + vbroadcasti128 16(%r9),%ymm8 + vbroadcasti128 32(%r9),%ymm12 + vpaddd .avx2_init(%rip),%ymm12,%ymm12 + cmpq $192,%rbx + jbe seal_avx2_192 + cmpq $320,%rbx + jbe seal_avx2_320 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm4,64(%rbp) + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm8,%ymm11 + vmovdqa %ymm8,96(%rbp) + vmovdqa %ymm12,%ymm15 + vpaddd .avx2_inc(%rip),%ymm15,%ymm14 + vpaddd .avx2_inc(%rip),%ymm14,%ymm13 + vpaddd .avx2_inc(%rip),%ymm13,%ymm12 + vmovdqa %ymm12,160(%rbp) + vmovdqa %ymm13,192(%rbp) + vmovdqa %ymm14,224(%rbp) + vmovdqa %ymm15,256(%rbp) + movq $10,%r10 +1: + vmovdqa %ymm8,128(%rbp) + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,128(%rbp) + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb 
%ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + decq %r10 + jnz 1b + vpaddd .chacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 64(%rbp),%ymm7,%ymm7 + vpaddd 96(%rbp),%ymm11,%ymm11 + vpaddd 256(%rbp),%ymm15,%ymm15 + vpaddd .chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 64(%rbp),%ymm6,%ymm6 + vpaddd 96(%rbp),%ymm10,%ymm10 + vpaddd 224(%rbp),%ymm14,%ymm14 + vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 64(%rbp),%ymm5,%ymm5 + vpaddd 96(%rbp),%ymm9,%ymm9 + vpaddd 192(%rbp),%ymm13,%ymm13 + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 160(%rbp),%ymm12,%ymm12 + + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vperm2i128 $0x02,%ymm3,%ymm7,%ymm15 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm3 + vpand .clamp(%rip),%ymm15,%ymm15 + vmovdqa %ymm15,0(%rbp) + movq %r8,%r8 + call poly_hash_ad_internal + + vpxor 0(%rsi),%ymm3,%ymm3 + vpxor 32(%rsi),%ymm11,%ymm11 + vmovdqu %ymm3,0(%rdi) + vmovdqu %ymm11,32(%rdi) + vperm2i128 $0x02,%ymm2,%ymm6,%ymm15 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+64(%rsi),%ymm15,%ymm15 + vpxor 32+64(%rsi),%ymm2,%ymm2 + vpxor 64+64(%rsi),%ymm6,%ymm6 + vpxor 96+64(%rsi),%ymm10,%ymm10 + vmovdqu %ymm15,0+64(%rdi) + vmovdqu %ymm2,32+64(%rdi) + vmovdqu %ymm6,64+64(%rdi) + vmovdqu %ymm10,96+64(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm15 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 
0+192(%rsi),%ymm15,%ymm15 + vpxor 32+192(%rsi),%ymm1,%ymm1 + vpxor 64+192(%rsi),%ymm5,%ymm5 + vpxor 96+192(%rsi),%ymm9,%ymm9 + vmovdqu %ymm15,0+192(%rdi) + vmovdqu %ymm1,32+192(%rdi) + vmovdqu %ymm5,64+192(%rdi) + vmovdqu %ymm9,96+192(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm15 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm15,%ymm8 + + leaq 320(%rsi),%rsi + subq $320,%rbx + movq $320,%rcx + cmpq $128,%rbx + jbe seal_avx2_hash + vpxor 0(%rsi),%ymm0,%ymm0 + vpxor 32(%rsi),%ymm4,%ymm4 + vpxor 64(%rsi),%ymm8,%ymm8 + vpxor 96(%rsi),%ymm12,%ymm12 + vmovdqu %ymm0,320(%rdi) + vmovdqu %ymm4,352(%rdi) + vmovdqu %ymm8,384(%rdi) + vmovdqu %ymm12,416(%rdi) + leaq 128(%rsi),%rsi + subq $128,%rbx + movq $8,%rcx + movq $2,%r8 + cmpq $128,%rbx + jbe seal_avx2_tail_128 + cmpq $256,%rbx + jbe seal_avx2_tail_256 + cmpq $384,%rbx + jbe seal_avx2_tail_384 + cmpq $512,%rbx + jbe seal_avx2_tail_512 + vmovdqa .chacha20_consts(%rip),%ymm0 + vmovdqa 64(%rbp),%ymm4 + vmovdqa 96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa .avx2_inc(%rip),%ymm12 + vpaddd 160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,256(%rbp) + vmovdqa %ymm14,224(%rbp) + vmovdqa %ymm13,192(%rbp) + vmovdqa %ymm12,160(%rbp) + vmovdqa %ymm8,128(%rbp) + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr 
$8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,128(%rbp) + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,128(%rbp) + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor 
%ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + + subq $16,%rdi + movq $9,%rcx + jmp 4f +1: + vmovdqa .chacha20_consts(%rip),%ymm0 + vmovdqa 64(%rbp),%ymm4 + vmovdqa 96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa .avx2_inc(%rip),%ymm12 + vpaddd 160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,256(%rbp) + vmovdqa %ymm14,224(%rbp) + vmovdqa %ymm13,192(%rbp) + vmovdqa %ymm12,160(%rbp) + + movq $10,%rcx +2: + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + vmovdqa %ymm8,128(%rbp) + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + addq %rax,%r15 + adcq %rdx,%r9 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +4: + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + addq 16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vmovdqa %ymm8,128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + 
vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,128(%rbp) + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + addq %rax,%r15 + adcq %rdx,%r9 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + addq 32(%rdi),%r10 + adcq 8+32(%rdi),%r11 + adcq $1,%r12 + + leaq 48(%rdi),%rdi + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + addq %rax,%r15 + adcq %rdx,%r9 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr 
$12,%ymm5,%ymm5,%ymm5 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + decq %rcx + jne 2b + vpaddd .chacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 64(%rbp),%ymm7,%ymm7 + vpaddd 96(%rbp),%ymm11,%ymm11 + vpaddd 256(%rbp),%ymm15,%ymm15 + vpaddd .chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 64(%rbp),%ymm6,%ymm6 + vpaddd 96(%rbp),%ymm10,%ymm10 + vpaddd 224(%rbp),%ymm14,%ymm14 + vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 64(%rbp),%ymm5,%ymm5 + vpaddd 96(%rbp),%ymm9,%ymm9 + vpaddd 192(%rbp),%ymm13,%ymm13 + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 160(%rbp),%ymm12,%ymm12 + + leaq 32(%rdi),%rdi + vmovdqa %ymm0,128(%rbp) + addq -32(%rdi),%r10 + adcq 8+-32(%rdi),%r11 + adcq $1,%r12 + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 128(%rbp),%ymm0 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu %ymm10,96+128(%rdi) + addq -16(%rdi),%r10 + adcq 8+-16(%rdi),%r11 + adcq $1,%r12 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq 
$2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm4 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm8 + vpxor 0+384(%rsi),%ymm3,%ymm3 + vpxor 32+384(%rsi),%ymm0,%ymm0 + vpxor 64+384(%rsi),%ymm4,%ymm4 + vpxor 96+384(%rsi),%ymm8,%ymm8 + vmovdqu %ymm3,0+384(%rdi) + vmovdqu %ymm0,32+384(%rdi) + vmovdqu %ymm4,64+384(%rdi) + vmovdqu %ymm8,96+384(%rdi) + + leaq 512(%rsi),%rsi + subq $512,%rbx + cmpq $512,%rbx + jg 1b + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + addq 16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + movq $10,%rcx + xorq %r8,%r8 + cmpq $128,%rbx + ja 3f + +seal_avx2_tail_128: + vmovdqa .chacha20_consts(%rip),%ymm0 + vmovdqa 64(%rbp),%ymm4 + vmovdqa 96(%rbp),%ymm8 + vmovdqa .avx2_inc(%rip),%ymm12 + vpaddd 160(%rbp),%ymm12,%ymm12 + vmovdqa %ymm12,160(%rbp) + +1: + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +2: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + 
movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + addq 16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + decq %rcx + jg 1b + decq %r8 + jge 2b + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + jmp seal_avx2_short_loop +3: + cmpq $256,%rbx + ja 3f + +seal_avx2_tail_256: + vmovdqa .chacha20_consts(%rip),%ymm0 + vmovdqa 64(%rbp),%ymm4 + vmovdqa 96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa .avx2_inc(%rip),%ymm12 + vpaddd 160(%rbp),%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,160(%rbp) + vmovdqa %ymm13,192(%rbp) + +1: + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +2: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor 
%ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + addq 16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + decq %rcx + jg 1b + decq %r8 + jge 2b + vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 64(%rbp),%ymm5,%ymm5 + vpaddd 96(%rbp),%ymm9,%ymm9 + vpaddd 192(%rbp),%ymm13,%ymm13 + vpaddd 
.chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm1,%ymm1 + vpxor 64+0(%rsi),%ymm5,%ymm5 + vpxor 96+0(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm1,32+0(%rdi) + vmovdqu %ymm5,64+0(%rdi) + vmovdqu %ymm9,96+0(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + movq $128,%rcx + leaq 128(%rsi),%rsi + subq $128,%rbx + jmp seal_avx2_hash +3: + cmpq $384,%rbx + ja seal_avx2_tail_512 + +seal_avx2_tail_384: + vmovdqa .chacha20_consts(%rip),%ymm0 + vmovdqa 64(%rbp),%ymm4 + vmovdqa 96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa .avx2_inc(%rip),%ymm12 + vpaddd 160(%rbp),%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,160(%rbp) + vmovdqa %ymm13,192(%rbp) + vmovdqa %ymm14,224(%rbp) + +1: + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +2: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq 
%r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + addq 16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + leaq 32(%rdi),%rdi + decq %rcx + jg 1b + decq %r8 + jge 2b + vpaddd .chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 64(%rbp),%ymm6,%ymm6 + vpaddd 96(%rbp),%ymm10,%ymm10 + vpaddd 224(%rbp),%ymm14,%ymm14 + vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 64(%rbp),%ymm5,%ymm5 + vpaddd 96(%rbp),%ymm9,%ymm9 + vpaddd 192(%rbp),%ymm13,%ymm13 + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 
160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm2,%ymm2 + vpxor 64+0(%rsi),%ymm6,%ymm6 + vpxor 96+0(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm2,32+0(%rdi) + vmovdqu %ymm6,64+0(%rdi) + vmovdqu %ymm10,96+0(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm1,%ymm1 + vpxor 64+128(%rsi),%ymm5,%ymm5 + vpxor 96+128(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm1,32+128(%rdi) + vmovdqu %ymm5,64+128(%rdi) + vmovdqu %ymm9,96+128(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + movq $256,%rcx + leaq 256(%rsi),%rsi + subq $256,%rbx + jmp seal_avx2_hash + +seal_avx2_tail_512: + vmovdqa .chacha20_consts(%rip),%ymm0 + vmovdqa 64(%rbp),%ymm4 + vmovdqa 96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa .avx2_inc(%rip),%ymm12 + vpaddd 160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,256(%rbp) + vmovdqa %ymm14,224(%rbp) + vmovdqa %ymm13,192(%rbp) + vmovdqa %ymm12,160(%rbp) + +1: + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +2: + vmovdqa %ymm8,128(%rbp) + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq 
%rdx,%r15 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $25,%ymm7,%ymm8 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + addq %rax,%r15 + adcq %rdx,%r9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,128(%rbp) + vmovdqa .rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + addq 16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + vmovdqa .rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vmovdqa 128(%rbp),%ymm8 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + movq 0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,128(%rbp) + vpsrld $25,%ymm7,%ymm8 + 
vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + movq 8+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + + + + + + + + + + + + addq %rax,%r15 + adcq %rdx,%r9 + + + + + + + + + + + + + + + + + + + + + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + decq %rcx + jg 1b + decq %r8 + jge 2b + vpaddd .chacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 64(%rbp),%ymm7,%ymm7 + vpaddd 96(%rbp),%ymm11,%ymm11 + vpaddd 256(%rbp),%ymm15,%ymm15 + vpaddd .chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 64(%rbp),%ymm6,%ymm6 + vpaddd 96(%rbp),%ymm10,%ymm10 + vpaddd 224(%rbp),%ymm14,%ymm14 + vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 64(%rbp),%ymm5,%ymm5 + vpaddd 96(%rbp),%ymm9,%ymm9 + vpaddd 192(%rbp),%ymm13,%ymm13 + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 64(%rbp),%ymm4,%ymm4 + vpaddd 96(%rbp),%ymm8,%ymm8 + vpaddd 160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,128(%rbp) + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 128(%rbp),%ymm0 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu %ymm10,96+128(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + movq $384,%rcx + leaq 384(%rsi),%rsi + subq $384,%rbx + jmp seal_avx2_hash + +seal_avx2_320: + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vpaddd .avx2_inc(%rip),%ymm12,%ymm13 + vpaddd .avx2_inc(%rip),%ymm13,%ymm14 + vmovdqa %ymm4,%ymm7 
+ vmovdqa %ymm8,%ymm11 + vmovdqa %ymm12,160(%rbp) + vmovdqa %ymm13,192(%rbp) + vmovdqa %ymm14,224(%rbp) + movq $10,%r10 +1: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + decq %r10 + jne 1b + vpaddd .chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd .chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd .chacha20_consts(%rip),%ymm2,%ymm2 + 
vpaddd %ymm7,%ymm4,%ymm4 + vpaddd %ymm7,%ymm5,%ymm5 + vpaddd %ymm7,%ymm6,%ymm6 + vpaddd %ymm11,%ymm8,%ymm8 + vpaddd %ymm11,%ymm9,%ymm9 + vpaddd %ymm11,%ymm10,%ymm10 + vpaddd 160(%rbp),%ymm12,%ymm12 + vpaddd 192(%rbp),%ymm13,%ymm13 + vpaddd 224(%rbp),%ymm14,%ymm14 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand .clamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm9 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm13 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm6 + jmp seal_avx2_short + +seal_avx2_192: + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vpaddd .avx2_inc(%rip),%ymm12,%ymm13 + vmovdqa %ymm12,%ymm11 + vmovdqa %ymm13,%ymm15 + movq $10,%r10 +1: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + + decq %r10 + jne 1b + vpaddd %ymm2,%ymm0,%ymm0 + vpaddd %ymm2,%ymm1,%ymm1 + vpaddd %ymm6,%ymm4,%ymm4 + vpaddd %ymm6,%ymm5,%ymm5 + vpaddd %ymm10,%ymm8,%ymm8 + vpaddd %ymm10,%ymm9,%ymm9 + vpaddd %ymm11,%ymm12,%ymm12 + vpaddd %ymm15,%ymm13,%ymm13 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand .clamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0(%rbp) + + vperm2i128 
$0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 +seal_avx2_short: + movq %r8,%r8 + call poly_hash_ad_internal + xorq %rcx,%rcx +seal_avx2_hash: + cmpq $16,%rcx + jb seal_avx2_short_loop + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + subq $16,%rcx + addq $16,%rdi + jmp seal_avx2_hash +seal_avx2_short_loop: + cmpq $32,%rbx + jb seal_avx2_short_tail + subq $32,%rbx + + vpxor (%rsi),%ymm0,%ymm0 + vmovdqu %ymm0,(%rdi) + leaq 32(%rsi),%rsi + + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + addq 16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + + vmovdqa %ymm4,%ymm0 + vmovdqa %ymm8,%ymm4 + vmovdqa %ymm12,%ymm8 + vmovdqa %ymm1,%ymm12 + vmovdqa %ymm5,%ymm1 + vmovdqa %ymm9,%ymm5 + vmovdqa %ymm13,%ymm9 + vmovdqa %ymm2,%ymm13 + vmovdqa %ymm6,%ymm2 + jmp seal_avx2_short_loop +seal_avx2_short_tail: + cmpq $16,%rbx + jb 1f + subq $16,%rbx + vpxor (%rsi),%xmm0,%xmm3 + vmovdqu %xmm3,(%rdi) + leaq 16(%rsi),%rsi + addq 0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq 
$-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r10 + adcq %r14,%r11 + adcq $0,%r12 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi + vextracti128 $1,%ymm0,%xmm0 +1: + vzeroupper + jmp seal_sse_tail_16 + +#endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/aes/aes-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aes-x86_64.S similarity index 99% rename from packager/third_party/boringssl/mac-x86_64/crypto/aes/aes-x86_64.S rename to packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aes-x86_64.S index b5d188a0f3..c7c4829fa0 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/aes/aes-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aes-x86_64.S @@ -1,4 +1,4 @@ -#if defined(__x86_64__) +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) .text .p2align 4 @@ -332,6 +332,7 @@ L$enc_compact_done: .private_extern _asm_AES_encrypt _asm_AES_encrypt: + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 @@ -340,7 +341,6 @@ _asm_AES_encrypt: pushq %r15 - movq %rsp,%r10 leaq -63(%rdx),%rcx andq $-64,%rsp subq %rsp,%rcx @@ -350,7 +350,7 @@ _asm_AES_encrypt: subq $32,%rsp movq %rsi,16(%rsp) - movq %r10,24(%rsp) + movq %rax,24(%rsp) L$enc_prologue: movq %rdx,%r15 @@ -382,13 +382,13 @@ L$enc_prologue: movl %ecx,8(%r9) movl %edx,12(%r9) - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp L$enc_epilogue: .byte 0xf3,0xc3 @@ -778,6 +778,7 @@ L$dec_compact_done: .private_extern _asm_AES_decrypt _asm_AES_decrypt: + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 @@ -786,7 +787,6 @@ _asm_AES_decrypt: pushq %r15 - movq %rsp,%r10 leaq -63(%rdx),%rcx andq $-64,%rsp subq %rsp,%rcx @@ -796,7 +796,7 @@ _asm_AES_decrypt: subq $32,%rsp movq %rsi,16(%rsp) - movq %r10,24(%rsp) + movq %rax,24(%rsp) L$dec_prologue: movq %rdx,%r15 @@ -830,13 +830,13 @@ L$dec_prologue: movl %ecx,8(%r9) movl %edx,12(%r9) - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp L$dec_epilogue: .byte 0xf3,0xc3 @@ -1312,12 +1312,12 @@ L$cbc_prologue: movl %r9d,%r9d leaq L$AES_Te(%rip),%r14 + leaq L$AES_Td(%rip),%r10 cmpq $0,%r9 - jne L$cbc_picked_te - leaq L$AES_Td(%rip),%r14 -L$cbc_picked_te: + cmoveq %r10,%r14 - movl _OPENSSL_ia32cap_P(%rip),%r10d + leaq _OPENSSL_ia32cap_P(%rip),%r10 + movl (%r10),%r10d cmpq $512,%rdx jb L$cbc_slow_prologue testq $15,%rdx diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S new file mode 100644 index 0000000000..2513904cf1 --- /dev/null +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S @@ -0,0 +1,834 @@ +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +.text + + +.p2align 5 +_aesni_ctr32_ghash_6x: + + vmovdqu 32(%r11),%xmm2 + subq $6,%rdx + vpxor %xmm4,%xmm4,%xmm4 + vmovdqu 0-128(%rcx),%xmm15 + vpaddb %xmm2,%xmm1,%xmm10 + vpaddb %xmm2,%xmm10,%xmm11 + vpaddb %xmm2,%xmm11,%xmm12 + vpaddb %xmm2,%xmm12,%xmm13 + vpaddb %xmm2,%xmm13,%xmm14 + vpxor %xmm15,%xmm1,%xmm9 
+ vmovdqu %xmm4,16+8(%rsp) + jmp L$oop6x + +.p2align 5 +L$oop6x: + addl $100663296,%ebx + jc L$handle_ctr32 + vmovdqu 0-32(%r9),%xmm3 + vpaddb %xmm2,%xmm14,%xmm1 + vpxor %xmm15,%xmm10,%xmm10 + vpxor %xmm15,%xmm11,%xmm11 + +L$resume_ctr32: + vmovdqu %xmm1,(%r8) + vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5 + vpxor %xmm15,%xmm12,%xmm12 + vmovups 16-128(%rcx),%xmm2 + vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 + + + + + + + + + + + + + + + + + + xorq %r12,%r12 + cmpq %r14,%r15 + + vaesenc %xmm2,%xmm9,%xmm9 + vmovdqu 48+8(%rsp),%xmm0 + vpxor %xmm15,%xmm13,%xmm13 + vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1 + vaesenc %xmm2,%xmm10,%xmm10 + vpxor %xmm15,%xmm14,%xmm14 + setnc %r12b + vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vmovdqu 16-32(%r9),%xmm3 + negq %r12 + vaesenc %xmm2,%xmm12,%xmm12 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5 + vpxor %xmm4,%xmm8,%xmm8 + vaesenc %xmm2,%xmm13,%xmm13 + vpxor %xmm5,%xmm1,%xmm4 + andq $0x60,%r12 + vmovups 32-128(%rcx),%xmm15 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1 + vaesenc %xmm2,%xmm14,%xmm14 + + vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2 + leaq (%r14,%r12,1),%r14 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor 16+8(%rsp),%xmm8,%xmm8 + vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3 + vmovdqu 64+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 88(%r14),%r13 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 80(%r14),%r12 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,32+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,40+8(%rsp) + vmovdqu 48-32(%r9),%xmm5 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 48-128(%rcx),%xmm15 + vpxor %xmm1,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm2,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor %xmm3,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3 + vaesenc %xmm15,%xmm11,%xmm11 + vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5 + vmovdqu 80+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqu 64-32(%r9),%xmm1 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 64-128(%rcx),%xmm15 + vpxor %xmm2,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm3,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 72(%r14),%r13 + vpxor %xmm5,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 64(%r14),%r12 + vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1 + vmovdqu 96+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,48+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,56+8(%rsp) + vpxor %xmm2,%xmm4,%xmm4 + vmovdqu 96-32(%r9),%xmm2 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 80-128(%rcx),%xmm15 + vpxor %xmm3,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 56(%r14),%r13 + vpxor %xmm1,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1 + vpxor 112+8(%rsp),%xmm8,%xmm8 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 48(%r14),%r12 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,64+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,72+8(%rsp) + vpxor %xmm3,%xmm4,%xmm4 + vmovdqu 112-32(%r9),%xmm3 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 96-128(%rcx),%xmm15 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm1,%xmm6,%xmm6 + vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 40(%r14),%r13 + vpxor %xmm2,%xmm7,%xmm7 + vpclmulqdq 
$0x00,%xmm3,%xmm8,%xmm2 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 32(%r14),%r12 + vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,80+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,88+8(%rsp) + vpxor %xmm5,%xmm6,%xmm6 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor %xmm1,%xmm6,%xmm6 + + vmovups 112-128(%rcx),%xmm15 + vpslldq $8,%xmm6,%xmm5 + vpxor %xmm2,%xmm4,%xmm4 + vmovdqu 16(%r11),%xmm3 + + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm8,%xmm7,%xmm7 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor %xmm5,%xmm4,%xmm4 + movbeq 24(%r14),%r13 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 16(%r14),%r12 + vpalignr $8,%xmm4,%xmm4,%xmm0 + vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 + movq %r13,96+8(%rsp) + vaesenc %xmm15,%xmm12,%xmm12 + movq %r12,104+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + vmovups 128-128(%rcx),%xmm1 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vmovups 144-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm10,%xmm10 + vpsrldq $8,%xmm6,%xmm6 + vaesenc %xmm1,%xmm11,%xmm11 + vpxor %xmm6,%xmm7,%xmm7 + vaesenc %xmm1,%xmm12,%xmm12 + vpxor %xmm0,%xmm4,%xmm4 + movbeq 8(%r14),%r13 + vaesenc %xmm1,%xmm13,%xmm13 + movbeq 0(%r14),%r12 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 160-128(%rcx),%xmm1 + cmpl $11,%ebp + jb L$enc_tail + + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + vmovups 176-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 192-128(%rcx),%xmm1 + je L$enc_tail + + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + vmovups 208-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 224-128(%rcx),%xmm1 + jmp L$enc_tail + +.p2align 5 +L$handle_ctr32: + vmovdqu (%r11),%xmm0 + vpshufb %xmm0,%xmm1,%xmm6 + vmovdqu 48(%r11),%xmm5 + vpaddd 64(%r11),%xmm6,%xmm10 + vpaddd %xmm5,%xmm6,%xmm11 + vmovdqu 0-32(%r9),%xmm3 + vpaddd %xmm5,%xmm10,%xmm12 + vpshufb %xmm0,%xmm10,%xmm10 + vpaddd %xmm5,%xmm11,%xmm13 + vpshufb %xmm0,%xmm11,%xmm11 + vpxor %xmm15,%xmm10,%xmm10 + vpaddd %xmm5,%xmm12,%xmm14 + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm15,%xmm11,%xmm11 + vpaddd %xmm5,%xmm13,%xmm1 + vpshufb %xmm0,%xmm13,%xmm13 + vpshufb %xmm0,%xmm14,%xmm14 + vpshufb %xmm0,%xmm1,%xmm1 + jmp L$resume_ctr32 + +.p2align 5 +L$enc_tail: + vaesenc %xmm15,%xmm9,%xmm9 + vmovdqu %xmm7,16+8(%rsp) + vpalignr $8,%xmm4,%xmm4,%xmm8 + vaesenc %xmm15,%xmm10,%xmm10 + vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 + vpxor 0(%rdi),%xmm1,%xmm2 + vaesenc %xmm15,%xmm11,%xmm11 + vpxor 16(%rdi),%xmm1,%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + vpxor 32(%rdi),%xmm1,%xmm5 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor 48(%rdi),%xmm1,%xmm6 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor 64(%rdi),%xmm1,%xmm7 + vpxor 80(%rdi),%xmm1,%xmm3 + vmovdqu (%r8),%xmm1 + + vaesenclast %xmm2,%xmm9,%xmm9 + vmovdqu 32(%r11),%xmm2 + vaesenclast %xmm0,%xmm10,%xmm10 + vpaddb %xmm2,%xmm1,%xmm0 + movq %r13,112+8(%rsp) + leaq 96(%rdi),%rdi + vaesenclast %xmm5,%xmm11,%xmm11 + vpaddb %xmm2,%xmm0,%xmm5 + movq %r12,120+8(%rsp) + leaq 96(%rsi),%rsi + vmovdqu 0-128(%rcx),%xmm15 + vaesenclast %xmm6,%xmm12,%xmm12 + vpaddb %xmm2,%xmm5,%xmm6 + 
vaesenclast %xmm7,%xmm13,%xmm13 + vpaddb %xmm2,%xmm6,%xmm7 + vaesenclast %xmm3,%xmm14,%xmm14 + vpaddb %xmm2,%xmm7,%xmm3 + + addq $0x60,%r10 + subq $0x6,%rdx + jc L$6x_done + + vmovups %xmm9,-96(%rsi) + vpxor %xmm15,%xmm1,%xmm9 + vmovups %xmm10,-80(%rsi) + vmovdqa %xmm0,%xmm10 + vmovups %xmm11,-64(%rsi) + vmovdqa %xmm5,%xmm11 + vmovups %xmm12,-48(%rsi) + vmovdqa %xmm6,%xmm12 + vmovups %xmm13,-32(%rsi) + vmovdqa %xmm7,%xmm13 + vmovups %xmm14,-16(%rsi) + vmovdqa %xmm3,%xmm14 + vmovdqu 32+8(%rsp),%xmm7 + jmp L$oop6x + +L$6x_done: + vpxor 16+8(%rsp),%xmm8,%xmm8 + vpxor %xmm4,%xmm8,%xmm8 + + .byte 0xf3,0xc3 + + +.globl _aesni_gcm_decrypt +.private_extern _aesni_gcm_decrypt + +.p2align 5 +_aesni_gcm_decrypt: + + xorq %r10,%r10 + + + + cmpq $0x60,%rdx + jb L$gcm_dec_abort + + leaq (%rsp),%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + vzeroupper + + vmovdqu (%r8),%xmm1 + addq $-128,%rsp + movl 12(%r8),%ebx + leaq L$bswap_mask(%rip),%r11 + leaq -128(%rcx),%r14 + movq $0xf80,%r15 + vmovdqu (%r9),%xmm8 + andq $-128,%rsp + vmovdqu (%r11),%xmm0 + leaq 128(%rcx),%rcx + leaq 32+32(%r9),%r9 + movl 240-128(%rcx),%ebp + vpshufb %xmm0,%xmm8,%xmm8 + + andq %r15,%r14 + andq %rsp,%r15 + subq %r14,%r15 + jc L$dec_no_key_aliasing + cmpq $768,%r15 + jnc L$dec_no_key_aliasing + subq %r15,%rsp +L$dec_no_key_aliasing: + + vmovdqu 80(%rdi),%xmm7 + leaq (%rdi),%r14 + vmovdqu 64(%rdi),%xmm4 + + + + + + + + leaq -192(%rdi,%rdx,1),%r15 + + vmovdqu 48(%rdi),%xmm5 + shrq $4,%rdx + xorq %r10,%r10 + vmovdqu 32(%rdi),%xmm6 + vpshufb %xmm0,%xmm7,%xmm7 + vmovdqu 16(%rdi),%xmm2 + vpshufb %xmm0,%xmm4,%xmm4 + vmovdqu (%rdi),%xmm3 + vpshufb %xmm0,%xmm5,%xmm5 + vmovdqu %xmm4,48(%rsp) + vpshufb %xmm0,%xmm6,%xmm6 + vmovdqu %xmm5,64(%rsp) + vpshufb %xmm0,%xmm2,%xmm2 + vmovdqu %xmm6,80(%rsp) + vpshufb %xmm0,%xmm3,%xmm3 + vmovdqu %xmm2,96(%rsp) + vmovdqu %xmm3,112(%rsp) + + call _aesni_ctr32_ghash_6x + + vmovups %xmm9,-96(%rsi) + vmovups %xmm10,-80(%rsi) + vmovups %xmm11,-64(%rsi) + vmovups %xmm12,-48(%rsi) + vmovups %xmm13,-32(%rsi) + vmovups %xmm14,-16(%rsi) + + vpshufb (%r11),%xmm8,%xmm8 + vmovdqu %xmm8,-64(%r9) + + vzeroupper + movq -48(%rax),%r15 + + movq -40(%rax),%r14 + + movq -32(%rax),%r13 + + movq -24(%rax),%r12 + + movq -16(%rax),%rbp + + movq -8(%rax),%rbx + + leaq (%rax),%rsp + +L$gcm_dec_abort: + movq %r10,%rax + .byte 0xf3,0xc3 + + + +.p2align 5 +_aesni_ctr32_6x: + + vmovdqu 0-128(%rcx),%xmm4 + vmovdqu 32(%r11),%xmm2 + leaq -1(%rbp),%r13 + vmovups 16-128(%rcx),%xmm15 + leaq 32-128(%rcx),%r12 + vpxor %xmm4,%xmm1,%xmm9 + addl $100663296,%ebx + jc L$handle_ctr32_2 + vpaddb %xmm2,%xmm1,%xmm10 + vpaddb %xmm2,%xmm10,%xmm11 + vpxor %xmm4,%xmm10,%xmm10 + vpaddb %xmm2,%xmm11,%xmm12 + vpxor %xmm4,%xmm11,%xmm11 + vpaddb %xmm2,%xmm12,%xmm13 + vpxor %xmm4,%xmm12,%xmm12 + vpaddb %xmm2,%xmm13,%xmm14 + vpxor %xmm4,%xmm13,%xmm13 + vpaddb %xmm2,%xmm14,%xmm1 + vpxor %xmm4,%xmm14,%xmm14 + jmp L$oop_ctr32 + +.p2align 4 +L$oop_ctr32: + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + vmovups (%r12),%xmm15 + leaq 16(%r12),%r12 + decl %r13d + jnz L$oop_ctr32 + + vmovdqu (%r12),%xmm3 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor 0(%rdi),%xmm3,%xmm4 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor 16(%rdi),%xmm3,%xmm5 + vaesenc %xmm15,%xmm11,%xmm11 + vpxor 32(%rdi),%xmm3,%xmm6 + vaesenc %xmm15,%xmm12,%xmm12 + vpxor 48(%rdi),%xmm3,%xmm8 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor 
64(%rdi),%xmm3,%xmm2 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor 80(%rdi),%xmm3,%xmm3 + leaq 96(%rdi),%rdi + + vaesenclast %xmm4,%xmm9,%xmm9 + vaesenclast %xmm5,%xmm10,%xmm10 + vaesenclast %xmm6,%xmm11,%xmm11 + vaesenclast %xmm8,%xmm12,%xmm12 + vaesenclast %xmm2,%xmm13,%xmm13 + vaesenclast %xmm3,%xmm14,%xmm14 + vmovups %xmm9,0(%rsi) + vmovups %xmm10,16(%rsi) + vmovups %xmm11,32(%rsi) + vmovups %xmm12,48(%rsi) + vmovups %xmm13,64(%rsi) + vmovups %xmm14,80(%rsi) + leaq 96(%rsi),%rsi + + .byte 0xf3,0xc3 +.p2align 5 +L$handle_ctr32_2: + vpshufb %xmm0,%xmm1,%xmm6 + vmovdqu 48(%r11),%xmm5 + vpaddd 64(%r11),%xmm6,%xmm10 + vpaddd %xmm5,%xmm6,%xmm11 + vpaddd %xmm5,%xmm10,%xmm12 + vpshufb %xmm0,%xmm10,%xmm10 + vpaddd %xmm5,%xmm11,%xmm13 + vpshufb %xmm0,%xmm11,%xmm11 + vpxor %xmm4,%xmm10,%xmm10 + vpaddd %xmm5,%xmm12,%xmm14 + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm4,%xmm11,%xmm11 + vpaddd %xmm5,%xmm13,%xmm1 + vpshufb %xmm0,%xmm13,%xmm13 + vpxor %xmm4,%xmm12,%xmm12 + vpshufb %xmm0,%xmm14,%xmm14 + vpxor %xmm4,%xmm13,%xmm13 + vpshufb %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm14,%xmm14 + jmp L$oop_ctr32 + + + +.globl _aesni_gcm_encrypt +.private_extern _aesni_gcm_encrypt + +.p2align 5 +_aesni_gcm_encrypt: + + xorq %r10,%r10 + + + + + cmpq $288,%rdx + jb L$gcm_enc_abort + + leaq (%rsp),%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + vzeroupper + + vmovdqu (%r8),%xmm1 + addq $-128,%rsp + movl 12(%r8),%ebx + leaq L$bswap_mask(%rip),%r11 + leaq -128(%rcx),%r14 + movq $0xf80,%r15 + leaq 128(%rcx),%rcx + vmovdqu (%r11),%xmm0 + andq $-128,%rsp + movl 240-128(%rcx),%ebp + + andq %r15,%r14 + andq %rsp,%r15 + subq %r14,%r15 + jc L$enc_no_key_aliasing + cmpq $768,%r15 + jnc L$enc_no_key_aliasing + subq %r15,%rsp +L$enc_no_key_aliasing: + + leaq (%rsi),%r14 + + + + + + + + + leaq -192(%rsi,%rdx,1),%r15 + + shrq $4,%rdx + + call _aesni_ctr32_6x + vpshufb %xmm0,%xmm9,%xmm8 + vpshufb %xmm0,%xmm10,%xmm2 + vmovdqu %xmm8,112(%rsp) + vpshufb %xmm0,%xmm11,%xmm4 + vmovdqu %xmm2,96(%rsp) + vpshufb %xmm0,%xmm12,%xmm5 + vmovdqu %xmm4,80(%rsp) + vpshufb %xmm0,%xmm13,%xmm6 + vmovdqu %xmm5,64(%rsp) + vpshufb %xmm0,%xmm14,%xmm7 + vmovdqu %xmm6,48(%rsp) + + call _aesni_ctr32_6x + + vmovdqu (%r9),%xmm8 + leaq 32+32(%r9),%r9 + subq $12,%rdx + movq $192,%r10 + vpshufb %xmm0,%xmm8,%xmm8 + + call _aesni_ctr32_ghash_6x + vmovdqu 32(%rsp),%xmm7 + vmovdqu (%r11),%xmm0 + vmovdqu 0-32(%r9),%xmm3 + vpunpckhqdq %xmm7,%xmm7,%xmm1 + vmovdqu 32-32(%r9),%xmm15 + vmovups %xmm9,-96(%rsi) + vpshufb %xmm0,%xmm9,%xmm9 + vpxor %xmm7,%xmm1,%xmm1 + vmovups %xmm10,-80(%rsi) + vpshufb %xmm0,%xmm10,%xmm10 + vmovups %xmm11,-64(%rsi) + vpshufb %xmm0,%xmm11,%xmm11 + vmovups %xmm12,-48(%rsi) + vpshufb %xmm0,%xmm12,%xmm12 + vmovups %xmm13,-32(%rsi) + vpshufb %xmm0,%xmm13,%xmm13 + vmovups %xmm14,-16(%rsi) + vpshufb %xmm0,%xmm14,%xmm14 + vmovdqu %xmm9,16(%rsp) + vmovdqu 48(%rsp),%xmm6 + vmovdqu 16-32(%r9),%xmm0 + vpunpckhqdq %xmm6,%xmm6,%xmm2 + vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5 + vpxor %xmm6,%xmm2,%xmm2 + vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 + + vmovdqu 64(%rsp),%xmm9 + vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4 + vmovdqu 48-32(%r9),%xmm3 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm9,%xmm9,%xmm5 + vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6 + vpxor %xmm9,%xmm5,%xmm5 + vpxor %xmm7,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 + vmovdqu 80-32(%r9),%xmm15 + vpxor %xmm1,%xmm2,%xmm2 + + vmovdqu 80(%rsp),%xmm1 + vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7 + vmovdqu 64-32(%r9),%xmm0 + vpxor %xmm4,%xmm7,%xmm7 + 
vpunpckhqdq %xmm1,%xmm1,%xmm4 + vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpxor %xmm6,%xmm9,%xmm9 + vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 96(%rsp),%xmm2 + vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6 + vmovdqu 96-32(%r9),%xmm3 + vpxor %xmm7,%xmm6,%xmm6 + vpunpckhqdq %xmm2,%xmm2,%xmm7 + vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpxor %xmm9,%xmm1,%xmm1 + vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4 + vmovdqu 128-32(%r9),%xmm15 + vpxor %xmm5,%xmm4,%xmm4 + + vpxor 112(%rsp),%xmm8,%xmm8 + vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5 + vmovdqu 112-32(%r9),%xmm0 + vpunpckhqdq %xmm8,%xmm8,%xmm9 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2 + vpxor %xmm8,%xmm9,%xmm9 + vpxor %xmm1,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7 + vpxor %xmm4,%xmm7,%xmm4 + + vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6 + vmovdqu 0-32(%r9),%xmm3 + vpunpckhqdq %xmm14,%xmm14,%xmm1 + vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8 + vpxor %xmm14,%xmm1,%xmm1 + vpxor %xmm5,%xmm6,%xmm5 + vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9 + vmovdqu 32-32(%r9),%xmm15 + vpxor %xmm2,%xmm8,%xmm7 + vpxor %xmm4,%xmm9,%xmm6 + + vmovdqu 16-32(%r9),%xmm0 + vpxor %xmm5,%xmm7,%xmm9 + vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4 + vpxor %xmm9,%xmm6,%xmm6 + vpunpckhqdq %xmm13,%xmm13,%xmm2 + vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14 + vpxor %xmm13,%xmm2,%xmm2 + vpslldq $8,%xmm6,%xmm9 + vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 + vpxor %xmm9,%xmm5,%xmm8 + vpsrldq $8,%xmm6,%xmm6 + vpxor %xmm6,%xmm7,%xmm7 + + vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5 + vmovdqu 48-32(%r9),%xmm3 + vpxor %xmm4,%xmm5,%xmm5 + vpunpckhqdq %xmm12,%xmm12,%xmm9 + vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13 + vpxor %xmm12,%xmm9,%xmm9 + vpxor %xmm14,%xmm13,%xmm13 + vpalignr $8,%xmm8,%xmm8,%xmm14 + vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 + vmovdqu 80-32(%r9),%xmm15 + vpxor %xmm1,%xmm2,%xmm2 + + vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4 + vmovdqu 64-32(%r9),%xmm0 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm11,%xmm11,%xmm1 + vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12 + vpxor %xmm11,%xmm1,%xmm1 + vpxor %xmm13,%xmm12,%xmm12 + vxorps 16(%rsp),%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9 + vpxor %xmm2,%xmm9,%xmm9 + + vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 + vxorps %xmm14,%xmm8,%xmm8 + + vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5 + vmovdqu 96-32(%r9),%xmm3 + vpxor %xmm4,%xmm5,%xmm5 + vpunpckhqdq %xmm10,%xmm10,%xmm2 + vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11 + vpxor %xmm10,%xmm2,%xmm2 + vpalignr $8,%xmm8,%xmm8,%xmm14 + vpxor %xmm12,%xmm11,%xmm11 + vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1 + vmovdqu 128-32(%r9),%xmm15 + vpxor %xmm9,%xmm1,%xmm1 + + vxorps %xmm7,%xmm14,%xmm14 + vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 + vxorps %xmm14,%xmm8,%xmm8 + + vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4 + vmovdqu 112-32(%r9),%xmm0 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm8,%xmm8,%xmm9 + vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10 + vpxor %xmm8,%xmm9,%xmm9 + vpxor %xmm11,%xmm10,%xmm10 + vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2 + vpxor %xmm1,%xmm2,%xmm2 + + vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5 + vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7 + vpxor %xmm4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6 + vpxor %xmm10,%xmm7,%xmm7 + vpxor %xmm2,%xmm6,%xmm6 + + vpxor %xmm5,%xmm7,%xmm4 + vpxor %xmm4,%xmm6,%xmm6 + vpslldq $8,%xmm6,%xmm1 + vmovdqu 16(%r11),%xmm3 + vpsrldq $8,%xmm6,%xmm6 + vpxor %xmm1,%xmm5,%xmm8 + vpxor %xmm6,%xmm7,%xmm7 + + vpalignr $8,%xmm8,%xmm8,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 + vpxor %xmm2,%xmm8,%xmm8 + + vpalignr $8,%xmm8,%xmm8,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 + vpxor %xmm7,%xmm2,%xmm2 + vpxor %xmm2,%xmm8,%xmm8 + 
vpshufb (%r11),%xmm8,%xmm8 + vmovdqu %xmm8,-64(%r9) + + vzeroupper + movq -48(%rax),%r15 + + movq -40(%rax),%r14 + + movq -32(%rax),%r13 + + movq -24(%rax),%r12 + + movq -16(%rax),%rbp + + movq -8(%rax),%rbx + + leaq (%rax),%rsp + +L$gcm_enc_abort: + movq %r10,%rax + .byte 0xf3,0xc3 + + +.p2align 6 +L$bswap_mask: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +L$poly: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +L$one_msb: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +L$two_lsb: +.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +L$one_lsb: +.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.p2align 6 +#endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/aes/aesni-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aesni-x86_64.S similarity index 81% rename from packager/third_party/boringssl/mac-x86_64/crypto/aes/aesni-x86_64.S rename to packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aesni-x86_64.S index 3d98fa12b6..4ee0dc49c2 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/aes/aesni-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/aesni-x86_64.S @@ -1,4 +1,4 @@ -#if defined(__x86_64__) +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) .text .globl _aesni_encrypt @@ -1031,11 +1031,10 @@ L$oop_enc1_7: .p2align 4 L$ctr32_bulk: - leaq (%rsp),%rax + leaq (%rsp),%r11 pushq %rbp subq $128,%rsp andq $-16,%rsp - leaq -8(%rax),%rbp @@ -1044,7 +1043,7 @@ L$ctr32_bulk: movdqu (%rcx),%xmm0 movl 12(%r8),%r8d pxor %xmm0,%xmm2 - movl 12(%rcx),%r11d + movl 12(%rcx),%ebp movdqa %xmm2,0(%rsp) bswapl %r8d movdqa %xmm2,%xmm3 @@ -1060,8 +1059,8 @@ L$ctr32_bulk: leaq 2(%r8),%rdx bswapl %eax bswapl %edx - xorl %r11d,%eax - xorl %r11d,%edx + xorl %ebp,%eax + xorl %ebp,%edx .byte 102,15,58,34,216,3 leaq 3(%r8),%rax movdqa %xmm3,16(%rsp) @@ -1070,25 +1069,26 @@ L$ctr32_bulk: movq %r10,%rdx leaq 4(%r8),%r10 movdqa %xmm4,32(%rsp) - xorl %r11d,%eax + xorl %ebp,%eax bswapl %r10d .byte 102,15,58,34,232,3 - xorl %r11d,%r10d + xorl %ebp,%r10d movdqa %xmm5,48(%rsp) leaq 5(%r8),%r9 movl %r10d,64+12(%rsp) bswapl %r9d leaq 6(%r8),%r10 movl 240(%rcx),%eax - xorl %r11d,%r9d + xorl %ebp,%r9d bswapl %r10d movl %r9d,80+12(%rsp) - xorl %r11d,%r10d + xorl %ebp,%r10d leaq 7(%r8),%r9 movl %r10d,96+12(%rsp) bswapl %r9d - movl _OPENSSL_ia32cap_P+4(%rip),%r10d - xorl %r11d,%r9d + leaq _OPENSSL_ia32cap_P(%rip),%r10 + movl 4(%r10),%r10d + xorl %ebp,%r9d andl $71303168,%r10d movl %r9d,112+12(%rsp) @@ -1112,7 +1112,7 @@ L$ctr32_bulk: L$ctr32_6x: shll $4,%eax movl $48,%r10d - bswapl %r11d + bswapl %ebp leaq 32(%rcx,%rax,1),%rcx subq %rax,%r10 jmp L$ctr32_loop6 @@ -1123,32 +1123,32 @@ L$ctr32_loop6: movups -48(%rcx,%r10,1),%xmm0 .byte 102,15,56,220,209 movl %r8d,%eax - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,217 .byte 0x0f,0x38,0xf1,0x44,0x24,12 leal 1(%r8),%eax .byte 102,15,56,220,225 - xorl %r11d,%eax + xorl %ebp,%eax .byte 0x0f,0x38,0xf1,0x44,0x24,28 .byte 102,15,56,220,233 leal 2(%r8),%eax - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,241 .byte 0x0f,0x38,0xf1,0x44,0x24,44 leal 3(%r8),%eax .byte 102,15,56,220,249 movups -32(%rcx,%r10,1),%xmm1 - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,208 .byte 0x0f,0x38,0xf1,0x44,0x24,60 leal 4(%r8),%eax .byte 102,15,56,220,216 - xorl %r11d,%eax + xorl %ebp,%eax .byte 0x0f,0x38,0xf1,0x44,0x24,76 
.byte 102,15,56,220,224 leal 5(%r8),%eax - xorl %r11d,%eax + xorl %ebp,%eax .byte 102,15,56,220,232 .byte 0x0f,0x38,0xf1,0x44,0x24,92 movq %r10,%rax @@ -1209,7 +1209,7 @@ L$ctr32_loop8: bswapl %r9d movups 32-128(%rcx),%xmm0 .byte 102,15,56,220,225 - xorl %r11d,%r9d + xorl %ebp,%r9d nop .byte 102,15,56,220,233 movl %r9d,0+12(%rsp) @@ -1222,7 +1222,7 @@ L$ctr32_loop8: bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 @@ -1236,7 +1236,7 @@ L$ctr32_loop8: bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 @@ -1250,7 +1250,7 @@ L$ctr32_loop8: bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 @@ -1264,7 +1264,7 @@ L$ctr32_loop8: bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 @@ -1278,7 +1278,7 @@ L$ctr32_loop8: bswapl %r9d .byte 102,15,56,220,208 .byte 102,15,56,220,216 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,224 .byte 102,15,56,220,232 @@ -1292,7 +1292,7 @@ L$ctr32_loop8: bswapl %r9d .byte 102,15,56,220,209 .byte 102,15,56,220,217 - xorl %r11d,%r9d + xorl %ebp,%r9d .byte 0x66,0x90 .byte 102,15,56,220,225 .byte 102,15,56,220,233 @@ -1307,7 +1307,7 @@ L$ctr32_loop8: .byte 102,15,56,220,208 .byte 102,15,56,220,216 .byte 102,15,56,220,224 - xorl %r11d,%r9d + xorl %ebp,%r9d movdqu 0(%rdi),%xmm10 .byte 102,15,56,220,232 movl %r9d,112+12(%rsp) @@ -1542,7 +1542,7 @@ L$ctr32_loop3: L$ctr32_done: xorps %xmm0,%xmm0 - xorl %r11d,%r11d + xorl %ebp,%ebp pxor %xmm1,%xmm1 pxor %xmm2,%xmm2 pxor %xmm3,%xmm3 @@ -1566,8 +1566,8 @@ L$ctr32_done: pxor %xmm14,%xmm14 movaps %xmm0,112(%rsp) pxor %xmm15,%xmm15 - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp + leaq (%r11),%rsp L$ctr32_epilogue: .byte 0xf3,0xc3 @@ -1576,11 +1576,10 @@ L$ctr32_epilogue: .p2align 4 _aesni_xts_encrypt: - leaq (%rsp),%rax + leaq (%rsp),%r11 pushq %rbp subq $112,%rsp andq $-16,%rsp - leaq -8(%rax),%rbp movups (%r9),%xmm2 movl 240(%r8),%eax movl 240(%rcx),%r10d @@ -1596,7 +1595,7 @@ L$oop_enc1_8: jnz L$oop_enc1_8 .byte 102,15,56,221,209 movups (%rcx),%xmm0 - movq %rcx,%r11 + movq %rcx,%rbp movl %r10d,%eax shll $4,%r10d movq %rdx,%r9 @@ -1652,9 +1651,9 @@ L$oop_enc1_8: jc L$xts_enc_short movl $16+96,%eax - leaq 32(%r11,%r10,1),%rcx + leaq 32(%rbp,%r10,1),%rcx subq %r10,%rax - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 movq %rax,%r10 leaq L$xts_magic(%rip),%r8 jmp L$xts_enc_grandloop @@ -1679,7 +1678,7 @@ L$xts_enc_grandloop: movdqa 96(%rsp),%xmm9 pxor %xmm14,%xmm6 .byte 102,15,56,220,233 - movups 32(%r11),%xmm0 + movups 32(%rbp),%xmm0 leaq 96(%rdi),%rdi pxor %xmm8,%xmm7 @@ -1688,7 +1687,7 @@ L$xts_enc_grandloop: pxor %xmm9,%xmm11 movdqa %xmm10,0(%rsp) .byte 102,15,56,220,249 - movups 48(%r11),%xmm1 + movups 48(%rbp),%xmm1 pxor %xmm9,%xmm12 .byte 102,15,56,220,208 @@ -1703,7 +1702,7 @@ L$xts_enc_grandloop: movdqa %xmm14,64(%rsp) .byte 102,15,56,220,240 .byte 102,15,56,220,248 - movups 64(%r11),%xmm0 + movups 64(%rbp),%xmm0 movdqa %xmm8,80(%rsp) pshufd $0x5f,%xmm15,%xmm9 jmp L$xts_enc_loop6 @@ -1735,7 +1734,7 @@ L$xts_enc_loop6: psrad $31,%xmm14 .byte 102,15,56,220,217 pand %xmm8,%xmm14 - movups (%r11),%xmm10 + movups (%rbp),%xmm10 .byte 102,15,56,220,225 .byte 102,15,56,220,233 
.byte 102,15,56,220,241 @@ -1803,10 +1802,10 @@ L$xts_enc_loop6: .byte 102,15,56,220,225 .byte 102,15,56,220,233 pxor %xmm0,%xmm15 - movups (%r11),%xmm0 + movups (%rbp),%xmm0 .byte 102,15,56,220,241 .byte 102,15,56,220,249 - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 pxor %xmm15,%xmm14 .byte 102,15,56,221,84,36,0 @@ -1833,7 +1832,7 @@ L$xts_enc_loop6: movl $16+96,%eax subl %r10d,%eax - movq %r11,%rcx + movq %rbp,%rcx shrl $4,%eax L$xts_enc_short: @@ -1989,7 +1988,7 @@ L$xts_enc_steal: jnz L$xts_enc_steal subq %r9,%rsi - movq %r11,%rcx + movq %rbp,%rcx movl %r10d,%eax movups -16(%rsi),%xmm2 @@ -2032,8 +2031,8 @@ L$xts_enc_ret: movaps %xmm0,96(%rsp) pxor %xmm14,%xmm14 pxor %xmm15,%xmm15 - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp + leaq (%r11),%rsp L$xts_enc_epilogue: .byte 0xf3,0xc3 @@ -2042,11 +2041,10 @@ L$xts_enc_epilogue: .p2align 4 _aesni_xts_decrypt: - leaq (%rsp),%rax + leaq (%rsp),%r11 pushq %rbp subq $112,%rsp andq $-16,%rsp - leaq -8(%rax),%rbp movups (%r9),%xmm2 movl 240(%r8),%eax movl 240(%rcx),%r10d @@ -2068,7 +2066,7 @@ L$oop_enc1_11: subq %rax,%rdx movups (%rcx),%xmm0 - movq %rcx,%r11 + movq %rcx,%rbp movl %r10d,%eax shll $4,%r10d movq %rdx,%r9 @@ -2124,9 +2122,9 @@ L$oop_enc1_11: jc L$xts_dec_short movl $16+96,%eax - leaq 32(%r11,%r10,1),%rcx + leaq 32(%rbp,%r10,1),%rcx subq %r10,%rax - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 movq %rax,%r10 leaq L$xts_magic(%rip),%r8 jmp L$xts_dec_grandloop @@ -2151,7 +2149,7 @@ L$xts_dec_grandloop: movdqa 96(%rsp),%xmm9 pxor %xmm14,%xmm6 .byte 102,15,56,222,233 - movups 32(%r11),%xmm0 + movups 32(%rbp),%xmm0 leaq 96(%rdi),%rdi pxor %xmm8,%xmm7 @@ -2160,7 +2158,7 @@ L$xts_dec_grandloop: pxor %xmm9,%xmm11 movdqa %xmm10,0(%rsp) .byte 102,15,56,222,249 - movups 48(%r11),%xmm1 + movups 48(%rbp),%xmm1 pxor %xmm9,%xmm12 .byte 102,15,56,222,208 @@ -2175,7 +2173,7 @@ L$xts_dec_grandloop: movdqa %xmm14,64(%rsp) .byte 102,15,56,222,240 .byte 102,15,56,222,248 - movups 64(%r11),%xmm0 + movups 64(%rbp),%xmm0 movdqa %xmm8,80(%rsp) pshufd $0x5f,%xmm15,%xmm9 jmp L$xts_dec_loop6 @@ -2207,7 +2205,7 @@ L$xts_dec_loop6: psrad $31,%xmm14 .byte 102,15,56,222,217 pand %xmm8,%xmm14 - movups (%r11),%xmm10 + movups (%rbp),%xmm10 .byte 102,15,56,222,225 .byte 102,15,56,222,233 .byte 102,15,56,222,241 @@ -2275,10 +2273,10 @@ L$xts_dec_loop6: .byte 102,15,56,222,225 .byte 102,15,56,222,233 pxor %xmm0,%xmm15 - movups (%r11),%xmm0 + movups (%rbp),%xmm0 .byte 102,15,56,222,241 .byte 102,15,56,222,249 - movups 16(%r11),%xmm1 + movups 16(%rbp),%xmm1 pxor %xmm15,%xmm14 .byte 102,15,56,223,84,36,0 @@ -2305,7 +2303,7 @@ L$xts_dec_loop6: movl $16+96,%eax subl %r10d,%eax - movq %r11,%rcx + movq %rbp,%rcx shrl $4,%eax L$xts_dec_short: @@ -2462,7 +2460,7 @@ L$xts_dec_done: jz L$xts_dec_ret L$xts_dec_done2: movq %r9,%rdx - movq %r11,%rcx + movq %rbp,%rcx movl %r10d,%eax movups (%rdi),%xmm2 @@ -2492,7 +2490,7 @@ L$xts_dec_steal: jnz L$xts_dec_steal subq %r9,%rsi - movq %r11,%rcx + movq %rbp,%rcx movl %r10d,%eax movups (%rsi),%xmm2 @@ -2535,11 +2533,827 @@ L$xts_dec_ret: movaps %xmm0,96(%rsp) pxor %xmm14,%xmm14 pxor %xmm15,%xmm15 - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp + leaq (%r11),%rsp L$xts_dec_epilogue: .byte 0xf3,0xc3 +.globl _aesni_ocb_encrypt +.private_extern _aesni_ocb_encrypt + +.p2align 5 +_aesni_ocb_encrypt: + leaq (%rsp),%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + movq 8(%rax),%rbx + movq 8+8(%rax),%rbp + + movl 240(%rcx),%r10d + movq %rcx,%r11 + shll $4,%r10d + movups (%rcx),%xmm9 + movups 16(%rcx,%r10,1),%xmm1 + + 
movdqu (%r9),%xmm15 + pxor %xmm1,%xmm9 + pxor %xmm1,%xmm15 + + movl $16+32,%eax + leaq 32(%r11,%r10,1),%rcx + movups 16(%r11),%xmm1 + subq %r10,%rax + movq %rax,%r10 + + movdqu (%rbx),%xmm10 + movdqu (%rbp),%xmm8 + + testq $1,%r8 + jnz L$ocb_enc_odd + + bsfq %r8,%r12 + addq $1,%r8 + shlq $4,%r12 + movdqu (%rbx,%r12,1),%xmm7 + movdqu (%rdi),%xmm2 + leaq 16(%rdi),%rdi + + call __ocb_encrypt1 + + movdqa %xmm7,%xmm15 + movups %xmm2,(%rsi) + leaq 16(%rsi),%rsi + subq $1,%rdx + jz L$ocb_enc_done + +L$ocb_enc_odd: + leaq 1(%r8),%r12 + leaq 3(%r8),%r13 + leaq 5(%r8),%r14 + leaq 6(%r8),%r8 + bsfq %r12,%r12 + bsfq %r13,%r13 + bsfq %r14,%r14 + shlq $4,%r12 + shlq $4,%r13 + shlq $4,%r14 + + subq $6,%rdx + jc L$ocb_enc_short + jmp L$ocb_enc_grandloop + +.p2align 5 +L$ocb_enc_grandloop: + movdqu 0(%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqu 32(%rdi),%xmm4 + movdqu 48(%rdi),%xmm5 + movdqu 64(%rdi),%xmm6 + movdqu 80(%rdi),%xmm7 + leaq 96(%rdi),%rdi + + call __ocb_encrypt6 + + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + movups %xmm6,64(%rsi) + movups %xmm7,80(%rsi) + leaq 96(%rsi),%rsi + subq $6,%rdx + jnc L$ocb_enc_grandloop + +L$ocb_enc_short: + addq $6,%rdx + jz L$ocb_enc_done + + movdqu 0(%rdi),%xmm2 + cmpq $2,%rdx + jb L$ocb_enc_one + movdqu 16(%rdi),%xmm3 + je L$ocb_enc_two + + movdqu 32(%rdi),%xmm4 + cmpq $4,%rdx + jb L$ocb_enc_three + movdqu 48(%rdi),%xmm5 + je L$ocb_enc_four + + movdqu 64(%rdi),%xmm6 + pxor %xmm7,%xmm7 + + call __ocb_encrypt6 + + movdqa %xmm14,%xmm15 + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + movups %xmm6,64(%rsi) + + jmp L$ocb_enc_done + +.p2align 4 +L$ocb_enc_one: + movdqa %xmm10,%xmm7 + + call __ocb_encrypt1 + + movdqa %xmm7,%xmm15 + movups %xmm2,0(%rsi) + jmp L$ocb_enc_done + +.p2align 4 +L$ocb_enc_two: + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + + call __ocb_encrypt4 + + movdqa %xmm11,%xmm15 + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) + + jmp L$ocb_enc_done + +.p2align 4 +L$ocb_enc_three: + pxor %xmm5,%xmm5 + + call __ocb_encrypt4 + + movdqa %xmm12,%xmm15 + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + + jmp L$ocb_enc_done + +.p2align 4 +L$ocb_enc_four: + call __ocb_encrypt4 + + movdqa %xmm13,%xmm15 + movups %xmm2,0(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + +L$ocb_enc_done: + pxor %xmm0,%xmm15 + movdqu %xmm8,(%rbp) + movdqu %xmm15,(%r9) + + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 + pxor %xmm10,%xmm10 + pxor %xmm11,%xmm11 + pxor %xmm12,%xmm12 + pxor %xmm13,%xmm13 + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 + leaq 40(%rsp),%rax + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + leaq (%rax),%rsp +L$ocb_enc_epilogue: + .byte 0xf3,0xc3 + + + +.p2align 5 +__ocb_encrypt6: + pxor %xmm9,%xmm15 + movdqu (%rbx,%r12,1),%xmm11 + movdqa %xmm10,%xmm12 + movdqu (%rbx,%r13,1),%xmm13 + movdqa %xmm10,%xmm14 + pxor %xmm15,%xmm10 + movdqu (%rbx,%r14,1),%xmm15 + pxor %xmm10,%xmm11 + pxor %xmm2,%xmm8 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm12 + pxor %xmm3,%xmm8 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm13 + pxor %xmm4,%xmm8 + pxor %xmm12,%xmm4 + pxor %xmm13,%xmm14 + pxor %xmm5,%xmm8 + pxor %xmm13,%xmm5 + pxor %xmm14,%xmm15 + pxor %xmm6,%xmm8 + pxor %xmm14,%xmm6 + pxor %xmm7,%xmm8 + pxor %xmm15,%xmm7 + movups 32(%r11),%xmm0 + + leaq 
1(%r8),%r12 + leaq 3(%r8),%r13 + leaq 5(%r8),%r14 + addq $6,%r8 + pxor %xmm9,%xmm10 + bsfq %r12,%r12 + bsfq %r13,%r13 + bsfq %r14,%r14 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + pxor %xmm9,%xmm11 + pxor %xmm9,%xmm12 +.byte 102,15,56,220,241 + pxor %xmm9,%xmm13 + pxor %xmm9,%xmm14 +.byte 102,15,56,220,249 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm15 + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 + movups 64(%r11),%xmm0 + shlq $4,%r12 + shlq $4,%r13 + jmp L$ocb_enc_loop6 + +.p2align 5 +L$ocb_enc_loop6: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 +.byte 102,15,56,220,240 +.byte 102,15,56,220,248 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$ocb_enc_loop6 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 +.byte 102,15,56,220,241 +.byte 102,15,56,220,249 + movups 16(%r11),%xmm1 + shlq $4,%r14 + +.byte 102,65,15,56,221,210 + movdqu (%rbx),%xmm10 + movq %r10,%rax +.byte 102,65,15,56,221,219 +.byte 102,65,15,56,221,228 +.byte 102,65,15,56,221,237 +.byte 102,65,15,56,221,246 +.byte 102,65,15,56,221,255 + .byte 0xf3,0xc3 + + + +.p2align 5 +__ocb_encrypt4: + pxor %xmm9,%xmm15 + movdqu (%rbx,%r12,1),%xmm11 + movdqa %xmm10,%xmm12 + movdqu (%rbx,%r13,1),%xmm13 + pxor %xmm15,%xmm10 + pxor %xmm10,%xmm11 + pxor %xmm2,%xmm8 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm12 + pxor %xmm3,%xmm8 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm13 + pxor %xmm4,%xmm8 + pxor %xmm12,%xmm4 + pxor %xmm5,%xmm8 + pxor %xmm13,%xmm5 + movups 32(%r11),%xmm0 + + pxor %xmm9,%xmm10 + pxor %xmm9,%xmm11 + pxor %xmm9,%xmm12 + pxor %xmm9,%xmm13 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups 48(%r11),%xmm1 + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movups 64(%r11),%xmm0 + jmp L$ocb_enc_loop4 + +.p2align 5 +L$ocb_enc_loop4: +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,220,208 +.byte 102,15,56,220,216 +.byte 102,15,56,220,224 +.byte 102,15,56,220,232 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$ocb_enc_loop4 + +.byte 102,15,56,220,209 +.byte 102,15,56,220,217 +.byte 102,15,56,220,225 +.byte 102,15,56,220,233 + movups 16(%r11),%xmm1 + movq %r10,%rax + +.byte 102,65,15,56,221,210 +.byte 102,65,15,56,221,219 +.byte 102,65,15,56,221,228 +.byte 102,65,15,56,221,237 + .byte 0xf3,0xc3 + + + +.p2align 5 +__ocb_encrypt1: + pxor %xmm15,%xmm7 + pxor %xmm9,%xmm7 + pxor %xmm2,%xmm8 + pxor %xmm7,%xmm2 + movups 32(%r11),%xmm0 + +.byte 102,15,56,220,209 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm7 + +.byte 102,15,56,220,208 + movups 64(%r11),%xmm0 + jmp L$ocb_enc_loop1 + +.p2align 5 +L$ocb_enc_loop1: +.byte 102,15,56,220,209 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,220,208 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$ocb_enc_loop1 + +.byte 102,15,56,220,209 + movups 16(%r11),%xmm1 + movq %r10,%rax + +.byte 102,15,56,221,215 + .byte 0xf3,0xc3 + + +.globl _aesni_ocb_decrypt +.private_extern _aesni_ocb_decrypt + +.p2align 5 +_aesni_ocb_decrypt: + leaq (%rsp),%rax + 
pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + movq 8(%rax),%rbx + movq 8+8(%rax),%rbp + + movl 240(%rcx),%r10d + movq %rcx,%r11 + shll $4,%r10d + movups (%rcx),%xmm9 + movups 16(%rcx,%r10,1),%xmm1 + + movdqu (%r9),%xmm15 + pxor %xmm1,%xmm9 + pxor %xmm1,%xmm15 + + movl $16+32,%eax + leaq 32(%r11,%r10,1),%rcx + movups 16(%r11),%xmm1 + subq %r10,%rax + movq %rax,%r10 + + movdqu (%rbx),%xmm10 + movdqu (%rbp),%xmm8 + + testq $1,%r8 + jnz L$ocb_dec_odd + + bsfq %r8,%r12 + addq $1,%r8 + shlq $4,%r12 + movdqu (%rbx,%r12,1),%xmm7 + movdqu (%rdi),%xmm2 + leaq 16(%rdi),%rdi + + call __ocb_decrypt1 + + movdqa %xmm7,%xmm15 + movups %xmm2,(%rsi) + xorps %xmm2,%xmm8 + leaq 16(%rsi),%rsi + subq $1,%rdx + jz L$ocb_dec_done + +L$ocb_dec_odd: + leaq 1(%r8),%r12 + leaq 3(%r8),%r13 + leaq 5(%r8),%r14 + leaq 6(%r8),%r8 + bsfq %r12,%r12 + bsfq %r13,%r13 + bsfq %r14,%r14 + shlq $4,%r12 + shlq $4,%r13 + shlq $4,%r14 + + subq $6,%rdx + jc L$ocb_dec_short + jmp L$ocb_dec_grandloop + +.p2align 5 +L$ocb_dec_grandloop: + movdqu 0(%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqu 32(%rdi),%xmm4 + movdqu 48(%rdi),%xmm5 + movdqu 64(%rdi),%xmm6 + movdqu 80(%rdi),%xmm7 + leaq 96(%rdi),%rdi + + call __ocb_decrypt6 + + movups %xmm2,0(%rsi) + pxor %xmm2,%xmm8 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm8 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm8 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm8 + movups %xmm6,64(%rsi) + pxor %xmm6,%xmm8 + movups %xmm7,80(%rsi) + pxor %xmm7,%xmm8 + leaq 96(%rsi),%rsi + subq $6,%rdx + jnc L$ocb_dec_grandloop + +L$ocb_dec_short: + addq $6,%rdx + jz L$ocb_dec_done + + movdqu 0(%rdi),%xmm2 + cmpq $2,%rdx + jb L$ocb_dec_one + movdqu 16(%rdi),%xmm3 + je L$ocb_dec_two + + movdqu 32(%rdi),%xmm4 + cmpq $4,%rdx + jb L$ocb_dec_three + movdqu 48(%rdi),%xmm5 + je L$ocb_dec_four + + movdqu 64(%rdi),%xmm6 + pxor %xmm7,%xmm7 + + call __ocb_decrypt6 + + movdqa %xmm14,%xmm15 + movups %xmm2,0(%rsi) + pxor %xmm2,%xmm8 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm8 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm8 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm8 + movups %xmm6,64(%rsi) + pxor %xmm6,%xmm8 + + jmp L$ocb_dec_done + +.p2align 4 +L$ocb_dec_one: + movdqa %xmm10,%xmm7 + + call __ocb_decrypt1 + + movdqa %xmm7,%xmm15 + movups %xmm2,0(%rsi) + xorps %xmm2,%xmm8 + jmp L$ocb_dec_done + +.p2align 4 +L$ocb_dec_two: + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + + call __ocb_decrypt4 + + movdqa %xmm11,%xmm15 + movups %xmm2,0(%rsi) + xorps %xmm2,%xmm8 + movups %xmm3,16(%rsi) + xorps %xmm3,%xmm8 + + jmp L$ocb_dec_done + +.p2align 4 +L$ocb_dec_three: + pxor %xmm5,%xmm5 + + call __ocb_decrypt4 + + movdqa %xmm12,%xmm15 + movups %xmm2,0(%rsi) + xorps %xmm2,%xmm8 + movups %xmm3,16(%rsi) + xorps %xmm3,%xmm8 + movups %xmm4,32(%rsi) + xorps %xmm4,%xmm8 + + jmp L$ocb_dec_done + +.p2align 4 +L$ocb_dec_four: + call __ocb_decrypt4 + + movdqa %xmm13,%xmm15 + movups %xmm2,0(%rsi) + pxor %xmm2,%xmm8 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm8 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm8 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm8 + +L$ocb_dec_done: + pxor %xmm0,%xmm15 + movdqu %xmm8,(%rbp) + movdqu %xmm15,(%r9) + + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 + pxor %xmm10,%xmm10 + pxor %xmm11,%xmm11 + pxor %xmm12,%xmm12 + pxor %xmm13,%xmm13 + pxor %xmm14,%xmm14 + pxor %xmm15,%xmm15 + leaq 40(%rsp),%rax + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbp + movq -8(%rax),%rbx + 
leaq (%rax),%rsp +L$ocb_dec_epilogue: + .byte 0xf3,0xc3 + + + +.p2align 5 +__ocb_decrypt6: + pxor %xmm9,%xmm15 + movdqu (%rbx,%r12,1),%xmm11 + movdqa %xmm10,%xmm12 + movdqu (%rbx,%r13,1),%xmm13 + movdqa %xmm10,%xmm14 + pxor %xmm15,%xmm10 + movdqu (%rbx,%r14,1),%xmm15 + pxor %xmm10,%xmm11 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm12 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm13 + pxor %xmm12,%xmm4 + pxor %xmm13,%xmm14 + pxor %xmm13,%xmm5 + pxor %xmm14,%xmm15 + pxor %xmm14,%xmm6 + pxor %xmm15,%xmm7 + movups 32(%r11),%xmm0 + + leaq 1(%r8),%r12 + leaq 3(%r8),%r13 + leaq 5(%r8),%r14 + addq $6,%r8 + pxor %xmm9,%xmm10 + bsfq %r12,%r12 + bsfq %r13,%r13 + bsfq %r14,%r14 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + pxor %xmm9,%xmm11 + pxor %xmm9,%xmm12 +.byte 102,15,56,222,241 + pxor %xmm9,%xmm13 + pxor %xmm9,%xmm14 +.byte 102,15,56,222,249 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm15 + +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 + movups 64(%r11),%xmm0 + shlq $4,%r12 + shlq $4,%r13 + jmp L$ocb_dec_loop6 + +.p2align 5 +L$ocb_dec_loop6: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 +.byte 102,15,56,222,240 +.byte 102,15,56,222,248 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$ocb_dec_loop6 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 +.byte 102,15,56,222,241 +.byte 102,15,56,222,249 + movups 16(%r11),%xmm1 + shlq $4,%r14 + +.byte 102,65,15,56,223,210 + movdqu (%rbx),%xmm10 + movq %r10,%rax +.byte 102,65,15,56,223,219 +.byte 102,65,15,56,223,228 +.byte 102,65,15,56,223,237 +.byte 102,65,15,56,223,246 +.byte 102,65,15,56,223,255 + .byte 0xf3,0xc3 + + + +.p2align 5 +__ocb_decrypt4: + pxor %xmm9,%xmm15 + movdqu (%rbx,%r12,1),%xmm11 + movdqa %xmm10,%xmm12 + movdqu (%rbx,%r13,1),%xmm13 + pxor %xmm15,%xmm10 + pxor %xmm10,%xmm11 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm12 + pxor %xmm11,%xmm3 + pxor %xmm12,%xmm13 + pxor %xmm12,%xmm4 + pxor %xmm13,%xmm5 + movups 32(%r11),%xmm0 + + pxor %xmm9,%xmm10 + pxor %xmm9,%xmm11 + pxor %xmm9,%xmm12 + pxor %xmm9,%xmm13 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + movups 48(%r11),%xmm1 + +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 + movups 64(%r11),%xmm0 + jmp L$ocb_dec_loop4 + +.p2align 5 +L$ocb_dec_loop4: +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,222,208 +.byte 102,15,56,222,216 +.byte 102,15,56,222,224 +.byte 102,15,56,222,232 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$ocb_dec_loop4 + +.byte 102,15,56,222,209 +.byte 102,15,56,222,217 +.byte 102,15,56,222,225 +.byte 102,15,56,222,233 + movups 16(%r11),%xmm1 + movq %r10,%rax + +.byte 102,65,15,56,223,210 +.byte 102,65,15,56,223,219 +.byte 102,65,15,56,223,228 +.byte 102,65,15,56,223,237 + .byte 0xf3,0xc3 + + + +.p2align 5 +__ocb_decrypt1: + pxor %xmm15,%xmm7 + pxor %xmm9,%xmm7 + pxor %xmm7,%xmm2 + movups 32(%r11),%xmm0 + +.byte 102,15,56,222,209 + movups 48(%r11),%xmm1 + pxor %xmm9,%xmm7 + +.byte 102,15,56,222,208 + movups 64(%r11),%xmm0 + jmp 
L$ocb_dec_loop1 + +.p2align 5 +L$ocb_dec_loop1: +.byte 102,15,56,222,209 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + +.byte 102,15,56,222,208 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$ocb_dec_loop1 + +.byte 102,15,56,222,209 + movups 16(%r11),%xmm1 + movq %r10,%rax + +.byte 102,15,56,223,215 + .byte 0xf3,0xc3 + .globl _aesni_cbc_encrypt .private_extern _aesni_cbc_encrypt @@ -2637,11 +3451,11 @@ L$oop_dec1_16: jmp L$cbc_ret .p2align 4 L$cbc_decrypt_bulk: - leaq (%rsp),%rax + leaq (%rsp),%r11 pushq %rbp subq $16,%rsp andq $-16,%rsp - leaq -8(%rax),%rbp + movq %rcx,%rbp movups (%r8),%xmm10 movl %r10d,%eax cmpq $0x50,%rdx @@ -2659,7 +3473,8 @@ L$cbc_decrypt_bulk: movdqa %xmm5,%xmm14 movdqu 80(%rdi),%xmm7 movdqa %xmm6,%xmm15 - movl _OPENSSL_ia32cap_P+4(%rip),%r9d + leaq _OPENSSL_ia32cap_P(%rip),%r9 + movl 4(%r9),%r9d cmpq $0x70,%rdx jbe L$cbc_dec_six_or_seven @@ -2681,7 +3496,7 @@ L$cbc_dec_loop8_enter: pxor %xmm0,%xmm3 movups 16-112(%rcx),%xmm1 pxor %xmm0,%xmm4 - xorq %r11,%r11 + movq $-1,%rbp cmpq $0x70,%rdx pxor %xmm0,%xmm5 pxor %xmm0,%xmm6 @@ -2697,10 +3512,10 @@ L$cbc_dec_loop8_enter: .byte 102,15,56,222,241 .byte 102,15,56,222,249 .byte 102,68,15,56,222,193 - setnc %r11b - shlq $7,%r11 + adcq $0,%rbp + andq $128,%rbp .byte 102,68,15,56,222,201 - addq %rdi,%r11 + addq %rdi,%rbp movups 48-112(%rcx),%xmm1 .byte 102,15,56,222,208 .byte 102,15,56,222,216 @@ -2838,18 +3653,18 @@ L$cbc_dec_done: movdqu 112(%rdi),%xmm0 .byte 102,65,15,56,223,228 leaq 128(%rdi),%rdi - movdqu 0(%r11),%xmm11 + movdqu 0(%rbp),%xmm11 .byte 102,65,15,56,223,237 .byte 102,65,15,56,223,246 - movdqu 16(%r11),%xmm12 - movdqu 32(%r11),%xmm13 + movdqu 16(%rbp),%xmm12 + movdqu 32(%rbp),%xmm13 .byte 102,65,15,56,223,255 .byte 102,68,15,56,223,193 - movdqu 48(%r11),%xmm14 - movdqu 64(%r11),%xmm15 + movdqu 48(%rbp),%xmm14 + movdqu 64(%rbp),%xmm15 .byte 102,69,15,56,223,202 movdqa %xmm0,%xmm10 - movdqu 80(%r11),%xmm1 + movdqu 80(%rbp),%xmm1 movups -112(%rcx),%xmm0 movups %xmm2,(%rsi) @@ -2968,7 +3783,7 @@ L$cbc_dec_loop6_enter: pxor %xmm13,%xmm5 movdqu %xmm4,32(%rsi) pxor %xmm14,%xmm6 - movq %r11,%rcx + movq %rbp,%rcx movdqu %xmm5,48(%rsi) pxor %xmm15,%xmm7 movl %r10d,%eax @@ -3121,8 +3936,8 @@ L$cbc_dec_tail_partial: L$cbc_dec_ret: xorps %xmm0,%xmm0 pxor %xmm1,%xmm1 - leaq (%rbp),%rsp - popq %rbp + movq -8(%r11),%rbp + leaq (%r11),%rsp L$cbc_ret: .byte 0xf3,0xc3 @@ -3180,10 +3995,11 @@ __aesni_set_encrypt_key: testq %rdx,%rdx jz L$enc_key_ret - movl $268437504,%r10d movups (%rdi),%xmm0 xorps %xmm4,%xmm4 - andl _OPENSSL_ia32cap_P+4(%rip),%r10d + leaq _OPENSSL_ia32cap_P(%rip),%r10 + movl 4(%r10),%r10d + andl $268437504,%r10d leaq 16(%rdx),%rax cmpl $256,%esi je L$14rounds diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/aes/bsaes-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S similarity index 98% rename from packager/third_party/boringssl/mac-x86_64/crypto/aes/bsaes-x86_64.S rename to packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S index ad802e3d5d..195abd3b5c 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/aes/bsaes-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/bsaes-x86_64.S @@ -1,4 +1,4 @@ -#if defined(__x86_64__) +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) .text @@ -1302,15 +1302,14 @@ L$cbc_dec_bzero: cmpq %rax,%rbp ja L$cbc_dec_bzero - leaq (%rbp),%rsp - movq 72(%rsp),%r15 - movq 80(%rsp),%r14 - movq 88(%rsp),%r13 - movq 96(%rsp),%r12 - movq 104(%rsp),%rbx - movq 112(%rsp),%rax - leaq 
120(%rsp),%rsp - movq %rax,%rbp + leaq 120(%rbp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbx + movq -8(%rax),%rbp + leaq (%rax),%rsp L$cbc_dec_epilogue: .byte 0xf3,0xc3 @@ -1503,15 +1502,14 @@ L$ctr_enc_bzero: cmpq %rax,%rbp ja L$ctr_enc_bzero - leaq (%rbp),%rsp - movq 72(%rsp),%r15 - movq 80(%rsp),%r14 - movq 88(%rsp),%r13 - movq 96(%rsp),%r12 - movq 104(%rsp),%rbx - movq 112(%rsp),%rax - leaq 120(%rsp),%rsp - movq %rax,%rbp + leaq 120(%rbp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbx + movq -8(%rax),%rbp + leaq (%rax),%rsp L$ctr_enc_epilogue: .byte 0xf3,0xc3 @@ -1955,15 +1953,14 @@ L$xts_enc_bzero: cmpq %rax,%rbp ja L$xts_enc_bzero - leaq (%rbp),%rsp - movq 72(%rsp),%r15 - movq 80(%rsp),%r14 - movq 88(%rsp),%r13 - movq 96(%rsp),%r12 - movq 104(%rsp),%rbx - movq 112(%rsp),%rax - leaq 120(%rsp),%rsp - movq %rax,%rbp + leaq 120(%rbp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbx + movq -8(%rax),%rbp + leaq (%rax),%rsp L$xts_enc_epilogue: .byte 0xf3,0xc3 @@ -2434,15 +2431,14 @@ L$xts_dec_bzero: cmpq %rax,%rbp ja L$xts_dec_bzero - leaq (%rbp),%rsp - movq 72(%rsp),%r15 - movq 80(%rsp),%r14 - movq 88(%rsp),%r13 - movq 96(%rsp),%r12 - movq 104(%rsp),%rbx - movq 112(%rsp),%rax - leaq 120(%rsp),%rsp - movq %rax,%rbp + leaq 120(%rbp),%rax + movq -48(%rax),%r15 + movq -40(%rax),%r14 + movq -32(%rax),%r13 + movq -24(%rax),%r12 + movq -16(%rax),%rbx + movq -8(%rax),%rbp + leaq (%rax),%rsp L$xts_dec_epilogue: .byte 0xf3,0xc3 diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/modes/ghash-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/ghash-x86_64.S similarity index 67% rename from packager/third_party/boringssl/mac-x86_64/crypto/modes/ghash-x86_64.S rename to packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/ghash-x86_64.S index 1072c7fcd3..78b88cc28d 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/modes/ghash-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/ghash-x86_64.S @@ -1,4 +1,4 @@ -#if defined(__x86_64__) +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) .text @@ -10,6 +10,10 @@ _gcm_gmult_4bit: pushq %rbx pushq %rbp pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + subq $280,%rsp L$gmult_prologue: movzbq 15(%rdi),%r8 @@ -86,8 +90,9 @@ L$break1: movq %r8,8(%rdi) movq %r9,(%rdi) - movq 16(%rsp),%rbx - leaq 24(%rsp),%rsp + leaq 280+48(%rsp),%rsi + movq -8(%rsi),%rbx + leaq (%rsi),%rsp L$gmult_epilogue: .byte 0xf3,0xc3 @@ -647,14 +652,14 @@ L$outer_loop: movq %r8,8(%rdi) movq %r9,(%rdi) - leaq 280(%rsp),%rsi - movq 0(%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + leaq 280+48(%rsp),%rsi + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq 0(%rsi),%rsp L$ghash_epilogue: .byte 0xf3,0xc3 @@ -884,7 +889,8 @@ L$_ghash_clmul: jz L$odd_tail movdqu 16(%rsi),%xmm6 - movl _OPENSSL_ia32cap_P+4(%rip),%eax + leaq _OPENSSL_ia32cap_P(%rip),%rax + movl 4(%rax),%eax cmpq $0x30,%rcx jb L$skip4x @@ -1256,7 +1262,108 @@ L$done: .p2align 5 _gcm_init_avx: - jmp L$_init_clmul + vzeroupper + + vmovdqu (%rsi),%xmm2 + vpshufd $78,%xmm2,%xmm2 + + + vpshufd $255,%xmm2,%xmm4 + vpsrlq $63,%xmm2,%xmm3 + vpsllq $1,%xmm2,%xmm2 + vpxor %xmm5,%xmm5,%xmm5 + vpcmpgtd 
%xmm4,%xmm5,%xmm5 + vpslldq $8,%xmm3,%xmm3 + vpor %xmm3,%xmm2,%xmm2 + + + vpand L$0x1c2_polynomial(%rip),%xmm5,%xmm5 + vpxor %xmm5,%xmm2,%xmm2 + + vpunpckhqdq %xmm2,%xmm2,%xmm6 + vmovdqa %xmm2,%xmm0 + vpxor %xmm2,%xmm6,%xmm6 + movq $4,%r10 + jmp L$init_start_avx +.p2align 5 +L$init_loop_avx: + vpalignr $8,%xmm3,%xmm4,%xmm5 + vmovdqu %xmm5,-16(%rdi) + vpunpckhqdq %xmm0,%xmm0,%xmm3 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 + vpxor %xmm0,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + + vpslldq $8,%xmm3,%xmm4 + vpsrldq $8,%xmm3,%xmm3 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm3,%xmm1,%xmm1 + vpsllq $57,%xmm0,%xmm3 + vpsllq $62,%xmm0,%xmm4 + vpxor %xmm3,%xmm4,%xmm4 + vpsllq $63,%xmm0,%xmm3 + vpxor %xmm3,%xmm4,%xmm4 + vpslldq $8,%xmm4,%xmm3 + vpsrldq $8,%xmm4,%xmm4 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm4,%xmm1,%xmm1 + + vpsrlq $1,%xmm0,%xmm4 + vpxor %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $5,%xmm4,%xmm4 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $1,%xmm0,%xmm0 + vpxor %xmm1,%xmm0,%xmm0 +L$init_start_avx: + vmovdqa %xmm0,%xmm5 + vpunpckhqdq %xmm0,%xmm0,%xmm3 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 + vpxor %xmm0,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + + vpslldq $8,%xmm3,%xmm4 + vpsrldq $8,%xmm3,%xmm3 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm3,%xmm1,%xmm1 + vpsllq $57,%xmm0,%xmm3 + vpsllq $62,%xmm0,%xmm4 + vpxor %xmm3,%xmm4,%xmm4 + vpsllq $63,%xmm0,%xmm3 + vpxor %xmm3,%xmm4,%xmm4 + vpslldq $8,%xmm4,%xmm3 + vpsrldq $8,%xmm4,%xmm4 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm4,%xmm1,%xmm1 + + vpsrlq $1,%xmm0,%xmm4 + vpxor %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $5,%xmm4,%xmm4 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $1,%xmm0,%xmm0 + vpxor %xmm1,%xmm0,%xmm0 + vpshufd $78,%xmm5,%xmm3 + vpshufd $78,%xmm0,%xmm4 + vpxor %xmm5,%xmm3,%xmm3 + vmovdqu %xmm5,0(%rdi) + vpxor %xmm0,%xmm4,%xmm4 + vmovdqu %xmm0,16(%rdi) + leaq 48(%rdi),%rdi + subq $1,%r10 + jnz L$init_loop_avx + + vpalignr $8,%xmm4,%xmm3,%xmm5 + vmovdqu %xmm5,-16(%rdi) + + vzeroupper + .byte 0xf3,0xc3 .globl _gcm_gmult_avx .private_extern _gcm_gmult_avx @@ -1270,7 +1377,377 @@ _gcm_gmult_avx: .p2align 5 _gcm_ghash_avx: - jmp L$_ghash_clmul + vzeroupper + + vmovdqu (%rdi),%xmm10 + leaq L$0x1c2_polynomial(%rip),%r10 + leaq 64(%rsi),%rsi + vmovdqu L$bswap_mask(%rip),%xmm13 + vpshufb %xmm13,%xmm10,%xmm10 + cmpq $0x80,%rcx + jb L$short_avx + subq $0x80,%rcx + + vmovdqu 112(%rdx),%xmm14 + vmovdqu 0-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm14 + vmovdqu 32-64(%rsi),%xmm7 + + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vmovdqu 96(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm14,%xmm9,%xmm9 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 16-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vmovdqu 80(%rdx),%xmm14 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 48-64(%rsi),%xmm6 + vpxor %xmm14,%xmm9,%xmm9 + vmovdqu 64(%rdx),%xmm15 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 80-64(%rsi),%xmm7 + + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm1,%xmm4,%xmm4 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq 
$0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vmovdqu 48(%rdx),%xmm14 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm4,%xmm1,%xmm1 + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 96-64(%rsi),%xmm6 + vpxor %xmm5,%xmm2,%xmm2 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 128-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu 32(%rdx),%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm1,%xmm4,%xmm4 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 112-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vmovdqu 16(%rdx),%xmm14 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm4,%xmm1,%xmm1 + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 144-64(%rsi),%xmm6 + vpxor %xmm5,%xmm2,%xmm2 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 176-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu (%rdx),%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm1,%xmm4,%xmm4 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 160-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 + + leaq 128(%rdx),%rdx + cmpq $0x80,%rcx + jb L$tail_avx + + vpxor %xmm10,%xmm15,%xmm15 + subq $0x80,%rcx + jmp L$oop8x_avx + +.p2align 5 +L$oop8x_avx: + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vmovdqu 112(%rdx),%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpxor %xmm15,%xmm8,%xmm8 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11 + vmovdqu 0-64(%rsi),%xmm6 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12 + vmovdqu 32-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu 96(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm3,%xmm10,%xmm10 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vxorps %xmm4,%xmm11,%xmm11 + vmovdqu 16-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm5,%xmm12,%xmm12 + vxorps %xmm15,%xmm8,%xmm8 + + vmovdqu 80(%rdx),%xmm14 + vpxor %xmm10,%xmm12,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm11,%xmm12,%xmm12 + vpslldq $8,%xmm12,%xmm9 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vpsrldq $8,%xmm12,%xmm12 + vpxor %xmm9,%xmm10,%xmm10 + vmovdqu 48-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm14 + vxorps %xmm12,%xmm11,%xmm11 + vpxor %xmm1,%xmm4,%xmm4 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 80-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 64(%rdx),%xmm15 + vpalignr $8,%xmm10,%xmm10,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vxorps %xmm15,%xmm8,%xmm8 + vpxor %xmm5,%xmm2,%xmm2 + + vmovdqu 48(%rdx),%xmm14 + vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 96-64(%rsi),%xmm6 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq 
$0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 128-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 32(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 112-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + vpxor %xmm5,%xmm2,%xmm2 + vxorps %xmm12,%xmm10,%xmm10 + + vmovdqu 16(%rdx),%xmm14 + vpalignr $8,%xmm10,%xmm10,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 144-64(%rsi),%xmm6 + vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 + vxorps %xmm11,%xmm12,%xmm12 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 176-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu (%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 160-64(%rsi),%xmm6 + vpxor %xmm12,%xmm15,%xmm15 + vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 + vpxor %xmm10,%xmm15,%xmm15 + + leaq 128(%rdx),%rdx + subq $0x80,%rcx + jnc L$oop8x_avx + + addq $0x80,%rcx + jmp L$tail_no_xor_avx + +.p2align 5 +L$short_avx: + vmovdqu -16(%rdx,%rcx,1),%xmm14 + leaq (%rdx,%rcx,1),%rdx + vmovdqu 0-64(%rsi),%xmm6 + vmovdqu 32-64(%rsi),%xmm7 + vpshufb %xmm13,%xmm14,%xmm15 + + vmovdqa %xmm0,%xmm3 + vmovdqa %xmm1,%xmm4 + vmovdqa %xmm2,%xmm5 + subq $0x10,%rcx + jz L$tail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -32(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 16-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz L$tail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -48(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 48-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovdqu 80-64(%rsi),%xmm7 + subq $0x10,%rcx + jz L$tail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -64(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz L$tail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -80(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 96-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovdqu 128-64(%rsi),%xmm7 + subq $0x10,%rcx + jz L$tail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -96(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 112-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq 
$8,%xmm7,%xmm7 + subq $0x10,%rcx + jz L$tail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -112(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 144-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovq 184-64(%rsi),%xmm7 + subq $0x10,%rcx + jmp L$tail_avx + +.p2align 5 +L$tail_avx: + vpxor %xmm10,%xmm15,%xmm15 +L$tail_no_xor_avx: + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + + vmovdqu (%r10),%xmm12 + + vpxor %xmm0,%xmm3,%xmm10 + vpxor %xmm1,%xmm4,%xmm11 + vpxor %xmm2,%xmm5,%xmm5 + + vpxor %xmm10,%xmm5,%xmm5 + vpxor %xmm11,%xmm5,%xmm5 + vpslldq $8,%xmm5,%xmm9 + vpsrldq $8,%xmm5,%xmm5 + vpxor %xmm9,%xmm10,%xmm10 + vpxor %xmm5,%xmm11,%xmm11 + + vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 + vpalignr $8,%xmm10,%xmm10,%xmm10 + vpxor %xmm9,%xmm10,%xmm10 + + vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 + vpalignr $8,%xmm10,%xmm10,%xmm10 + vpxor %xmm11,%xmm10,%xmm10 + vpxor %xmm9,%xmm10,%xmm10 + + cmpq $0,%rcx + jne L$short_avx + + vpshufb %xmm13,%xmm10,%xmm10 + vmovdqu %xmm10,(%rdi) + vzeroupper + .byte 0xf3,0xc3 .p2align 6 L$bswap_mask: diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/md5/md5-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/md5-x86_64.S similarity index 99% rename from packager/third_party/boringssl/mac-x86_64/crypto/md5/md5-x86_64.S rename to packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/md5-x86_64.S index 16fd2ccef8..776c116046 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/md5/md5-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/md5-x86_64.S @@ -1,4 +1,4 @@ -#if defined(__x86_64__) +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) .text .p2align 4 diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/ec/p256-x86_64-asm.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/p256-x86_64-asm.S similarity index 89% rename from packager/third_party/boringssl/mac-x86_64/crypto/ec/p256-x86_64-asm.S rename to packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/p256-x86_64-asm.S index 1cd0cc3f5c..f7875772ad 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/ec/p256-x86_64-asm.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/p256-x86_64-asm.S @@ -1,4 +1,4 @@ -#if defined(__x86_64__) +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) .text @@ -17,47 +17,6 @@ L$ONE_mont: .quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe -.p2align 6 -ecp_nistz256_mul_by_2: - pushq %r12 - pushq %r13 - - movq 0(%rsi),%r8 - movq 8(%rsi),%r9 - addq %r8,%r8 - movq 16(%rsi),%r10 - adcq %r9,%r9 - movq 24(%rsi),%r11 - leaq L$poly(%rip),%rsi - movq %r8,%rax - adcq %r10,%r10 - adcq %r11,%r11 - movq %r9,%rdx - sbbq %r13,%r13 - - subq 0(%rsi),%r8 - movq %r10,%rcx - sbbq 8(%rsi),%r9 - sbbq 16(%rsi),%r10 - movq %r11,%r12 - sbbq 24(%rsi),%r11 - testq %r13,%r13 - - cmovzq %rax,%r8 - cmovzq %rdx,%r9 - movq %r8,0(%rdi) - cmovzq %rcx,%r10 - movq %r9,8(%rdi) - cmovzq %r12,%r11 - movq %r10,16(%rdi) - movq %r11,24(%rdi) - - popq %r13 - popq %r12 - .byte 0xf3,0xc3 - - - .globl _ecp_nistz256_neg .private_extern _ecp_nistz256_neg @@ -552,106 +511,15 @@ 
__ecp_nistz256_sqr_montq: - - - - -.globl _ecp_nistz256_from_mont -.private_extern _ecp_nistz256_from_mont - -.p2align 5 -_ecp_nistz256_from_mont: - pushq %r12 - pushq %r13 - - movq 0(%rsi),%rax - movq L$poly+24(%rip),%r13 - movq 8(%rsi),%r9 - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - movq %rax,%r8 - movq L$poly+8(%rip),%r12 - - - - movq %rax,%rcx - shlq $32,%r8 - mulq %r13 - shrq $32,%rcx - addq %r8,%r9 - adcq %rcx,%r10 - adcq %rax,%r11 - movq %r9,%rax - adcq $0,%rdx - - - - movq %r9,%rcx - shlq $32,%r9 - movq %rdx,%r8 - mulq %r13 - shrq $32,%rcx - addq %r9,%r10 - adcq %rcx,%r11 - adcq %rax,%r8 - movq %r10,%rax - adcq $0,%rdx - - - - movq %r10,%rcx - shlq $32,%r10 - movq %rdx,%r9 - mulq %r13 - shrq $32,%rcx - addq %r10,%r11 - adcq %rcx,%r8 - adcq %rax,%r9 - movq %r11,%rax - adcq $0,%rdx - - - - movq %r11,%rcx - shlq $32,%r11 - movq %rdx,%r10 - mulq %r13 - shrq $32,%rcx - addq %r11,%r8 - adcq %rcx,%r9 - movq %r8,%rcx - adcq %rax,%r10 - movq %r9,%rsi - adcq $0,%rdx - - subq $-1,%r8 - movq %r10,%rax - sbbq %r12,%r9 - sbbq $0,%r10 - movq %rdx,%r11 - sbbq %r13,%rdx - sbbq %r13,%r13 - - cmovnzq %rcx,%r8 - cmovnzq %rsi,%r9 - movq %r8,0(%rdi) - cmovnzq %rax,%r10 - movq %r9,8(%rdi) - cmovzq %rdx,%r11 - movq %r10,16(%rdi) - movq %r11,24(%rdi) - - popq %r13 - popq %r12 - .byte 0xf3,0xc3 - - - .globl _ecp_nistz256_select_w5 .private_extern _ecp_nistz256_select_w5 .p2align 5 _ecp_nistz256_select_w5: + leaq _OPENSSL_ia32cap_P(%rip),%rax + movq 8(%rax),%rax + testl $32,%eax + jnz L$avx2_select_w5 movdqa L$One(%rip),%xmm0 movd %edx,%xmm1 @@ -712,6 +580,10 @@ L$select_loop_sse_w5: .p2align 5 _ecp_nistz256_select_w7: + leaq _OPENSSL_ia32cap_P(%rip),%rax + movq 8(%rax),%rax + testl $32,%eax + jnz L$avx2_select_w7 movdqa L$One(%rip),%xmm8 movd %edx,%xmm1 @@ -753,24 +625,155 @@ L$select_loop_sse_w7: movdqu %xmm5,48(%rdi) .byte 0xf3,0xc3 + + + +.p2align 5 +ecp_nistz256_avx2_select_w5: +L$avx2_select_w5: + vzeroupper + vmovdqa L$Two(%rip),%ymm0 + + vpxor %ymm2,%ymm2,%ymm2 + vpxor %ymm3,%ymm3,%ymm3 + vpxor %ymm4,%ymm4,%ymm4 + + vmovdqa L$One(%rip),%ymm5 + vmovdqa L$Two(%rip),%ymm10 + + vmovd %edx,%xmm1 + vpermd %ymm1,%ymm2,%ymm1 + + movq $8,%rax +L$select_loop_avx2_w5: + + vmovdqa 0(%rsi),%ymm6 + vmovdqa 32(%rsi),%ymm7 + vmovdqa 64(%rsi),%ymm8 + + vmovdqa 96(%rsi),%ymm11 + vmovdqa 128(%rsi),%ymm12 + vmovdqa 160(%rsi),%ymm13 + + vpcmpeqd %ymm1,%ymm5,%ymm9 + vpcmpeqd %ymm1,%ymm10,%ymm14 + + vpaddd %ymm0,%ymm5,%ymm5 + vpaddd %ymm0,%ymm10,%ymm10 + leaq 192(%rsi),%rsi + + vpand %ymm9,%ymm6,%ymm6 + vpand %ymm9,%ymm7,%ymm7 + vpand %ymm9,%ymm8,%ymm8 + vpand %ymm14,%ymm11,%ymm11 + vpand %ymm14,%ymm12,%ymm12 + vpand %ymm14,%ymm13,%ymm13 + + vpxor %ymm6,%ymm2,%ymm2 + vpxor %ymm7,%ymm3,%ymm3 + vpxor %ymm8,%ymm4,%ymm4 + vpxor %ymm11,%ymm2,%ymm2 + vpxor %ymm12,%ymm3,%ymm3 + vpxor %ymm13,%ymm4,%ymm4 + + decq %rax + jnz L$select_loop_avx2_w5 + + vmovdqu %ymm2,0(%rdi) + vmovdqu %ymm3,32(%rdi) + vmovdqu %ymm4,64(%rdi) + vzeroupper + .byte 0xf3,0xc3 + + + + .globl _ecp_nistz256_avx2_select_w7 .private_extern _ecp_nistz256_avx2_select_w7 .p2align 5 _ecp_nistz256_avx2_select_w7: -.byte 0x0f,0x0b +L$avx2_select_w7: + vzeroupper + vmovdqa L$Three(%rip),%ymm0 + + vpxor %ymm2,%ymm2,%ymm2 + vpxor %ymm3,%ymm3,%ymm3 + + vmovdqa L$One(%rip),%ymm4 + vmovdqa L$Two(%rip),%ymm8 + vmovdqa L$Three(%rip),%ymm12 + + vmovd %edx,%xmm1 + vpermd %ymm1,%ymm2,%ymm1 + + + movq $21,%rax +L$select_loop_avx2_w7: + + vmovdqa 0(%rsi),%ymm5 + vmovdqa 32(%rsi),%ymm6 + + vmovdqa 64(%rsi),%ymm9 + vmovdqa 96(%rsi),%ymm10 + + vmovdqa 128(%rsi),%ymm13 + vmovdqa 
160(%rsi),%ymm14 + + vpcmpeqd %ymm1,%ymm4,%ymm7 + vpcmpeqd %ymm1,%ymm8,%ymm11 + vpcmpeqd %ymm1,%ymm12,%ymm15 + + vpaddd %ymm0,%ymm4,%ymm4 + vpaddd %ymm0,%ymm8,%ymm8 + vpaddd %ymm0,%ymm12,%ymm12 + leaq 192(%rsi),%rsi + + vpand %ymm7,%ymm5,%ymm5 + vpand %ymm7,%ymm6,%ymm6 + vpand %ymm11,%ymm9,%ymm9 + vpand %ymm11,%ymm10,%ymm10 + vpand %ymm15,%ymm13,%ymm13 + vpand %ymm15,%ymm14,%ymm14 + + vpxor %ymm5,%ymm2,%ymm2 + vpxor %ymm6,%ymm3,%ymm3 + vpxor %ymm9,%ymm2,%ymm2 + vpxor %ymm10,%ymm3,%ymm3 + vpxor %ymm13,%ymm2,%ymm2 + vpxor %ymm14,%ymm3,%ymm3 + + decq %rax + jnz L$select_loop_avx2_w7 + + + vmovdqa 0(%rsi),%ymm5 + vmovdqa 32(%rsi),%ymm6 + + vpcmpeqd %ymm1,%ymm4,%ymm7 + + vpand %ymm7,%ymm5,%ymm5 + vpand %ymm7,%ymm6,%ymm6 + + vpxor %ymm5,%ymm2,%ymm2 + vpxor %ymm6,%ymm3,%ymm3 + + vmovdqu %ymm2,0(%rdi) + vmovdqu %ymm3,32(%rdi) + vzeroupper .byte 0xf3,0xc3 .p2align 5 __ecp_nistz256_add_toq: + xorq %r11,%r11 addq 0(%rbx),%r12 adcq 8(%rbx),%r13 movq %r12,%rax adcq 16(%rbx),%r8 adcq 24(%rbx),%r9 movq %r13,%rbp - sbbq %r11,%r11 + adcq $0,%r11 subq $-1,%r12 movq %r8,%rcx @@ -778,14 +781,14 @@ __ecp_nistz256_add_toq: sbbq $0,%r8 movq %r9,%r10 sbbq %r15,%r9 - testq %r11,%r11 + sbbq $0,%r11 - cmovzq %rax,%r12 - cmovzq %rbp,%r13 + cmovcq %rax,%r12 + cmovcq %rbp,%r13 movq %r12,0(%rdi) - cmovzq %rcx,%r8 + cmovcq %rcx,%r8 movq %r13,8(%rdi) - cmovzq %r10,%r9 + cmovcq %r10,%r9 movq %r8,16(%rdi) movq %r9,24(%rdi) @@ -853,13 +856,14 @@ __ecp_nistz256_subq: .p2align 5 __ecp_nistz256_mul_by_2q: + xorq %r11,%r11 addq %r12,%r12 adcq %r13,%r13 movq %r12,%rax adcq %r8,%r8 adcq %r9,%r9 movq %r13,%rbp - sbbq %r11,%r11 + adcq $0,%r11 subq $-1,%r12 movq %r8,%rcx @@ -867,14 +871,14 @@ __ecp_nistz256_mul_by_2q: sbbq $0,%r8 movq %r9,%r10 sbbq %r15,%r9 - testq %r11,%r11 + sbbq $0,%r11 - cmovzq %rax,%r12 - cmovzq %rbp,%r13 + cmovcq %rax,%r12 + cmovcq %rbp,%r13 movq %r12,0(%rdi) - cmovzq %rcx,%r8 + cmovcq %rcx,%r8 movq %r13,8(%rdi) - cmovzq %r10,%r9 + cmovcq %r10,%r9 movq %r8,16(%rdi) movq %r9,24(%rdi) @@ -1106,16 +1110,14 @@ _ecp_nistz256_point_add: movq %rdx,%rsi movdqa %xmm0,384(%rsp) movdqa %xmm1,384+16(%rsp) - por %xmm0,%xmm1 movdqa %xmm2,416(%rsp) movdqa %xmm3,416+16(%rsp) - por %xmm2,%xmm3 movdqa %xmm4,448(%rsp) movdqa %xmm5,448+16(%rsp) - por %xmm1,%xmm3 + por %xmm4,%xmm5 movdqu 0(%rsi),%xmm0 - pshufd $0xb1,%xmm3,%xmm5 + pshufd $0xb1,%xmm5,%xmm3 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 por %xmm3,%xmm5 @@ -1127,14 +1129,14 @@ _ecp_nistz256_point_add: movdqa %xmm0,480(%rsp) pshufd $0x1e,%xmm5,%xmm4 movdqa %xmm1,480+16(%rsp) - por %xmm0,%xmm1 -.byte 102,72,15,110,199 + movdqu 64(%rsi),%xmm0 + movdqu 80(%rsi),%xmm1 movdqa %xmm2,512(%rsp) movdqa %xmm3,512+16(%rsp) - por %xmm2,%xmm3 por %xmm4,%xmm5 pxor %xmm4,%xmm4 - por %xmm1,%xmm3 + por %xmm0,%xmm1 +.byte 102,72,15,110,199 leaq 64-0(%rsi),%rsi movq %rax,544+0(%rsp) @@ -1145,8 +1147,8 @@ _ecp_nistz256_point_add: call __ecp_nistz256_sqr_montq pcmpeqd %xmm4,%xmm5 - pshufd $0xb1,%xmm3,%xmm4 - por %xmm3,%xmm4 + pshufd $0xb1,%xmm1,%xmm4 + por %xmm1,%xmm4 pshufd $0,%xmm5,%xmm5 pshufd $0x1e,%xmm4,%xmm3 por %xmm3,%xmm4 @@ -1329,6 +1331,7 @@ L$add_proceedq: + xorq %r11,%r11 addq %r12,%r12 leaq 96(%rsp),%rsi adcq %r13,%r13 @@ -1336,7 +1339,7 @@ L$add_proceedq: adcq %r8,%r8 adcq %r9,%r9 movq %r13,%rbp - sbbq %r11,%r11 + adcq $0,%r11 subq $-1,%r12 movq %r8,%rcx @@ -1344,15 +1347,15 @@ L$add_proceedq: sbbq $0,%r8 movq %r9,%r10 sbbq %r15,%r9 - testq %r11,%r11 + sbbq $0,%r11 - cmovzq %rax,%r12 + cmovcq %rax,%r12 movq 0(%rsi),%rax - cmovzq %rbp,%r13 + cmovcq %rbp,%r13 movq 8(%rsi),%rbp - 
cmovzq %rcx,%r8 + cmovcq %rcx,%r8 movq 16(%rsi),%rcx - cmovzq %r10,%r9 + cmovcq %r10,%r9 movq 24(%rsi),%r10 call __ecp_nistz256_subq @@ -1507,16 +1510,14 @@ _ecp_nistz256_point_add_affine: movq 64+24(%rsi),%r8 movdqa %xmm0,320(%rsp) movdqa %xmm1,320+16(%rsp) - por %xmm0,%xmm1 movdqa %xmm2,352(%rsp) movdqa %xmm3,352+16(%rsp) - por %xmm2,%xmm3 movdqa %xmm4,384(%rsp) movdqa %xmm5,384+16(%rsp) - por %xmm1,%xmm3 + por %xmm4,%xmm5 movdqu 0(%rbx),%xmm0 - pshufd $0xb1,%xmm3,%xmm5 + pshufd $0xb1,%xmm5,%xmm3 movdqu 16(%rbx),%xmm1 movdqu 32(%rbx),%xmm2 por %xmm3,%xmm5 @@ -1634,6 +1635,7 @@ _ecp_nistz256_point_add_affine: + xorq %r11,%r11 addq %r12,%r12 leaq 192(%rsp),%rsi adcq %r13,%r13 @@ -1641,7 +1643,7 @@ _ecp_nistz256_point_add_affine: adcq %r8,%r8 adcq %r9,%r9 movq %r13,%rbp - sbbq %r11,%r11 + adcq $0,%r11 subq $-1,%r12 movq %r8,%rcx @@ -1649,15 +1651,15 @@ _ecp_nistz256_point_add_affine: sbbq $0,%r8 movq %r9,%r10 sbbq %r15,%r9 - testq %r11,%r11 + sbbq $0,%r11 - cmovzq %rax,%r12 + cmovcq %rax,%r12 movq 0(%rsi),%rax - cmovzq %rbp,%r13 + cmovcq %rbp,%r13 movq 8(%rsi),%rbp - cmovzq %rcx,%r8 + cmovcq %rcx,%r8 movq 16(%rsi),%rcx - cmovzq %r10,%r9 + cmovcq %r10,%r9 movq 24(%rsi),%r10 call __ecp_nistz256_subq diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/rand/rdrand-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/rdrand-x86_64.S similarity index 91% rename from packager/third_party/boringssl/mac-x86_64/crypto/rand/rdrand-x86_64.S rename to packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/rdrand-x86_64.S index f0df296e1a..b259286f6e 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/rand/rdrand-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/rdrand-x86_64.S @@ -1,4 +1,4 @@ -#if defined(__x86_64__) +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) .text diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/rsaz-avx2.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/rsaz-avx2.S new file mode 100644 index 0000000000..6eb7afc510 --- /dev/null +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/rsaz-avx2.S @@ -0,0 +1,1742 @@ +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) +.text + +.globl _rsaz_1024_sqr_avx2 +.private_extern _rsaz_1024_sqr_avx2 + +.p2align 6 +_rsaz_1024_sqr_avx2: + + leaq (%rsp),%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + vzeroupper + movq %rax,%rbp + + movq %rdx,%r13 + subq $832,%rsp + movq %r13,%r15 + subq $-128,%rdi + subq $-128,%rsi + subq $-128,%r13 + + andq $4095,%r15 + addq $320,%r15 + shrq $12,%r15 + vpxor %ymm9,%ymm9,%ymm9 + jz L$sqr_1024_no_n_copy + + + + + + subq $320,%rsp + vmovdqu 0-128(%r13),%ymm0 + andq $-2048,%rsp + vmovdqu 32-128(%r13),%ymm1 + vmovdqu 64-128(%r13),%ymm2 + vmovdqu 96-128(%r13),%ymm3 + vmovdqu 128-128(%r13),%ymm4 + vmovdqu 160-128(%r13),%ymm5 + vmovdqu 192-128(%r13),%ymm6 + vmovdqu 224-128(%r13),%ymm7 + vmovdqu 256-128(%r13),%ymm8 + leaq 832+128(%rsp),%r13 + vmovdqu %ymm0,0-128(%r13) + vmovdqu %ymm1,32-128(%r13) + vmovdqu %ymm2,64-128(%r13) + vmovdqu %ymm3,96-128(%r13) + vmovdqu %ymm4,128-128(%r13) + vmovdqu %ymm5,160-128(%r13) + vmovdqu %ymm6,192-128(%r13) + vmovdqu %ymm7,224-128(%r13) + vmovdqu %ymm8,256-128(%r13) + vmovdqu %ymm9,288-128(%r13) + +L$sqr_1024_no_n_copy: + andq $-1024,%rsp + + vmovdqu 32-128(%rsi),%ymm1 + vmovdqu 64-128(%rsi),%ymm2 + vmovdqu 96-128(%rsi),%ymm3 + vmovdqu 128-128(%rsi),%ymm4 + vmovdqu 160-128(%rsi),%ymm5 + vmovdqu 192-128(%rsi),%ymm6 + 
vmovdqu 224-128(%rsi),%ymm7 + vmovdqu 256-128(%rsi),%ymm8 + + leaq 192(%rsp),%rbx + vpbroadcastq L$and_mask(%rip),%ymm15 + jmp L$OOP_GRANDE_SQR_1024 + +.p2align 5 +L$OOP_GRANDE_SQR_1024: + leaq 576+128(%rsp),%r9 + leaq 448(%rsp),%r12 + + + + + vpaddq %ymm1,%ymm1,%ymm1 + vpbroadcastq 0-128(%rsi),%ymm10 + vpaddq %ymm2,%ymm2,%ymm2 + vmovdqa %ymm1,0-128(%r9) + vpaddq %ymm3,%ymm3,%ymm3 + vmovdqa %ymm2,32-128(%r9) + vpaddq %ymm4,%ymm4,%ymm4 + vmovdqa %ymm3,64-128(%r9) + vpaddq %ymm5,%ymm5,%ymm5 + vmovdqa %ymm4,96-128(%r9) + vpaddq %ymm6,%ymm6,%ymm6 + vmovdqa %ymm5,128-128(%r9) + vpaddq %ymm7,%ymm7,%ymm7 + vmovdqa %ymm6,160-128(%r9) + vpaddq %ymm8,%ymm8,%ymm8 + vmovdqa %ymm7,192-128(%r9) + vpxor %ymm9,%ymm9,%ymm9 + vmovdqa %ymm8,224-128(%r9) + + vpmuludq 0-128(%rsi),%ymm10,%ymm0 + vpbroadcastq 32-128(%rsi),%ymm11 + vmovdqu %ymm9,288-192(%rbx) + vpmuludq %ymm10,%ymm1,%ymm1 + vmovdqu %ymm9,320-448(%r12) + vpmuludq %ymm10,%ymm2,%ymm2 + vmovdqu %ymm9,352-448(%r12) + vpmuludq %ymm10,%ymm3,%ymm3 + vmovdqu %ymm9,384-448(%r12) + vpmuludq %ymm10,%ymm4,%ymm4 + vmovdqu %ymm9,416-448(%r12) + vpmuludq %ymm10,%ymm5,%ymm5 + vmovdqu %ymm9,448-448(%r12) + vpmuludq %ymm10,%ymm6,%ymm6 + vmovdqu %ymm9,480-448(%r12) + vpmuludq %ymm10,%ymm7,%ymm7 + vmovdqu %ymm9,512-448(%r12) + vpmuludq %ymm10,%ymm8,%ymm8 + vpbroadcastq 64-128(%rsi),%ymm10 + vmovdqu %ymm9,544-448(%r12) + + movq %rsi,%r15 + movl $4,%r14d + jmp L$sqr_entry_1024 +.p2align 5 +L$OOP_SQR_1024: + vpbroadcastq 32-128(%r15),%ymm11 + vpmuludq 0-128(%rsi),%ymm10,%ymm0 + vpaddq 0-192(%rbx),%ymm0,%ymm0 + vpmuludq 0-128(%r9),%ymm10,%ymm1 + vpaddq 32-192(%rbx),%ymm1,%ymm1 + vpmuludq 32-128(%r9),%ymm10,%ymm2 + vpaddq 64-192(%rbx),%ymm2,%ymm2 + vpmuludq 64-128(%r9),%ymm10,%ymm3 + vpaddq 96-192(%rbx),%ymm3,%ymm3 + vpmuludq 96-128(%r9),%ymm10,%ymm4 + vpaddq 128-192(%rbx),%ymm4,%ymm4 + vpmuludq 128-128(%r9),%ymm10,%ymm5 + vpaddq 160-192(%rbx),%ymm5,%ymm5 + vpmuludq 160-128(%r9),%ymm10,%ymm6 + vpaddq 192-192(%rbx),%ymm6,%ymm6 + vpmuludq 192-128(%r9),%ymm10,%ymm7 + vpaddq 224-192(%rbx),%ymm7,%ymm7 + vpmuludq 224-128(%r9),%ymm10,%ymm8 + vpbroadcastq 64-128(%r15),%ymm10 + vpaddq 256-192(%rbx),%ymm8,%ymm8 +L$sqr_entry_1024: + vmovdqu %ymm0,0-192(%rbx) + vmovdqu %ymm1,32-192(%rbx) + + vpmuludq 32-128(%rsi),%ymm11,%ymm12 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq 32-128(%r9),%ymm11,%ymm14 + vpaddq %ymm14,%ymm3,%ymm3 + vpmuludq 64-128(%r9),%ymm11,%ymm13 + vpaddq %ymm13,%ymm4,%ymm4 + vpmuludq 96-128(%r9),%ymm11,%ymm12 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq 128-128(%r9),%ymm11,%ymm14 + vpaddq %ymm14,%ymm6,%ymm6 + vpmuludq 160-128(%r9),%ymm11,%ymm13 + vpaddq %ymm13,%ymm7,%ymm7 + vpmuludq 192-128(%r9),%ymm11,%ymm12 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq 224-128(%r9),%ymm11,%ymm0 + vpbroadcastq 96-128(%r15),%ymm11 + vpaddq 288-192(%rbx),%ymm0,%ymm0 + + vmovdqu %ymm2,64-192(%rbx) + vmovdqu %ymm3,96-192(%rbx) + + vpmuludq 64-128(%rsi),%ymm10,%ymm13 + vpaddq %ymm13,%ymm4,%ymm4 + vpmuludq 64-128(%r9),%ymm10,%ymm12 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq 96-128(%r9),%ymm10,%ymm14 + vpaddq %ymm14,%ymm6,%ymm6 + vpmuludq 128-128(%r9),%ymm10,%ymm13 + vpaddq %ymm13,%ymm7,%ymm7 + vpmuludq 160-128(%r9),%ymm10,%ymm12 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq 192-128(%r9),%ymm10,%ymm14 + vpaddq %ymm14,%ymm0,%ymm0 + vpmuludq 224-128(%r9),%ymm10,%ymm1 + vpbroadcastq 128-128(%r15),%ymm10 + vpaddq 320-448(%r12),%ymm1,%ymm1 + + vmovdqu %ymm4,128-192(%rbx) + vmovdqu %ymm5,160-192(%rbx) + + vpmuludq 96-128(%rsi),%ymm11,%ymm12 + vpaddq %ymm12,%ymm6,%ymm6 + vpmuludq 96-128(%r9),%ymm11,%ymm14 + vpaddq 
%ymm14,%ymm7,%ymm7 + vpmuludq 128-128(%r9),%ymm11,%ymm13 + vpaddq %ymm13,%ymm8,%ymm8 + vpmuludq 160-128(%r9),%ymm11,%ymm12 + vpaddq %ymm12,%ymm0,%ymm0 + vpmuludq 192-128(%r9),%ymm11,%ymm14 + vpaddq %ymm14,%ymm1,%ymm1 + vpmuludq 224-128(%r9),%ymm11,%ymm2 + vpbroadcastq 160-128(%r15),%ymm11 + vpaddq 352-448(%r12),%ymm2,%ymm2 + + vmovdqu %ymm6,192-192(%rbx) + vmovdqu %ymm7,224-192(%rbx) + + vpmuludq 128-128(%rsi),%ymm10,%ymm12 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq 128-128(%r9),%ymm10,%ymm14 + vpaddq %ymm14,%ymm0,%ymm0 + vpmuludq 160-128(%r9),%ymm10,%ymm13 + vpaddq %ymm13,%ymm1,%ymm1 + vpmuludq 192-128(%r9),%ymm10,%ymm12 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq 224-128(%r9),%ymm10,%ymm3 + vpbroadcastq 192-128(%r15),%ymm10 + vpaddq 384-448(%r12),%ymm3,%ymm3 + + vmovdqu %ymm8,256-192(%rbx) + vmovdqu %ymm0,288-192(%rbx) + leaq 8(%rbx),%rbx + + vpmuludq 160-128(%rsi),%ymm11,%ymm13 + vpaddq %ymm13,%ymm1,%ymm1 + vpmuludq 160-128(%r9),%ymm11,%ymm12 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq 192-128(%r9),%ymm11,%ymm14 + vpaddq %ymm14,%ymm3,%ymm3 + vpmuludq 224-128(%r9),%ymm11,%ymm4 + vpbroadcastq 224-128(%r15),%ymm11 + vpaddq 416-448(%r12),%ymm4,%ymm4 + + vmovdqu %ymm1,320-448(%r12) + vmovdqu %ymm2,352-448(%r12) + + vpmuludq 192-128(%rsi),%ymm10,%ymm12 + vpaddq %ymm12,%ymm3,%ymm3 + vpmuludq 192-128(%r9),%ymm10,%ymm14 + vpbroadcastq 256-128(%r15),%ymm0 + vpaddq %ymm14,%ymm4,%ymm4 + vpmuludq 224-128(%r9),%ymm10,%ymm5 + vpbroadcastq 0+8-128(%r15),%ymm10 + vpaddq 448-448(%r12),%ymm5,%ymm5 + + vmovdqu %ymm3,384-448(%r12) + vmovdqu %ymm4,416-448(%r12) + leaq 8(%r15),%r15 + + vpmuludq 224-128(%rsi),%ymm11,%ymm12 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq 224-128(%r9),%ymm11,%ymm6 + vpaddq 480-448(%r12),%ymm6,%ymm6 + + vpmuludq 256-128(%rsi),%ymm0,%ymm7 + vmovdqu %ymm5,448-448(%r12) + vpaddq 512-448(%r12),%ymm7,%ymm7 + vmovdqu %ymm6,480-448(%r12) + vmovdqu %ymm7,512-448(%r12) + leaq 8(%r12),%r12 + + decl %r14d + jnz L$OOP_SQR_1024 + + vmovdqu 256(%rsp),%ymm8 + vmovdqu 288(%rsp),%ymm1 + vmovdqu 320(%rsp),%ymm2 + leaq 192(%rsp),%rbx + + vpsrlq $29,%ymm8,%ymm14 + vpand %ymm15,%ymm8,%ymm8 + vpsrlq $29,%ymm1,%ymm11 + vpand %ymm15,%ymm1,%ymm1 + + vpermq $0x93,%ymm14,%ymm14 + vpxor %ymm9,%ymm9,%ymm9 + vpermq $0x93,%ymm11,%ymm11 + + vpblendd $3,%ymm9,%ymm14,%ymm10 + vpblendd $3,%ymm14,%ymm11,%ymm14 + vpaddq %ymm10,%ymm8,%ymm8 + vpblendd $3,%ymm11,%ymm9,%ymm11 + vpaddq %ymm14,%ymm1,%ymm1 + vpaddq %ymm11,%ymm2,%ymm2 + vmovdqu %ymm1,288-192(%rbx) + vmovdqu %ymm2,320-192(%rbx) + + movq (%rsp),%rax + movq 8(%rsp),%r10 + movq 16(%rsp),%r11 + movq 24(%rsp),%r12 + vmovdqu 32(%rsp),%ymm1 + vmovdqu 64-192(%rbx),%ymm2 + vmovdqu 96-192(%rbx),%ymm3 + vmovdqu 128-192(%rbx),%ymm4 + vmovdqu 160-192(%rbx),%ymm5 + vmovdqu 192-192(%rbx),%ymm6 + vmovdqu 224-192(%rbx),%ymm7 + + movq %rax,%r9 + imull %ecx,%eax + andl $0x1fffffff,%eax + vmovd %eax,%xmm12 + + movq %rax,%rdx + imulq -128(%r13),%rax + vpbroadcastq %xmm12,%ymm12 + addq %rax,%r9 + movq %rdx,%rax + imulq 8-128(%r13),%rax + shrq $29,%r9 + addq %rax,%r10 + movq %rdx,%rax + imulq 16-128(%r13),%rax + addq %r9,%r10 + addq %rax,%r11 + imulq 24-128(%r13),%rdx + addq %rdx,%r12 + + movq %r10,%rax + imull %ecx,%eax + andl $0x1fffffff,%eax + + movl $9,%r14d + jmp L$OOP_REDUCE_1024 + +.p2align 5 +L$OOP_REDUCE_1024: + vmovd %eax,%xmm13 + vpbroadcastq %xmm13,%ymm13 + + vpmuludq 32-128(%r13),%ymm12,%ymm10 + movq %rax,%rdx + imulq -128(%r13),%rax + vpaddq %ymm10,%ymm1,%ymm1 + addq %rax,%r10 + vpmuludq 64-128(%r13),%ymm12,%ymm14 + movq %rdx,%rax + imulq 8-128(%r13),%rax + vpaddq %ymm14,%ymm2,%ymm2 + 
vpmuludq 96-128(%r13),%ymm12,%ymm11 +.byte 0x67 + addq %rax,%r11 +.byte 0x67 + movq %rdx,%rax + imulq 16-128(%r13),%rax + shrq $29,%r10 + vpaddq %ymm11,%ymm3,%ymm3 + vpmuludq 128-128(%r13),%ymm12,%ymm10 + addq %rax,%r12 + addq %r10,%r11 + vpaddq %ymm10,%ymm4,%ymm4 + vpmuludq 160-128(%r13),%ymm12,%ymm14 + movq %r11,%rax + imull %ecx,%eax + vpaddq %ymm14,%ymm5,%ymm5 + vpmuludq 192-128(%r13),%ymm12,%ymm11 + andl $0x1fffffff,%eax + vpaddq %ymm11,%ymm6,%ymm6 + vpmuludq 224-128(%r13),%ymm12,%ymm10 + vpaddq %ymm10,%ymm7,%ymm7 + vpmuludq 256-128(%r13),%ymm12,%ymm14 + vmovd %eax,%xmm12 + + vpaddq %ymm14,%ymm8,%ymm8 + + vpbroadcastq %xmm12,%ymm12 + + vpmuludq 32-8-128(%r13),%ymm13,%ymm11 + vmovdqu 96-8-128(%r13),%ymm14 + movq %rax,%rdx + imulq -128(%r13),%rax + vpaddq %ymm11,%ymm1,%ymm1 + vpmuludq 64-8-128(%r13),%ymm13,%ymm10 + vmovdqu 128-8-128(%r13),%ymm11 + addq %rax,%r11 + movq %rdx,%rax + imulq 8-128(%r13),%rax + vpaddq %ymm10,%ymm2,%ymm2 + addq %r12,%rax + shrq $29,%r11 + vpmuludq %ymm13,%ymm14,%ymm14 + vmovdqu 160-8-128(%r13),%ymm10 + addq %r11,%rax + vpaddq %ymm14,%ymm3,%ymm3 + vpmuludq %ymm13,%ymm11,%ymm11 + vmovdqu 192-8-128(%r13),%ymm14 +.byte 0x67 + movq %rax,%r12 + imull %ecx,%eax + vpaddq %ymm11,%ymm4,%ymm4 + vpmuludq %ymm13,%ymm10,%ymm10 +.byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 + andl $0x1fffffff,%eax + vpaddq %ymm10,%ymm5,%ymm5 + vpmuludq %ymm13,%ymm14,%ymm14 + vmovdqu 256-8-128(%r13),%ymm10 + vpaddq %ymm14,%ymm6,%ymm6 + vpmuludq %ymm13,%ymm11,%ymm11 + vmovdqu 288-8-128(%r13),%ymm9 + vmovd %eax,%xmm0 + imulq -128(%r13),%rax + vpaddq %ymm11,%ymm7,%ymm7 + vpmuludq %ymm13,%ymm10,%ymm10 + vmovdqu 32-16-128(%r13),%ymm14 + vpbroadcastq %xmm0,%ymm0 + vpaddq %ymm10,%ymm8,%ymm8 + vpmuludq %ymm13,%ymm9,%ymm9 + vmovdqu 64-16-128(%r13),%ymm11 + addq %rax,%r12 + + vmovdqu 32-24-128(%r13),%ymm13 + vpmuludq %ymm12,%ymm14,%ymm14 + vmovdqu 96-16-128(%r13),%ymm10 + vpaddq %ymm14,%ymm1,%ymm1 + vpmuludq %ymm0,%ymm13,%ymm13 + vpmuludq %ymm12,%ymm11,%ymm11 +.byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff + vpaddq %ymm1,%ymm13,%ymm13 + vpaddq %ymm11,%ymm2,%ymm2 + vpmuludq %ymm12,%ymm10,%ymm10 + vmovdqu 160-16-128(%r13),%ymm11 +.byte 0x67 + vmovq %xmm13,%rax + vmovdqu %ymm13,(%rsp) + vpaddq %ymm10,%ymm3,%ymm3 + vpmuludq %ymm12,%ymm14,%ymm14 + vmovdqu 192-16-128(%r13),%ymm10 + vpaddq %ymm14,%ymm4,%ymm4 + vpmuludq %ymm12,%ymm11,%ymm11 + vmovdqu 224-16-128(%r13),%ymm14 + vpaddq %ymm11,%ymm5,%ymm5 + vpmuludq %ymm12,%ymm10,%ymm10 + vmovdqu 256-16-128(%r13),%ymm11 + vpaddq %ymm10,%ymm6,%ymm6 + vpmuludq %ymm12,%ymm14,%ymm14 + shrq $29,%r12 + vmovdqu 288-16-128(%r13),%ymm10 + addq %r12,%rax + vpaddq %ymm14,%ymm7,%ymm7 + vpmuludq %ymm12,%ymm11,%ymm11 + + movq %rax,%r9 + imull %ecx,%eax + vpaddq %ymm11,%ymm8,%ymm8 + vpmuludq %ymm12,%ymm10,%ymm10 + andl $0x1fffffff,%eax + vmovd %eax,%xmm12 + vmovdqu 96-24-128(%r13),%ymm11 +.byte 0x67 + vpaddq %ymm10,%ymm9,%ymm9 + vpbroadcastq %xmm12,%ymm12 + + vpmuludq 64-24-128(%r13),%ymm0,%ymm14 + vmovdqu 128-24-128(%r13),%ymm10 + movq %rax,%rdx + imulq -128(%r13),%rax + movq 8(%rsp),%r10 + vpaddq %ymm14,%ymm2,%ymm1 + vpmuludq %ymm0,%ymm11,%ymm11 + vmovdqu 160-24-128(%r13),%ymm14 + addq %rax,%r9 + movq %rdx,%rax + imulq 8-128(%r13),%rax +.byte 0x67 + shrq $29,%r9 + movq 16(%rsp),%r11 + vpaddq %ymm11,%ymm3,%ymm2 + vpmuludq %ymm0,%ymm10,%ymm10 + vmovdqu 192-24-128(%r13),%ymm11 + addq %rax,%r10 + movq %rdx,%rax + imulq 16-128(%r13),%rax + vpaddq %ymm10,%ymm4,%ymm3 + vpmuludq %ymm0,%ymm14,%ymm14 + vmovdqu 224-24-128(%r13),%ymm10 + imulq 24-128(%r13),%rdx + addq %rax,%r11 + 
leaq (%r9,%r10,1),%rax + vpaddq %ymm14,%ymm5,%ymm4 + vpmuludq %ymm0,%ymm11,%ymm11 + vmovdqu 256-24-128(%r13),%ymm14 + movq %rax,%r10 + imull %ecx,%eax + vpmuludq %ymm0,%ymm10,%ymm10 + vpaddq %ymm11,%ymm6,%ymm5 + vmovdqu 288-24-128(%r13),%ymm11 + andl $0x1fffffff,%eax + vpaddq %ymm10,%ymm7,%ymm6 + vpmuludq %ymm0,%ymm14,%ymm14 + addq 24(%rsp),%rdx + vpaddq %ymm14,%ymm8,%ymm7 + vpmuludq %ymm0,%ymm11,%ymm11 + vpaddq %ymm11,%ymm9,%ymm8 + vmovq %r12,%xmm9 + movq %rdx,%r12 + + decl %r14d + jnz L$OOP_REDUCE_1024 + leaq 448(%rsp),%r12 + vpaddq %ymm9,%ymm13,%ymm0 + vpxor %ymm9,%ymm9,%ymm9 + + vpaddq 288-192(%rbx),%ymm0,%ymm0 + vpaddq 320-448(%r12),%ymm1,%ymm1 + vpaddq 352-448(%r12),%ymm2,%ymm2 + vpaddq 384-448(%r12),%ymm3,%ymm3 + vpaddq 416-448(%r12),%ymm4,%ymm4 + vpaddq 448-448(%r12),%ymm5,%ymm5 + vpaddq 480-448(%r12),%ymm6,%ymm6 + vpaddq 512-448(%r12),%ymm7,%ymm7 + vpaddq 544-448(%r12),%ymm8,%ymm8 + + vpsrlq $29,%ymm0,%ymm14 + vpand %ymm15,%ymm0,%ymm0 + vpsrlq $29,%ymm1,%ymm11 + vpand %ymm15,%ymm1,%ymm1 + vpsrlq $29,%ymm2,%ymm12 + vpermq $0x93,%ymm14,%ymm14 + vpand %ymm15,%ymm2,%ymm2 + vpsrlq $29,%ymm3,%ymm13 + vpermq $0x93,%ymm11,%ymm11 + vpand %ymm15,%ymm3,%ymm3 + vpermq $0x93,%ymm12,%ymm12 + + vpblendd $3,%ymm9,%ymm14,%ymm10 + vpermq $0x93,%ymm13,%ymm13 + vpblendd $3,%ymm14,%ymm11,%ymm14 + vpaddq %ymm10,%ymm0,%ymm0 + vpblendd $3,%ymm11,%ymm12,%ymm11 + vpaddq %ymm14,%ymm1,%ymm1 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm11,%ymm2,%ymm2 + vpblendd $3,%ymm13,%ymm9,%ymm13 + vpaddq %ymm12,%ymm3,%ymm3 + vpaddq %ymm13,%ymm4,%ymm4 + + vpsrlq $29,%ymm0,%ymm14 + vpand %ymm15,%ymm0,%ymm0 + vpsrlq $29,%ymm1,%ymm11 + vpand %ymm15,%ymm1,%ymm1 + vpsrlq $29,%ymm2,%ymm12 + vpermq $0x93,%ymm14,%ymm14 + vpand %ymm15,%ymm2,%ymm2 + vpsrlq $29,%ymm3,%ymm13 + vpermq $0x93,%ymm11,%ymm11 + vpand %ymm15,%ymm3,%ymm3 + vpermq $0x93,%ymm12,%ymm12 + + vpblendd $3,%ymm9,%ymm14,%ymm10 + vpermq $0x93,%ymm13,%ymm13 + vpblendd $3,%ymm14,%ymm11,%ymm14 + vpaddq %ymm10,%ymm0,%ymm0 + vpblendd $3,%ymm11,%ymm12,%ymm11 + vpaddq %ymm14,%ymm1,%ymm1 + vmovdqu %ymm0,0-128(%rdi) + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm11,%ymm2,%ymm2 + vmovdqu %ymm1,32-128(%rdi) + vpblendd $3,%ymm13,%ymm9,%ymm13 + vpaddq %ymm12,%ymm3,%ymm3 + vmovdqu %ymm2,64-128(%rdi) + vpaddq %ymm13,%ymm4,%ymm4 + vmovdqu %ymm3,96-128(%rdi) + vpsrlq $29,%ymm4,%ymm14 + vpand %ymm15,%ymm4,%ymm4 + vpsrlq $29,%ymm5,%ymm11 + vpand %ymm15,%ymm5,%ymm5 + vpsrlq $29,%ymm6,%ymm12 + vpermq $0x93,%ymm14,%ymm14 + vpand %ymm15,%ymm6,%ymm6 + vpsrlq $29,%ymm7,%ymm13 + vpermq $0x93,%ymm11,%ymm11 + vpand %ymm15,%ymm7,%ymm7 + vpsrlq $29,%ymm8,%ymm0 + vpermq $0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm8,%ymm8 + vpermq $0x93,%ymm13,%ymm13 + + vpblendd $3,%ymm9,%ymm14,%ymm10 + vpermq $0x93,%ymm0,%ymm0 + vpblendd $3,%ymm14,%ymm11,%ymm14 + vpaddq %ymm10,%ymm4,%ymm4 + vpblendd $3,%ymm11,%ymm12,%ymm11 + vpaddq %ymm14,%ymm5,%ymm5 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm11,%ymm6,%ymm6 + vpblendd $3,%ymm13,%ymm0,%ymm13 + vpaddq %ymm12,%ymm7,%ymm7 + vpaddq %ymm13,%ymm8,%ymm8 + + vpsrlq $29,%ymm4,%ymm14 + vpand %ymm15,%ymm4,%ymm4 + vpsrlq $29,%ymm5,%ymm11 + vpand %ymm15,%ymm5,%ymm5 + vpsrlq $29,%ymm6,%ymm12 + vpermq $0x93,%ymm14,%ymm14 + vpand %ymm15,%ymm6,%ymm6 + vpsrlq $29,%ymm7,%ymm13 + vpermq $0x93,%ymm11,%ymm11 + vpand %ymm15,%ymm7,%ymm7 + vpsrlq $29,%ymm8,%ymm0 + vpermq $0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm8,%ymm8 + vpermq $0x93,%ymm13,%ymm13 + + vpblendd $3,%ymm9,%ymm14,%ymm10 + vpermq $0x93,%ymm0,%ymm0 + vpblendd $3,%ymm14,%ymm11,%ymm14 + vpaddq %ymm10,%ymm4,%ymm4 + vpblendd 
$3,%ymm11,%ymm12,%ymm11 + vpaddq %ymm14,%ymm5,%ymm5 + vmovdqu %ymm4,128-128(%rdi) + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm11,%ymm6,%ymm6 + vmovdqu %ymm5,160-128(%rdi) + vpblendd $3,%ymm13,%ymm0,%ymm13 + vpaddq %ymm12,%ymm7,%ymm7 + vmovdqu %ymm6,192-128(%rdi) + vpaddq %ymm13,%ymm8,%ymm8 + vmovdqu %ymm7,224-128(%rdi) + vmovdqu %ymm8,256-128(%rdi) + + movq %rdi,%rsi + decl %r8d + jne L$OOP_GRANDE_SQR_1024 + + vzeroall + movq %rbp,%rax + + movq -48(%rax),%r15 + + movq -40(%rax),%r14 + + movq -32(%rax),%r13 + + movq -24(%rax),%r12 + + movq -16(%rax),%rbp + + movq -8(%rax),%rbx + + leaq (%rax),%rsp + +L$sqr_1024_epilogue: + .byte 0xf3,0xc3 + + +.globl _rsaz_1024_mul_avx2 +.private_extern _rsaz_1024_mul_avx2 + +.p2align 6 +_rsaz_1024_mul_avx2: + + leaq (%rsp),%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + movq %rax,%rbp + + vzeroall + movq %rdx,%r13 + subq $64,%rsp + + + + + + +.byte 0x67,0x67 + movq %rsi,%r15 + andq $4095,%r15 + addq $320,%r15 + shrq $12,%r15 + movq %rsi,%r15 + cmovnzq %r13,%rsi + cmovnzq %r15,%r13 + + movq %rcx,%r15 + subq $-128,%rsi + subq $-128,%rcx + subq $-128,%rdi + + andq $4095,%r15 + addq $320,%r15 +.byte 0x67,0x67 + shrq $12,%r15 + jz L$mul_1024_no_n_copy + + + + + + subq $320,%rsp + vmovdqu 0-128(%rcx),%ymm0 + andq $-512,%rsp + vmovdqu 32-128(%rcx),%ymm1 + vmovdqu 64-128(%rcx),%ymm2 + vmovdqu 96-128(%rcx),%ymm3 + vmovdqu 128-128(%rcx),%ymm4 + vmovdqu 160-128(%rcx),%ymm5 + vmovdqu 192-128(%rcx),%ymm6 + vmovdqu 224-128(%rcx),%ymm7 + vmovdqu 256-128(%rcx),%ymm8 + leaq 64+128(%rsp),%rcx + vmovdqu %ymm0,0-128(%rcx) + vpxor %ymm0,%ymm0,%ymm0 + vmovdqu %ymm1,32-128(%rcx) + vpxor %ymm1,%ymm1,%ymm1 + vmovdqu %ymm2,64-128(%rcx) + vpxor %ymm2,%ymm2,%ymm2 + vmovdqu %ymm3,96-128(%rcx) + vpxor %ymm3,%ymm3,%ymm3 + vmovdqu %ymm4,128-128(%rcx) + vpxor %ymm4,%ymm4,%ymm4 + vmovdqu %ymm5,160-128(%rcx) + vpxor %ymm5,%ymm5,%ymm5 + vmovdqu %ymm6,192-128(%rcx) + vpxor %ymm6,%ymm6,%ymm6 + vmovdqu %ymm7,224-128(%rcx) + vpxor %ymm7,%ymm7,%ymm7 + vmovdqu %ymm8,256-128(%rcx) + vmovdqa %ymm0,%ymm8 + vmovdqu %ymm9,288-128(%rcx) +L$mul_1024_no_n_copy: + andq $-64,%rsp + + movq (%r13),%rbx + vpbroadcastq (%r13),%ymm10 + vmovdqu %ymm0,(%rsp) + xorq %r9,%r9 +.byte 0x67 + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r12,%r12 + + vmovdqu L$and_mask(%rip),%ymm15 + movl $9,%r14d + vmovdqu %ymm9,288-128(%rdi) + jmp L$oop_mul_1024 + +.p2align 5 +L$oop_mul_1024: + vpsrlq $29,%ymm3,%ymm9 + movq %rbx,%rax + imulq -128(%rsi),%rax + addq %r9,%rax + movq %rbx,%r10 + imulq 8-128(%rsi),%r10 + addq 8(%rsp),%r10 + + movq %rax,%r9 + imull %r8d,%eax + andl $0x1fffffff,%eax + + movq %rbx,%r11 + imulq 16-128(%rsi),%r11 + addq 16(%rsp),%r11 + + movq %rbx,%r12 + imulq 24-128(%rsi),%r12 + addq 24(%rsp),%r12 + vpmuludq 32-128(%rsi),%ymm10,%ymm0 + vmovd %eax,%xmm11 + vpaddq %ymm0,%ymm1,%ymm1 + vpmuludq 64-128(%rsi),%ymm10,%ymm12 + vpbroadcastq %xmm11,%ymm11 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq 96-128(%rsi),%ymm10,%ymm13 + vpand %ymm15,%ymm3,%ymm3 + vpaddq %ymm13,%ymm3,%ymm3 + vpmuludq 128-128(%rsi),%ymm10,%ymm0 + vpaddq %ymm0,%ymm4,%ymm4 + vpmuludq 160-128(%rsi),%ymm10,%ymm12 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq 192-128(%rsi),%ymm10,%ymm13 + vpaddq %ymm13,%ymm6,%ymm6 + vpmuludq 224-128(%rsi),%ymm10,%ymm0 + vpermq $0x93,%ymm9,%ymm9 + vpaddq %ymm0,%ymm7,%ymm7 + vpmuludq 256-128(%rsi),%ymm10,%ymm12 + vpbroadcastq 8(%r13),%ymm10 + vpaddq %ymm12,%ymm8,%ymm8 + + movq %rax,%rdx + imulq -128(%rcx),%rax + addq %rax,%r9 + movq %rdx,%rax + imulq 8-128(%rcx),%rax + addq %rax,%r10 + 
movq %rdx,%rax + imulq 16-128(%rcx),%rax + addq %rax,%r11 + shrq $29,%r9 + imulq 24-128(%rcx),%rdx + addq %rdx,%r12 + addq %r9,%r10 + + vpmuludq 32-128(%rcx),%ymm11,%ymm13 + vmovq %xmm10,%rbx + vpaddq %ymm13,%ymm1,%ymm1 + vpmuludq 64-128(%rcx),%ymm11,%ymm0 + vpaddq %ymm0,%ymm2,%ymm2 + vpmuludq 96-128(%rcx),%ymm11,%ymm12 + vpaddq %ymm12,%ymm3,%ymm3 + vpmuludq 128-128(%rcx),%ymm11,%ymm13 + vpaddq %ymm13,%ymm4,%ymm4 + vpmuludq 160-128(%rcx),%ymm11,%ymm0 + vpaddq %ymm0,%ymm5,%ymm5 + vpmuludq 192-128(%rcx),%ymm11,%ymm12 + vpaddq %ymm12,%ymm6,%ymm6 + vpmuludq 224-128(%rcx),%ymm11,%ymm13 + vpblendd $3,%ymm14,%ymm9,%ymm9 + vpaddq %ymm13,%ymm7,%ymm7 + vpmuludq 256-128(%rcx),%ymm11,%ymm0 + vpaddq %ymm9,%ymm3,%ymm3 + vpaddq %ymm0,%ymm8,%ymm8 + + movq %rbx,%rax + imulq -128(%rsi),%rax + addq %rax,%r10 + vmovdqu -8+32-128(%rsi),%ymm12 + movq %rbx,%rax + imulq 8-128(%rsi),%rax + addq %rax,%r11 + vmovdqu -8+64-128(%rsi),%ymm13 + + movq %r10,%rax + imull %r8d,%eax + andl $0x1fffffff,%eax + + imulq 16-128(%rsi),%rbx + addq %rbx,%r12 + vpmuludq %ymm10,%ymm12,%ymm12 + vmovd %eax,%xmm11 + vmovdqu -8+96-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm1,%ymm1 + vpmuludq %ymm10,%ymm13,%ymm13 + vpbroadcastq %xmm11,%ymm11 + vmovdqu -8+128-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm2,%ymm2 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -8+160-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm3,%ymm3 + vpmuludq %ymm10,%ymm12,%ymm12 + vmovdqu -8+192-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm4,%ymm4 + vpmuludq %ymm10,%ymm13,%ymm13 + vmovdqu -8+224-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm5,%ymm5 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -8+256-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm6,%ymm6 + vpmuludq %ymm10,%ymm12,%ymm12 + vmovdqu -8+288-128(%rsi),%ymm9 + vpaddq %ymm12,%ymm7,%ymm7 + vpmuludq %ymm10,%ymm13,%ymm13 + vpaddq %ymm13,%ymm8,%ymm8 + vpmuludq %ymm10,%ymm9,%ymm9 + vpbroadcastq 16(%r13),%ymm10 + + movq %rax,%rdx + imulq -128(%rcx),%rax + addq %rax,%r10 + vmovdqu -8+32-128(%rcx),%ymm0 + movq %rdx,%rax + imulq 8-128(%rcx),%rax + addq %rax,%r11 + vmovdqu -8+64-128(%rcx),%ymm12 + shrq $29,%r10 + imulq 16-128(%rcx),%rdx + addq %rdx,%r12 + addq %r10,%r11 + + vpmuludq %ymm11,%ymm0,%ymm0 + vmovq %xmm10,%rbx + vmovdqu -8+96-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm1,%ymm1 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -8+128-128(%rcx),%ymm0 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -8+160-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm3,%ymm3 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -8+192-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm4,%ymm4 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -8+224-128(%rcx),%ymm0 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -8+256-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm6,%ymm6 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -8+288-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm7,%ymm7 + vpmuludq %ymm11,%ymm12,%ymm12 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq %ymm11,%ymm13,%ymm13 + vpaddq %ymm13,%ymm9,%ymm9 + + vmovdqu -16+32-128(%rsi),%ymm0 + movq %rbx,%rax + imulq -128(%rsi),%rax + addq %r11,%rax + + vmovdqu -16+64-128(%rsi),%ymm12 + movq %rax,%r11 + imull %r8d,%eax + andl $0x1fffffff,%eax + + imulq 8-128(%rsi),%rbx + addq %rbx,%r12 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovd %eax,%xmm11 + vmovdqu -16+96-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm1,%ymm1 + vpmuludq %ymm10,%ymm12,%ymm12 + vpbroadcastq %xmm11,%ymm11 + vmovdqu -16+128-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq %ymm10,%ymm13,%ymm13 + vmovdqu -16+160-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm3,%ymm3 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -16+192-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm4,%ymm4 + 
vpmuludq %ymm10,%ymm12,%ymm12 + vmovdqu -16+224-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq %ymm10,%ymm13,%ymm13 + vmovdqu -16+256-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm6,%ymm6 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -16+288-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm7,%ymm7 + vpmuludq %ymm10,%ymm12,%ymm12 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq %ymm10,%ymm13,%ymm13 + vpbroadcastq 24(%r13),%ymm10 + vpaddq %ymm13,%ymm9,%ymm9 + + vmovdqu -16+32-128(%rcx),%ymm0 + movq %rax,%rdx + imulq -128(%rcx),%rax + addq %rax,%r11 + vmovdqu -16+64-128(%rcx),%ymm12 + imulq 8-128(%rcx),%rdx + addq %rdx,%r12 + shrq $29,%r11 + + vpmuludq %ymm11,%ymm0,%ymm0 + vmovq %xmm10,%rbx + vmovdqu -16+96-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm1,%ymm1 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -16+128-128(%rcx),%ymm0 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -16+160-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm3,%ymm3 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -16+192-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm4,%ymm4 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -16+224-128(%rcx),%ymm0 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -16+256-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm6,%ymm6 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -16+288-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm7,%ymm7 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -24+32-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -24+64-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm9,%ymm9 + + addq %r11,%r12 + imulq -128(%rsi),%rbx + addq %rbx,%r12 + + movq %r12,%rax + imull %r8d,%eax + andl $0x1fffffff,%eax + + vpmuludq %ymm10,%ymm0,%ymm0 + vmovd %eax,%xmm11 + vmovdqu -24+96-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm1,%ymm1 + vpmuludq %ymm10,%ymm12,%ymm12 + vpbroadcastq %xmm11,%ymm11 + vmovdqu -24+128-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq %ymm10,%ymm13,%ymm13 + vmovdqu -24+160-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm3,%ymm3 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -24+192-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm4,%ymm4 + vpmuludq %ymm10,%ymm12,%ymm12 + vmovdqu -24+224-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq %ymm10,%ymm13,%ymm13 + vmovdqu -24+256-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm6,%ymm6 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -24+288-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm7,%ymm7 + vpmuludq %ymm10,%ymm12,%ymm12 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq %ymm10,%ymm13,%ymm13 + vpbroadcastq 32(%r13),%ymm10 + vpaddq %ymm13,%ymm9,%ymm9 + addq $32,%r13 + + vmovdqu -24+32-128(%rcx),%ymm0 + imulq -128(%rcx),%rax + addq %rax,%r12 + shrq $29,%r12 + + vmovdqu -24+64-128(%rcx),%ymm12 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovq %xmm10,%rbx + vmovdqu -24+96-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm1,%ymm0 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu %ymm0,(%rsp) + vpaddq %ymm12,%ymm2,%ymm1 + vmovdqu -24+128-128(%rcx),%ymm0 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -24+160-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm3,%ymm2 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -24+192-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm4,%ymm3 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -24+224-128(%rcx),%ymm0 + vpaddq %ymm12,%ymm5,%ymm4 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -24+256-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm6,%ymm5 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -24+288-128(%rcx),%ymm13 + movq %r12,%r9 + vpaddq %ymm0,%ymm7,%ymm6 + vpmuludq %ymm11,%ymm12,%ymm12 + addq (%rsp),%r9 + vpaddq %ymm12,%ymm8,%ymm7 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovq %r12,%xmm12 + vpaddq %ymm13,%ymm9,%ymm8 + + decl %r14d + jnz L$oop_mul_1024 + vpermq $0,%ymm15,%ymm15 + vpaddq 
(%rsp),%ymm12,%ymm0 + + vpsrlq $29,%ymm0,%ymm12 + vpand %ymm15,%ymm0,%ymm0 + vpsrlq $29,%ymm1,%ymm13 + vpand %ymm15,%ymm1,%ymm1 + vpsrlq $29,%ymm2,%ymm10 + vpermq $0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm2,%ymm2 + vpsrlq $29,%ymm3,%ymm11 + vpermq $0x93,%ymm13,%ymm13 + vpand %ymm15,%ymm3,%ymm3 + + vpblendd $3,%ymm14,%ymm12,%ymm9 + vpermq $0x93,%ymm10,%ymm10 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpermq $0x93,%ymm11,%ymm11 + vpaddq %ymm9,%ymm0,%ymm0 + vpblendd $3,%ymm13,%ymm10,%ymm13 + vpaddq %ymm12,%ymm1,%ymm1 + vpblendd $3,%ymm10,%ymm11,%ymm10 + vpaddq %ymm13,%ymm2,%ymm2 + vpblendd $3,%ymm11,%ymm14,%ymm11 + vpaddq %ymm10,%ymm3,%ymm3 + vpaddq %ymm11,%ymm4,%ymm4 + + vpsrlq $29,%ymm0,%ymm12 + vpand %ymm15,%ymm0,%ymm0 + vpsrlq $29,%ymm1,%ymm13 + vpand %ymm15,%ymm1,%ymm1 + vpsrlq $29,%ymm2,%ymm10 + vpermq $0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm2,%ymm2 + vpsrlq $29,%ymm3,%ymm11 + vpermq $0x93,%ymm13,%ymm13 + vpand %ymm15,%ymm3,%ymm3 + vpermq $0x93,%ymm10,%ymm10 + + vpblendd $3,%ymm14,%ymm12,%ymm9 + vpermq $0x93,%ymm11,%ymm11 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm9,%ymm0,%ymm0 + vpblendd $3,%ymm13,%ymm10,%ymm13 + vpaddq %ymm12,%ymm1,%ymm1 + vpblendd $3,%ymm10,%ymm11,%ymm10 + vpaddq %ymm13,%ymm2,%ymm2 + vpblendd $3,%ymm11,%ymm14,%ymm11 + vpaddq %ymm10,%ymm3,%ymm3 + vpaddq %ymm11,%ymm4,%ymm4 + + vmovdqu %ymm0,0-128(%rdi) + vmovdqu %ymm1,32-128(%rdi) + vmovdqu %ymm2,64-128(%rdi) + vmovdqu %ymm3,96-128(%rdi) + vpsrlq $29,%ymm4,%ymm12 + vpand %ymm15,%ymm4,%ymm4 + vpsrlq $29,%ymm5,%ymm13 + vpand %ymm15,%ymm5,%ymm5 + vpsrlq $29,%ymm6,%ymm10 + vpermq $0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm6,%ymm6 + vpsrlq $29,%ymm7,%ymm11 + vpermq $0x93,%ymm13,%ymm13 + vpand %ymm15,%ymm7,%ymm7 + vpsrlq $29,%ymm8,%ymm0 + vpermq $0x93,%ymm10,%ymm10 + vpand %ymm15,%ymm8,%ymm8 + vpermq $0x93,%ymm11,%ymm11 + + vpblendd $3,%ymm14,%ymm12,%ymm9 + vpermq $0x93,%ymm0,%ymm0 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm9,%ymm4,%ymm4 + vpblendd $3,%ymm13,%ymm10,%ymm13 + vpaddq %ymm12,%ymm5,%ymm5 + vpblendd $3,%ymm10,%ymm11,%ymm10 + vpaddq %ymm13,%ymm6,%ymm6 + vpblendd $3,%ymm11,%ymm0,%ymm11 + vpaddq %ymm10,%ymm7,%ymm7 + vpaddq %ymm11,%ymm8,%ymm8 + + vpsrlq $29,%ymm4,%ymm12 + vpand %ymm15,%ymm4,%ymm4 + vpsrlq $29,%ymm5,%ymm13 + vpand %ymm15,%ymm5,%ymm5 + vpsrlq $29,%ymm6,%ymm10 + vpermq $0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm6,%ymm6 + vpsrlq $29,%ymm7,%ymm11 + vpermq $0x93,%ymm13,%ymm13 + vpand %ymm15,%ymm7,%ymm7 + vpsrlq $29,%ymm8,%ymm0 + vpermq $0x93,%ymm10,%ymm10 + vpand %ymm15,%ymm8,%ymm8 + vpermq $0x93,%ymm11,%ymm11 + + vpblendd $3,%ymm14,%ymm12,%ymm9 + vpermq $0x93,%ymm0,%ymm0 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm9,%ymm4,%ymm4 + vpblendd $3,%ymm13,%ymm10,%ymm13 + vpaddq %ymm12,%ymm5,%ymm5 + vpblendd $3,%ymm10,%ymm11,%ymm10 + vpaddq %ymm13,%ymm6,%ymm6 + vpblendd $3,%ymm11,%ymm0,%ymm11 + vpaddq %ymm10,%ymm7,%ymm7 + vpaddq %ymm11,%ymm8,%ymm8 + + vmovdqu %ymm4,128-128(%rdi) + vmovdqu %ymm5,160-128(%rdi) + vmovdqu %ymm6,192-128(%rdi) + vmovdqu %ymm7,224-128(%rdi) + vmovdqu %ymm8,256-128(%rdi) + vzeroupper + + movq %rbp,%rax + + movq -48(%rax),%r15 + + movq -40(%rax),%r14 + + movq -32(%rax),%r13 + + movq -24(%rax),%r12 + + movq -16(%rax),%rbp + + movq -8(%rax),%rbx + + leaq (%rax),%rsp + +L$mul_1024_epilogue: + .byte 0xf3,0xc3 + + +.globl _rsaz_1024_red2norm_avx2 +.private_extern _rsaz_1024_red2norm_avx2 + +.p2align 5 +_rsaz_1024_red2norm_avx2: + subq $-128,%rsi + xorq %rax,%rax + movq -128(%rsi),%r8 + movq -120(%rsi),%r9 + movq -112(%rsi),%r10 + shlq $0,%r8 + shlq $29,%r9 + movq %r10,%r11 + shlq $58,%r10 + shrq 
$6,%r11 + addq %r8,%rax + addq %r9,%rax + addq %r10,%rax + adcq $0,%r11 + movq %rax,0(%rdi) + movq %r11,%rax + movq -104(%rsi),%r8 + movq -96(%rsi),%r9 + shlq $23,%r8 + movq %r9,%r10 + shlq $52,%r9 + shrq $12,%r10 + addq %r8,%rax + addq %r9,%rax + adcq $0,%r10 + movq %rax,8(%rdi) + movq %r10,%rax + movq -88(%rsi),%r11 + movq -80(%rsi),%r8 + shlq $17,%r11 + movq %r8,%r9 + shlq $46,%r8 + shrq $18,%r9 + addq %r11,%rax + addq %r8,%rax + adcq $0,%r9 + movq %rax,16(%rdi) + movq %r9,%rax + movq -72(%rsi),%r10 + movq -64(%rsi),%r11 + shlq $11,%r10 + movq %r11,%r8 + shlq $40,%r11 + shrq $24,%r8 + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,24(%rdi) + movq %r8,%rax + movq -56(%rsi),%r9 + movq -48(%rsi),%r10 + movq -40(%rsi),%r11 + shlq $5,%r9 + shlq $34,%r10 + movq %r11,%r8 + shlq $63,%r11 + shrq $1,%r8 + addq %r9,%rax + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,32(%rdi) + movq %r8,%rax + movq -32(%rsi),%r9 + movq -24(%rsi),%r10 + shlq $28,%r9 + movq %r10,%r11 + shlq $57,%r10 + shrq $7,%r11 + addq %r9,%rax + addq %r10,%rax + adcq $0,%r11 + movq %rax,40(%rdi) + movq %r11,%rax + movq -16(%rsi),%r8 + movq -8(%rsi),%r9 + shlq $22,%r8 + movq %r9,%r10 + shlq $51,%r9 + shrq $13,%r10 + addq %r8,%rax + addq %r9,%rax + adcq $0,%r10 + movq %rax,48(%rdi) + movq %r10,%rax + movq 0(%rsi),%r11 + movq 8(%rsi),%r8 + shlq $16,%r11 + movq %r8,%r9 + shlq $45,%r8 + shrq $19,%r9 + addq %r11,%rax + addq %r8,%rax + adcq $0,%r9 + movq %rax,56(%rdi) + movq %r9,%rax + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + shlq $10,%r10 + movq %r11,%r8 + shlq $39,%r11 + shrq $25,%r8 + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,64(%rdi) + movq %r8,%rax + movq 32(%rsi),%r9 + movq 40(%rsi),%r10 + movq 48(%rsi),%r11 + shlq $4,%r9 + shlq $33,%r10 + movq %r11,%r8 + shlq $62,%r11 + shrq $2,%r8 + addq %r9,%rax + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,72(%rdi) + movq %r8,%rax + movq 56(%rsi),%r9 + movq 64(%rsi),%r10 + shlq $27,%r9 + movq %r10,%r11 + shlq $56,%r10 + shrq $8,%r11 + addq %r9,%rax + addq %r10,%rax + adcq $0,%r11 + movq %rax,80(%rdi) + movq %r11,%rax + movq 72(%rsi),%r8 + movq 80(%rsi),%r9 + shlq $21,%r8 + movq %r9,%r10 + shlq $50,%r9 + shrq $14,%r10 + addq %r8,%rax + addq %r9,%rax + adcq $0,%r10 + movq %rax,88(%rdi) + movq %r10,%rax + movq 88(%rsi),%r11 + movq 96(%rsi),%r8 + shlq $15,%r11 + movq %r8,%r9 + shlq $44,%r8 + shrq $20,%r9 + addq %r11,%rax + addq %r8,%rax + adcq $0,%r9 + movq %rax,96(%rdi) + movq %r9,%rax + movq 104(%rsi),%r10 + movq 112(%rsi),%r11 + shlq $9,%r10 + movq %r11,%r8 + shlq $38,%r11 + shrq $26,%r8 + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,104(%rdi) + movq %r8,%rax + movq 120(%rsi),%r9 + movq 128(%rsi),%r10 + movq 136(%rsi),%r11 + shlq $3,%r9 + shlq $32,%r10 + movq %r11,%r8 + shlq $61,%r11 + shrq $3,%r8 + addq %r9,%rax + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,112(%rdi) + movq %r8,%rax + movq 144(%rsi),%r9 + movq 152(%rsi),%r10 + shlq $26,%r9 + movq %r10,%r11 + shlq $55,%r10 + shrq $9,%r11 + addq %r9,%rax + addq %r10,%rax + adcq $0,%r11 + movq %rax,120(%rdi) + movq %r11,%rax + .byte 0xf3,0xc3 + + +.globl _rsaz_1024_norm2red_avx2 +.private_extern _rsaz_1024_norm2red_avx2 + +.p2align 5 +_rsaz_1024_norm2red_avx2: + subq $-128,%rdi + movq (%rsi),%r8 + movl $0x1fffffff,%eax + movq 8(%rsi),%r9 + movq %r8,%r11 + shrq $0,%r11 + andq %rax,%r11 + movq %r11,-128(%rdi) + movq %r8,%r10 + shrq $29,%r10 + andq %rax,%r10 + movq %r10,-120(%rdi) + shrdq $58,%r9,%r8 + andq %rax,%r8 + movq %r8,-112(%rdi) + movq 16(%rsi),%r10 + movq 
%r9,%r8 + shrq $23,%r8 + andq %rax,%r8 + movq %r8,-104(%rdi) + shrdq $52,%r10,%r9 + andq %rax,%r9 + movq %r9,-96(%rdi) + movq 24(%rsi),%r11 + movq %r10,%r9 + shrq $17,%r9 + andq %rax,%r9 + movq %r9,-88(%rdi) + shrdq $46,%r11,%r10 + andq %rax,%r10 + movq %r10,-80(%rdi) + movq 32(%rsi),%r8 + movq %r11,%r10 + shrq $11,%r10 + andq %rax,%r10 + movq %r10,-72(%rdi) + shrdq $40,%r8,%r11 + andq %rax,%r11 + movq %r11,-64(%rdi) + movq 40(%rsi),%r9 + movq %r8,%r11 + shrq $5,%r11 + andq %rax,%r11 + movq %r11,-56(%rdi) + movq %r8,%r10 + shrq $34,%r10 + andq %rax,%r10 + movq %r10,-48(%rdi) + shrdq $63,%r9,%r8 + andq %rax,%r8 + movq %r8,-40(%rdi) + movq 48(%rsi),%r10 + movq %r9,%r8 + shrq $28,%r8 + andq %rax,%r8 + movq %r8,-32(%rdi) + shrdq $57,%r10,%r9 + andq %rax,%r9 + movq %r9,-24(%rdi) + movq 56(%rsi),%r11 + movq %r10,%r9 + shrq $22,%r9 + andq %rax,%r9 + movq %r9,-16(%rdi) + shrdq $51,%r11,%r10 + andq %rax,%r10 + movq %r10,-8(%rdi) + movq 64(%rsi),%r8 + movq %r11,%r10 + shrq $16,%r10 + andq %rax,%r10 + movq %r10,0(%rdi) + shrdq $45,%r8,%r11 + andq %rax,%r11 + movq %r11,8(%rdi) + movq 72(%rsi),%r9 + movq %r8,%r11 + shrq $10,%r11 + andq %rax,%r11 + movq %r11,16(%rdi) + shrdq $39,%r9,%r8 + andq %rax,%r8 + movq %r8,24(%rdi) + movq 80(%rsi),%r10 + movq %r9,%r8 + shrq $4,%r8 + andq %rax,%r8 + movq %r8,32(%rdi) + movq %r9,%r11 + shrq $33,%r11 + andq %rax,%r11 + movq %r11,40(%rdi) + shrdq $62,%r10,%r9 + andq %rax,%r9 + movq %r9,48(%rdi) + movq 88(%rsi),%r11 + movq %r10,%r9 + shrq $27,%r9 + andq %rax,%r9 + movq %r9,56(%rdi) + shrdq $56,%r11,%r10 + andq %rax,%r10 + movq %r10,64(%rdi) + movq 96(%rsi),%r8 + movq %r11,%r10 + shrq $21,%r10 + andq %rax,%r10 + movq %r10,72(%rdi) + shrdq $50,%r8,%r11 + andq %rax,%r11 + movq %r11,80(%rdi) + movq 104(%rsi),%r9 + movq %r8,%r11 + shrq $15,%r11 + andq %rax,%r11 + movq %r11,88(%rdi) + shrdq $44,%r9,%r8 + andq %rax,%r8 + movq %r8,96(%rdi) + movq 112(%rsi),%r10 + movq %r9,%r8 + shrq $9,%r8 + andq %rax,%r8 + movq %r8,104(%rdi) + shrdq $38,%r10,%r9 + andq %rax,%r9 + movq %r9,112(%rdi) + movq 120(%rsi),%r11 + movq %r10,%r9 + shrq $3,%r9 + andq %rax,%r9 + movq %r9,120(%rdi) + movq %r10,%r8 + shrq $32,%r8 + andq %rax,%r8 + movq %r8,128(%rdi) + shrdq $61,%r11,%r10 + andq %rax,%r10 + movq %r10,136(%rdi) + xorq %r8,%r8 + movq %r11,%r10 + shrq $26,%r10 + andq %rax,%r10 + movq %r10,144(%rdi) + shrdq $55,%r8,%r11 + andq %rax,%r11 + movq %r11,152(%rdi) + movq %r8,160(%rdi) + movq %r8,168(%rdi) + movq %r8,176(%rdi) + movq %r8,184(%rdi) + .byte 0xf3,0xc3 + +.globl _rsaz_1024_scatter5_avx2 +.private_extern _rsaz_1024_scatter5_avx2 + +.p2align 5 +_rsaz_1024_scatter5_avx2: + vzeroupper + vmovdqu L$scatter_permd(%rip),%ymm5 + shll $4,%edx + leaq (%rdi,%rdx,1),%rdi + movl $9,%eax + jmp L$oop_scatter_1024 + +.p2align 5 +L$oop_scatter_1024: + vmovdqu (%rsi),%ymm0 + leaq 32(%rsi),%rsi + vpermd %ymm0,%ymm5,%ymm0 + vmovdqu %xmm0,(%rdi) + leaq 512(%rdi),%rdi + decl %eax + jnz L$oop_scatter_1024 + + vzeroupper + .byte 0xf3,0xc3 + + +.globl _rsaz_1024_gather5_avx2 +.private_extern _rsaz_1024_gather5_avx2 + +.p2align 5 +_rsaz_1024_gather5_avx2: + + vzeroupper + movq %rsp,%r11 + + leaq -256(%rsp),%rsp + andq $-32,%rsp + leaq L$inc(%rip),%r10 + leaq -128(%rsp),%rax + + vmovd %edx,%xmm4 + vmovdqa (%r10),%ymm0 + vmovdqa 32(%r10),%ymm1 + vmovdqa 64(%r10),%ymm5 + vpbroadcastd %xmm4,%ymm4 + + vpaddd %ymm5,%ymm0,%ymm2 + vpcmpeqd %ymm4,%ymm0,%ymm0 + vpaddd %ymm5,%ymm1,%ymm3 + vpcmpeqd %ymm4,%ymm1,%ymm1 + vmovdqa %ymm0,0+128(%rax) + vpaddd %ymm5,%ymm2,%ymm0 + vpcmpeqd %ymm4,%ymm2,%ymm2 + vmovdqa 
%ymm1,32+128(%rax) + vpaddd %ymm5,%ymm3,%ymm1 + vpcmpeqd %ymm4,%ymm3,%ymm3 + vmovdqa %ymm2,64+128(%rax) + vpaddd %ymm5,%ymm0,%ymm2 + vpcmpeqd %ymm4,%ymm0,%ymm0 + vmovdqa %ymm3,96+128(%rax) + vpaddd %ymm5,%ymm1,%ymm3 + vpcmpeqd %ymm4,%ymm1,%ymm1 + vmovdqa %ymm0,128+128(%rax) + vpaddd %ymm5,%ymm2,%ymm8 + vpcmpeqd %ymm4,%ymm2,%ymm2 + vmovdqa %ymm1,160+128(%rax) + vpaddd %ymm5,%ymm3,%ymm9 + vpcmpeqd %ymm4,%ymm3,%ymm3 + vmovdqa %ymm2,192+128(%rax) + vpaddd %ymm5,%ymm8,%ymm10 + vpcmpeqd %ymm4,%ymm8,%ymm8 + vmovdqa %ymm3,224+128(%rax) + vpaddd %ymm5,%ymm9,%ymm11 + vpcmpeqd %ymm4,%ymm9,%ymm9 + vpaddd %ymm5,%ymm10,%ymm12 + vpcmpeqd %ymm4,%ymm10,%ymm10 + vpaddd %ymm5,%ymm11,%ymm13 + vpcmpeqd %ymm4,%ymm11,%ymm11 + vpaddd %ymm5,%ymm12,%ymm14 + vpcmpeqd %ymm4,%ymm12,%ymm12 + vpaddd %ymm5,%ymm13,%ymm15 + vpcmpeqd %ymm4,%ymm13,%ymm13 + vpcmpeqd %ymm4,%ymm14,%ymm14 + vpcmpeqd %ymm4,%ymm15,%ymm15 + + vmovdqa -32(%r10),%ymm7 + leaq 128(%rsi),%rsi + movl $9,%edx + +L$oop_gather_1024: + vmovdqa 0-128(%rsi),%ymm0 + vmovdqa 32-128(%rsi),%ymm1 + vmovdqa 64-128(%rsi),%ymm2 + vmovdqa 96-128(%rsi),%ymm3 + vpand 0+128(%rax),%ymm0,%ymm0 + vpand 32+128(%rax),%ymm1,%ymm1 + vpand 64+128(%rax),%ymm2,%ymm2 + vpor %ymm0,%ymm1,%ymm4 + vpand 96+128(%rax),%ymm3,%ymm3 + vmovdqa 128-128(%rsi),%ymm0 + vmovdqa 160-128(%rsi),%ymm1 + vpor %ymm2,%ymm3,%ymm5 + vmovdqa 192-128(%rsi),%ymm2 + vmovdqa 224-128(%rsi),%ymm3 + vpand 128+128(%rax),%ymm0,%ymm0 + vpand 160+128(%rax),%ymm1,%ymm1 + vpand 192+128(%rax),%ymm2,%ymm2 + vpor %ymm0,%ymm4,%ymm4 + vpand 224+128(%rax),%ymm3,%ymm3 + vpand 256-128(%rsi),%ymm8,%ymm0 + vpor %ymm1,%ymm5,%ymm5 + vpand 288-128(%rsi),%ymm9,%ymm1 + vpor %ymm2,%ymm4,%ymm4 + vpand 320-128(%rsi),%ymm10,%ymm2 + vpor %ymm3,%ymm5,%ymm5 + vpand 352-128(%rsi),%ymm11,%ymm3 + vpor %ymm0,%ymm4,%ymm4 + vpand 384-128(%rsi),%ymm12,%ymm0 + vpor %ymm1,%ymm5,%ymm5 + vpand 416-128(%rsi),%ymm13,%ymm1 + vpor %ymm2,%ymm4,%ymm4 + vpand 448-128(%rsi),%ymm14,%ymm2 + vpor %ymm3,%ymm5,%ymm5 + vpand 480-128(%rsi),%ymm15,%ymm3 + leaq 512(%rsi),%rsi + vpor %ymm0,%ymm4,%ymm4 + vpor %ymm1,%ymm5,%ymm5 + vpor %ymm2,%ymm4,%ymm4 + vpor %ymm3,%ymm5,%ymm5 + + vpor %ymm5,%ymm4,%ymm4 + vextracti128 $1,%ymm4,%xmm5 + vpor %xmm4,%xmm5,%xmm5 + vpermd %ymm5,%ymm7,%ymm5 + vmovdqu %ymm5,(%rdi) + leaq 32(%rdi),%rdi + decl %edx + jnz L$oop_gather_1024 + + vpxor %ymm0,%ymm0,%ymm0 + vmovdqu %ymm0,(%rdi) + vzeroupper + leaq (%r11),%rsp + + .byte 0xf3,0xc3 + +L$SEH_end_rsaz_1024_gather5: + + +.globl _rsaz_avx2_eligible +.private_extern _rsaz_avx2_eligible + +.p2align 5 +_rsaz_avx2_eligible: + leaq _OPENSSL_ia32cap_P(%rip),%rax + movl 8(%rax),%eax + andl $32,%eax + shrl $5,%eax + .byte 0xf3,0xc3 + + +.p2align 6 +L$and_mask: +.quad 0x1fffffff,0x1fffffff,0x1fffffff,-1 +L$scatter_permd: +.long 0,2,4,6,7,7,7,7 +L$gather_permd: +.long 0,7,1,7,2,7,3,7 +L$inc: +.long 0,0,0,0, 1,1,1,1 +.long 2,2,2,2, 3,3,3,3 +.long 4,4,4,4, 4,4,4,4 +.p2align 6 +#endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/sha/sha1-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha1-x86_64.S similarity index 98% rename from packager/third_party/boringssl/mac-x86_64/crypto/sha/sha1-x86_64.S rename to packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha1-x86_64.S index 0509d45163..c22431c89f 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/sha/sha1-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha1-x86_64.S @@ -1,4 +1,4 @@ -#if defined(__x86_64__) +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) .text @@ 
-7,9 +7,10 @@ .p2align 4 _sha1_block_data_order: - movl _OPENSSL_ia32cap_P+0(%rip),%r9d - movl _OPENSSL_ia32cap_P+4(%rip),%r8d - movl _OPENSSL_ia32cap_P+8(%rip),%r10d + leaq _OPENSSL_ia32cap_P(%rip),%r10 + movl 0(%r10),%r9d + movl 4(%r10),%r8d + movl 8(%r10),%r10d testl $512,%r8d jz L$ialu andl $268435456,%r8d @@ -1240,14 +1241,13 @@ L$epilogue: .p2align 4 sha1_block_data_order_ssse3: _ssse3_shortcut: - movq %rsp,%rax + movq %rsp,%r11 pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 leaq -64(%rsp),%rsp - movq %rax,%r14 andq $-64,%rsp movq %rdi,%r8 movq %rsi,%r9 @@ -1255,7 +1255,7 @@ _ssse3_shortcut: shlq $6,%r10 addq %r9,%r10 - leaq K_XX_XX+64(%rip),%r11 + leaq K_XX_XX+64(%rip),%r14 movl 0(%r8),%eax movl 4(%r8),%ebx @@ -1267,8 +1267,8 @@ _ssse3_shortcut: xorl %edx,%edi andl %edi,%esi - movdqa 64(%r11),%xmm6 - movdqa -64(%r11),%xmm9 + movdqa 64(%r14),%xmm6 + movdqa -64(%r14),%xmm9 movdqu 0(%r9),%xmm0 movdqu 16(%r9),%xmm1 movdqu 32(%r9),%xmm2 @@ -1344,7 +1344,7 @@ L$oop_ssse3: pslld $2,%xmm9 pxor %xmm10,%xmm4 xorl %ebp,%edx - movdqa -64(%r11),%xmm10 + movdqa -64(%r14),%xmm10 roll $5,%ecx addl %edi,%ebx andl %edx,%esi @@ -1405,7 +1405,7 @@ L$oop_ssse3: pslld $2,%xmm10 pxor %xmm8,%xmm5 xorl %eax,%ebp - movdqa -32(%r11),%xmm8 + movdqa -32(%r14),%xmm8 roll $5,%edx addl %edi,%ecx andl %ebp,%esi @@ -1466,7 +1466,7 @@ L$oop_ssse3: pslld $2,%xmm8 pxor %xmm9,%xmm6 xorl %ebx,%eax - movdqa -32(%r11),%xmm9 + movdqa -32(%r14),%xmm9 roll $5,%ebp addl %edi,%edx andl %eax,%esi @@ -1527,7 +1527,7 @@ L$oop_ssse3: pslld $2,%xmm9 pxor %xmm10,%xmm7 xorl %ecx,%ebx - movdqa -32(%r11),%xmm10 + movdqa -32(%r14),%xmm10 roll $5,%eax addl %edi,%ebp andl %ebx,%esi @@ -1638,7 +1638,7 @@ L$oop_ssse3: pxor %xmm3,%xmm2 addl %esi,%eax xorl %edx,%edi - movdqa 0(%r11),%xmm10 + movdqa 0(%r14),%xmm10 rorl $7,%ecx paddd %xmm1,%xmm9 addl %ebx,%eax @@ -1873,7 +1873,7 @@ L$oop_ssse3: pxor %xmm0,%xmm7 roll $5,%ebx addl %esi,%eax - movdqa 32(%r11),%xmm9 + movdqa 32(%r14),%xmm9 xorl %ecx,%edi paddd %xmm6,%xmm8 xorl %edx,%ecx @@ -2164,8 +2164,8 @@ L$oop_ssse3: addl %edx,%ecx cmpq %r10,%r9 je L$done_ssse3 - movdqa 64(%r11),%xmm6 - movdqa -64(%r11),%xmm9 + movdqa 64(%r14),%xmm6 + movdqa -64(%r14),%xmm9 movdqu 0(%r9),%xmm0 movdqu 16(%r9),%xmm1 movdqu 32(%r9),%xmm2 @@ -2402,13 +2402,12 @@ L$done_ssse3: movl %ecx,8(%r8) movl %edx,12(%r8) movl %ebp,16(%r8) - leaq (%r14),%rsi - movq -40(%rsi),%r14 - movq -32(%rsi),%r13 - movq -24(%rsi),%r12 - movq -16(%rsi),%rbp - movq -8(%rsi),%rbx - leaq (%rsi),%rsp + movq -40(%r11),%r14 + movq -32(%r11),%r13 + movq -24(%r11),%r12 + movq -16(%r11),%rbp + movq -8(%r11),%rbx + leaq (%r11),%rsp L$epilogue_ssse3: .byte 0xf3,0xc3 @@ -2416,7 +2415,7 @@ L$epilogue_ssse3: .p2align 4 sha1_block_data_order_avx: _avx_shortcut: - movq %rsp,%rax + movq %rsp,%r11 pushq %rbx pushq %rbp pushq %r12 @@ -2424,7 +2423,6 @@ _avx_shortcut: pushq %r14 leaq -64(%rsp),%rsp vzeroupper - movq %rax,%r14 andq $-64,%rsp movq %rdi,%r8 movq %rsi,%r9 @@ -2432,7 +2430,7 @@ _avx_shortcut: shlq $6,%r10 addq %r9,%r10 - leaq K_XX_XX+64(%rip),%r11 + leaq K_XX_XX+64(%rip),%r14 movl 0(%r8),%eax movl 4(%r8),%ebx @@ -2444,8 +2442,8 @@ _avx_shortcut: xorl %edx,%edi andl %edi,%esi - vmovdqa 64(%r11),%xmm6 - vmovdqa -64(%r11),%xmm11 + vmovdqa 64(%r14),%xmm6 + vmovdqa -64(%r14),%xmm11 vmovdqu 0(%r9),%xmm0 vmovdqu 16(%r9),%xmm1 vmovdqu 32(%r9),%xmm2 @@ -2570,7 +2568,7 @@ L$oop_avx: vpxor %xmm10,%xmm5,%xmm5 xorl %eax,%ebp shldl $5,%edx,%edx - vmovdqa -32(%r11),%xmm11 + vmovdqa -32(%r14),%xmm11 addl %edi,%ecx andl %ebp,%esi xorl %eax,%ebp @@ -2783,7 
+2781,7 @@ L$oop_avx: addl %esi,%eax xorl %edx,%edi vpaddd %xmm1,%xmm11,%xmm9 - vmovdqa 0(%r11),%xmm11 + vmovdqa 0(%r14),%xmm11 shrdl $7,%ecx,%ecx addl %ebx,%eax vpxor %xmm8,%xmm2,%xmm2 @@ -3002,7 +3000,7 @@ L$oop_avx: movl %ebx,%edi xorl %edx,%esi vpaddd %xmm6,%xmm11,%xmm9 - vmovdqa 32(%r11),%xmm11 + vmovdqa 32(%r14),%xmm11 shldl $5,%ebx,%ebx addl %esi,%eax vpxor %xmm8,%xmm7,%xmm7 @@ -3281,8 +3279,8 @@ L$oop_avx: addl %edx,%ecx cmpq %r10,%r9 je L$done_avx - vmovdqa 64(%r11),%xmm6 - vmovdqa -64(%r11),%xmm11 + vmovdqa 64(%r14),%xmm6 + vmovdqa -64(%r14),%xmm11 vmovdqu 0(%r9),%xmm0 vmovdqu 16(%r9),%xmm1 vmovdqu 32(%r9),%xmm2 @@ -3518,13 +3516,12 @@ L$done_avx: movl %ecx,8(%r8) movl %edx,12(%r8) movl %ebp,16(%r8) - leaq (%r14),%rsi - movq -40(%rsi),%r14 - movq -32(%rsi),%r13 - movq -24(%rsi),%r12 - movq -16(%rsi),%rbp - movq -8(%rsi),%rbx - leaq (%rsi),%rsp + movq -40(%r11),%r14 + movq -32(%r11),%r13 + movq -24(%r11),%r12 + movq -16(%r11),%rbp + movq -8(%r11),%rbx + leaq (%r11),%rsp L$epilogue_avx: .byte 0xf3,0xc3 diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/sha/sha256-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha256-x86_64.S similarity index 99% rename from packager/third_party/boringssl/mac-x86_64/crypto/sha/sha256-x86_64.S rename to packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha256-x86_64.S index 0146ff5cfc..ac6559e074 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/sha/sha256-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha256-x86_64.S @@ -1,4 +1,4 @@ -#if defined(__x86_64__) +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) .text @@ -18,13 +18,13 @@ _sha256_block_data_order: je L$avx_shortcut testl $512,%r10d jnz L$ssse3_shortcut + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 - movq %rsp,%r11 shlq $4,%rdx subq $64+32,%rsp leaq (%rsi,%rdx,4),%rdx @@ -32,7 +32,7 @@ _sha256_block_data_order: movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movq %r11,64+24(%rsp) + movq %rax,64+24(%rsp) L$prologue: movl 0(%rdi),%eax @@ -1697,13 +1697,13 @@ L$rounds_16_xx: jb L$loop movq 64+24(%rsp),%rsi - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp L$epilogue: .byte 0xf3,0xc3 @@ -1754,13 +1754,13 @@ K256: .p2align 6 sha256_block_data_order_ssse3: L$ssse3_shortcut: + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 - movq %rsp,%r11 shlq $4,%rdx subq $96,%rsp leaq (%rsi,%rdx,4),%rdx @@ -1768,7 +1768,7 @@ L$ssse3_shortcut: movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movq %r11,64+24(%rsp) + movq %rax,64+24(%rsp) L$prologue_ssse3: movl 0(%rdi),%eax @@ -2835,13 +2835,13 @@ L$ssse3_00_47: jb L$loop_ssse3 movq 64+24(%rsp),%rsi - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp L$epilogue_ssse3: .byte 0xf3,0xc3 @@ -2849,13 +2849,13 @@ L$epilogue_ssse3: .p2align 6 sha256_block_data_order_avx: L$avx_shortcut: + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 - movq %rsp,%r11 shlq $4,%rdx subq 
$96,%rsp leaq (%rsi,%rdx,4),%rdx @@ -2863,7 +2863,7 @@ L$avx_shortcut: movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) - movq %r11,64+24(%rsp) + movq %rax,64+24(%rsp) L$prologue_avx: vzeroupper @@ -3892,13 +3892,13 @@ L$avx_00_47: movq 64+24(%rsp),%rsi vzeroupper - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp L$epilogue_avx: .byte 0xf3,0xc3 diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/sha/sha512-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha512-x86_64.S similarity index 99% rename from packager/third_party/boringssl/mac-x86_64/crypto/sha/sha512-x86_64.S rename to packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha512-x86_64.S index aeabd3f43a..0b738e6f45 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/sha/sha512-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/sha512-x86_64.S @@ -1,4 +1,4 @@ -#if defined(__x86_64__) +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) .text @@ -18,13 +18,13 @@ _sha512_block_data_order: orl %r9d,%r10d cmpl $1342177792,%r10d je L$avx_shortcut + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 - movq %rsp,%r11 shlq $4,%rdx subq $128+32,%rsp leaq (%rsi,%rdx,8),%rdx @@ -32,7 +32,7 @@ _sha512_block_data_order: movq %rdi,128+0(%rsp) movq %rsi,128+8(%rsp) movq %rdx,128+16(%rsp) - movq %r11,128+24(%rsp) + movq %rax,128+24(%rsp) L$prologue: movq 0(%rdi),%rax @@ -1697,13 +1697,13 @@ L$rounds_16_xx: jb L$loop movq 128+24(%rsp),%rsi - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp L$epilogue: .byte 0xf3,0xc3 @@ -1798,13 +1798,13 @@ K512: .p2align 6 sha512_block_data_order_xop: L$xop_shortcut: + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 - movq %rsp,%r11 shlq $4,%rdx subq $160,%rsp leaq (%rsi,%rdx,8),%rdx @@ -1812,7 +1812,7 @@ L$xop_shortcut: movq %rdi,128+0(%rsp) movq %rsi,128+8(%rsp) movq %rdx,128+16(%rsp) - movq %r11,128+24(%rsp) + movq %rax,128+24(%rsp) L$prologue_xop: vzeroupper @@ -2867,13 +2867,13 @@ L$xop_00_47: movq 128+24(%rsp),%rsi vzeroupper - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp L$epilogue_xop: .byte 0xf3,0xc3 @@ -2881,13 +2881,13 @@ L$epilogue_xop: .p2align 6 sha512_block_data_order_avx: L$avx_shortcut: + movq %rsp,%rax pushq %rbx pushq %rbp pushq %r12 pushq %r13 pushq %r14 pushq %r15 - movq %rsp,%r11 shlq $4,%rdx subq $160,%rsp leaq (%rsi,%rdx,8),%rdx @@ -2895,7 +2895,7 @@ L$avx_shortcut: movq %rdi,128+0(%rsp) movq %rsi,128+8(%rsp) movq %rdx,128+16(%rsp) - movq %r11,128+24(%rsp) + movq %rax,128+24(%rsp) L$prologue_avx: vzeroupper @@ -4014,13 +4014,13 @@ L$avx_00_47: movq 128+24(%rsp),%rsi vzeroupper - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + 
movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp L$epilogue_avx: .byte 0xf3,0xc3 diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/aes/vpaes-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S similarity index 99% rename from packager/third_party/boringssl/mac-x86_64/crypto/aes/vpaes-x86_64.S rename to packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S index 997cde807a..867df68b4b 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/aes/vpaes-x86_64.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/vpaes-x86_64.S @@ -1,4 +1,4 @@ -#if defined(__x86_64__) +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) .text diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/bn/x86_64-mont.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/x86_64-mont.S similarity index 82% rename from packager/third_party/boringssl/mac-x86_64/crypto/bn/x86_64-mont.S rename to packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/x86_64-mont.S index 51e5d19931..4904417a20 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/bn/x86_64-mont.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/x86_64-mont.S @@ -1,4 +1,4 @@ -#if defined(__x86_64__) +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) .text @@ -8,6 +8,10 @@ .p2align 4 _bn_mul_mont: + + movl %r9d,%r9d + movq %rsp,%rax + testl $3,%r9d jnz L$mul_enter cmpl $8,%r9d @@ -21,20 +25,50 @@ _bn_mul_mont: .p2align 4 L$mul_enter: pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 - movl %r9d,%r9d - leaq 2(%r9),%r10 - movq %rsp,%r11 - negq %r10 - leaq (%rsp,%r10,8),%rsp - andq $-1024,%rsp - movq %r11,8(%rsp,%r9,8) + negq %r9 + movq %rsp,%r11 + leaq -16(%rsp,%r9,8),%r10 + negq %r9 + andq $-1024,%r10 + + + + + + + + + + subq %r10,%r11 + andq $-4096,%r11 + leaq (%r10,%r11,1),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja L$mul_page_walk + jmp L$mul_page_walk_done + +.p2align 4 +L$mul_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja L$mul_page_walk +L$mul_page_walk_done: + + movq %rax,8(%rsp,%r9,8) + L$mul_body: movq %rdx,%r12 movq (%r8),%r8 @@ -177,7 +211,8 @@ L$inner_enter: movq %r9,%r15 jmp L$sub .p2align 4 -L$sub: sbbq (%rcx,%r14,8),%rax +L$sub: + sbbq (%rcx,%r14,8),%rax movq %rax,(%rdi,%r14,8) movq 8(%rsi,%r14,8),%rax leaq 1(%r14),%r14 @@ -186,51 +221,86 @@ L$sub: sbbq (%rcx,%r14,8),%rax sbbq $0,%rax xorq %r14,%r14 + andq %rax,%rsi + notq %rax + movq %rdi,%rcx + andq %rax,%rcx movq %r9,%r15 + orq %rcx,%rsi .p2align 4 L$copy: - movq (%rsp,%r14,8),%rsi - movq (%rdi,%r14,8),%rcx - xorq %rcx,%rsi - andq %rax,%rsi - xorq %rcx,%rsi + movq (%rsi,%r14,8),%rax movq %r14,(%rsp,%r14,8) - movq %rsi,(%rdi,%r14,8) + movq %rax,(%rdi,%r14,8) leaq 1(%r14),%r14 subq $1,%r15 jnz L$copy movq 8(%rsp,%r9,8),%rsi + movq $1,%rax - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + L$mul_epilogue: .byte 0xf3,0xc3 + .p2align 4 bn_mul4x_mont: -L$mul4x_enter: - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 movl %r9d,%r9d - leaq 4(%r9),%r10 - movq %rsp,%r11 - negq %r10 - leaq (%rsp,%r10,8),%rsp - andq $-1024,%rsp + movq 
%rsp,%rax + +L$mul4x_enter: + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + + negq %r9 + movq %rsp,%r11 + leaq -32(%rsp,%r9,8),%r10 + negq %r9 + andq $-1024,%r10 + + subq %r10,%r11 + andq $-4096,%r11 + leaq (%r10,%r11,1),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja L$mul4x_page_walk + jmp L$mul4x_page_walk_done + +L$mul4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja L$mul4x_page_walk +L$mul4x_page_walk_done: + + movq %rax,8(%rsp,%r9,8) - movq %r11,8(%rsp,%r9,8) L$mul4x_body: movq %rdi,16(%rsp,%r9,8) movq %rdx,%r12 @@ -530,9 +600,11 @@ L$inner4x: cmpq %r9,%r14 jb L$outer4x movq 16(%rsp,%r9,8),%rdi + leaq -4(%r9),%r15 movq 0(%rsp),%rax + pxor %xmm0,%xmm0 movq 8(%rsp),%rdx - shrq $2,%r9 + shrq $2,%r15 leaq (%rsp),%rsi xorq %r14,%r14 @@ -540,7 +612,6 @@ L$inner4x: movq 16(%rsi),%rbx movq 24(%rsi),%rbp sbbq 8(%rcx),%rdx - leaq -1(%r9),%r15 jmp L$sub4x .p2align 4 L$sub4x: @@ -568,62 +639,79 @@ L$sub4x: movq %rbx,16(%rdi,%r14,8) sbbq $0,%rax - movq %rax,%xmm0 - punpcklqdq %xmm0,%xmm0 movq %rbp,24(%rdi,%r14,8) xorq %r14,%r14 + andq %rax,%rsi + notq %rax + movq %rdi,%rcx + andq %rax,%rcx + leaq -4(%r9),%r15 + orq %rcx,%rsi + shrq $2,%r15 - movq %r9,%r15 - pxor %xmm5,%xmm5 + movdqu (%rsi),%xmm1 + movdqa %xmm0,(%rsp) + movdqu %xmm1,(%rdi) jmp L$copy4x .p2align 4 L$copy4x: - movdqu (%rsp,%r14,1),%xmm2 - movdqu 16(%rsp,%r14,1),%xmm4 - movdqu (%rdi,%r14,1),%xmm1 - movdqu 16(%rdi,%r14,1),%xmm3 - pxor %xmm1,%xmm2 - pxor %xmm3,%xmm4 - pand %xmm0,%xmm2 - pand %xmm0,%xmm4 - pxor %xmm1,%xmm2 - pxor %xmm3,%xmm4 - movdqu %xmm2,(%rdi,%r14,1) - movdqu %xmm4,16(%rdi,%r14,1) - movdqa %xmm5,(%rsp,%r14,1) - movdqa %xmm5,16(%rsp,%r14,1) - + movdqu 16(%rsi,%r14,1),%xmm2 + movdqu 32(%rsi,%r14,1),%xmm1 + movdqa %xmm0,16(%rsp,%r14,1) + movdqu %xmm2,16(%rdi,%r14,1) + movdqa %xmm0,32(%rsp,%r14,1) + movdqu %xmm1,32(%rdi,%r14,1) leaq 32(%r14),%r14 decq %r15 jnz L$copy4x - shlq $2,%r9 + movdqu 16(%rsi,%r14,1),%xmm2 + movdqa %xmm0,16(%rsp,%r14,1) + movdqu %xmm2,16(%rdi,%r14,1) movq 8(%rsp,%r9,8),%rsi + movq $1,%rax - movq (%rsi),%r15 - movq 8(%rsi),%r14 - movq 16(%rsi),%r13 - movq 24(%rsi),%r12 - movq 32(%rsi),%rbp - movq 40(%rsi),%rbx - leaq 48(%rsi),%rsp + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + L$mul4x_epilogue: .byte 0xf3,0xc3 + .p2align 5 bn_sqr8x_mont: -L$sqr8x_enter: + movq %rsp,%rax + +L$sqr8x_enter: pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 +L$sqr8x_prologue: + movl %r9d,%r10d shll $3,%r9d shlq $3+2,%r10 @@ -635,30 +723,49 @@ L$sqr8x_enter: leaq -64(%rsp,%r9,2),%r11 + movq %rsp,%rbp movq (%r8),%r8 subq %rsi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb L$sqr8x_sp_alt - subq %r11,%rsp - leaq -64(%rsp,%r9,2),%rsp + subq %r11,%rbp + leaq -64(%rbp,%r9,2),%rbp jmp L$sqr8x_sp_done .p2align 5 L$sqr8x_sp_alt: leaq 4096-64(,%r9,2),%r10 - leaq -64(%rsp,%r9,2),%rsp + leaq -64(%rbp,%r9,2),%rbp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 - subq %r11,%rsp + subq %r11,%rbp L$sqr8x_sp_done: - andq $-64,%rsp + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$sqr8x_page_walk + jmp L$sqr8x_page_walk_done + +.p2align 4 +L$sqr8x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$sqr8x_page_walk +L$sqr8x_page_walk_done: + movq %r9,%r10 negq %r9 movq %r8,32(%rsp) movq %rax,40(%rsp) + L$sqr8x_body: .byte 
102,72,15,110,209 @@ -705,6 +812,7 @@ L$sqr8x_sub: pxor %xmm0,%xmm0 pshufd $0,%xmm1,%xmm1 movq 40(%rsp),%rsi + jmp L$sqr8x_cond_copy .p2align 5 @@ -734,15 +842,23 @@ L$sqr8x_cond_copy: movq $1,%rax movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$sqr8x_epilogue: .byte 0xf3,0xc3 + .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .p2align 4 #endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/bn/x86_64-mont5.S b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/x86_64-mont5.S similarity index 94% rename from packager/third_party/boringssl/mac-x86_64/crypto/bn/x86_64-mont5.S rename to packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/x86_64-mont5.S index f3ad8d783f..abc65f1192 100644 --- a/packager/third_party/boringssl/mac-x86_64/crypto/bn/x86_64-mont5.S +++ b/packager/third_party/boringssl/mac-x86_64/crypto/fipsmodule/x86_64-mont5.S @@ -1,4 +1,4 @@ -#if defined(__x86_64__) +#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) .text @@ -8,30 +8,64 @@ .p2align 6 _bn_mul_mont_gather5: + + movl %r9d,%r9d + movq %rsp,%rax + testl $7,%r9d jnz L$mul_enter jmp L$mul4x_enter .p2align 4 L$mul_enter: - movl %r9d,%r9d - movq %rsp,%rax movd 8(%rsp),%xmm5 - leaq L$inc(%rip),%r10 pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 - leaq 2(%r9),%r11 - negq %r11 - leaq -264(%rsp,%r11,8),%rsp - andq $-1024,%rsp + negq %r9 + movq %rsp,%r11 + leaq -280(%rsp,%r9,8),%r10 + negq %r9 + andq $-1024,%r10 + + + + + + + + + + subq %r10,%r11 + andq $-4096,%r11 + leaq (%r10,%r11,1),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja L$mul_page_walk + jmp L$mul_page_walk_done + +L$mul_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja L$mul_page_walk +L$mul_page_walk_done: + + leaq L$inc(%rip),%r10 movq %rax,8(%rsp,%r9,8) + L$mul_body: + leaq 128(%rdx),%r12 movdqa 0(%r10),%xmm0 movdqa 16(%r10),%xmm1 @@ -361,7 +395,8 @@ L$inner_enter: movq %r9,%r15 jmp L$sub .p2align 4 -L$sub: sbbq (%rcx,%r14,8),%rax +L$sub: + sbbq (%rcx,%r14,8),%rax movq %rax,(%rdi,%r14,8) movq 8(%rsi,%r14,8),%rax leaq 1(%r14),%r14 @@ -370,46 +405,65 @@ L$sub: sbbq (%rcx,%r14,8),%rax sbbq $0,%rax xorq %r14,%r14 + andq %rax,%rsi + notq %rax + movq %rdi,%rcx + andq %rax,%rcx movq %r9,%r15 + orq %rcx,%rsi .p2align 4 L$copy: - movq (%rsp,%r14,8),%rsi - movq (%rdi,%r14,8),%rcx - xorq %rcx,%rsi - andq %rax,%rsi - xorq %rcx,%rsi + movq (%rsi,%r14,8),%rax movq %r14,(%rsp,%r14,8) - movq %rsi,(%rdi,%r14,8) + movq %rax,(%rdi,%r14,8) leaq 1(%r14),%r14 subq $1,%r15 jnz L$copy movq 8(%rsp,%r9,8),%rsi + movq $1,%rax movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$mul_epilogue: .byte 0xf3,0xc3 + .p2align 5 bn_mul4x_mont_gather5: -L$mul4x_enter: + .byte 0x67 movq %rsp,%rax + +L$mul4x_enter: pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 +L$mul4x_prologue: + .byte 0x67 shll $3,%r9d leaq (%r9,%r9,2),%r10 @@ -425,46 +479,73 @@ L$mul4x_enter: leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb L$mul4xsp_alt - subq %r11,%rsp - leaq -320(%rsp,%r9,2),%rsp + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp jmp L$mul4xsp_done 
.p2align 5 L$mul4xsp_alt: leaq 4096-320(,%r9,2),%r10 - leaq -320(%rsp,%r9,2),%rsp + leaq -320(%rbp,%r9,2),%rbp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 - subq %r11,%rsp + subq %r11,%rbp L$mul4xsp_done: - andq $-64,%rsp + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$mul4x_page_walk + jmp L$mul4x_page_walk_done + +L$mul4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$mul4x_page_walk +L$mul4x_page_walk_done: + negq %r9 movq %rax,40(%rsp) + L$mul4x_body: call mul4x_internal movq 40(%rsp),%rsi + movq $1,%rax movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$mul4x_epilogue: .byte 0xf3,0xc3 + .p2align 5 mul4x_internal: shlq $5,%r9 @@ -994,14 +1075,23 @@ L$inner4x: .p2align 5 _bn_power5: + movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 +L$power5_prologue: + shll $3,%r9d leal (%r9,%r9,2),%r10d negq %r9 @@ -1015,24 +1105,41 @@ _bn_power5: leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb L$pwr_sp_alt - subq %r11,%rsp - leaq -320(%rsp,%r9,2),%rsp + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp jmp L$pwr_sp_done .p2align 5 L$pwr_sp_alt: leaq 4096-320(,%r9,2),%r10 - leaq -320(%rsp,%r9,2),%rsp + leaq -320(%rbp,%r9,2),%rbp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 - subq %r11,%rsp + subq %r11,%rbp L$pwr_sp_done: - andq $-64,%rsp + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$pwr_page_walk + jmp L$pwr_page_walk_done + +L$pwr_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$pwr_page_walk +L$pwr_page_walk_done: + movq %r9,%r10 negq %r9 @@ -1047,6 +1154,7 @@ L$pwr_sp_done: movq %r8,32(%rsp) movq %rax,40(%rsp) + L$power5_body: .byte 102,72,15,110,207 .byte 102,72,15,110,209 @@ -1073,18 +1181,27 @@ L$power5_body: call mul4x_internal movq 40(%rsp),%rsi + movq $1,%rax movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$power5_epilogue: .byte 0xf3,0xc3 + .globl _bn_sqr8x_internal .private_extern _bn_sqr8x_internal .private_extern _bn_sqr8x_internal @@ -1825,6 +1942,7 @@ L$8x_tail: .p2align 5 L$8x_tail_done: + xorq %rax,%rax addq (%rdx),%r8 adcq $0,%r9 adcq $0,%r10 @@ -1833,9 +1951,7 @@ L$8x_tail_done: adcq $0,%r13 adcq $0,%r14 adcq $0,%r15 - - - xorq %rax,%rax + adcq $0,%rax negq %rsi L$8x_no_tail: @@ -1936,15 +2052,24 @@ _bn_from_montgomery: .p2align 5 bn_from_mont8x: + .byte 0x67 movq %rsp,%rax + pushq %rbx + pushq %rbp + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 +L$from_prologue: + shll $3,%r9d leaq (%r9,%r9,2),%r10 negq %r9 @@ -1958,24 +2083,41 @@ bn_from_mont8x: leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb L$from_sp_alt - subq %r11,%rsp - leaq -320(%rsp,%r9,2),%rsp + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp jmp L$from_sp_done .p2align 5 L$from_sp_alt: leaq 4096-320(,%r9,2),%r10 - leaq -320(%rsp,%r9,2),%rsp + leaq -320(%rbp,%r9,2),%rbp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 - subq %r11,%rsp + subq %r11,%rbp L$from_sp_done: - andq $-64,%rsp + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$from_page_walk + jmp 
L$from_page_walk_done + +L$from_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$from_page_walk +L$from_page_walk_done: + movq %r9,%r10 negq %r9 @@ -1990,6 +2132,7 @@ L$from_sp_done: movq %r8,32(%rsp) movq %rax,40(%rsp) + L$from_body: movq %r9,%r11 leaq 48(%rsp),%rax @@ -2025,11 +2168,12 @@ L$mul_by_1: pxor %xmm0,%xmm0 leaq 48(%rsp),%rax - movq 40(%rsp),%rsi jmp L$from_mont_zero .p2align 5 L$from_mont_zero: + movq 40(%rsp),%rsi + movdqa %xmm0,0(%rax) movdqa %xmm0,16(%rax) movdqa %xmm0,32(%rax) @@ -2040,15 +2184,23 @@ L$from_mont_zero: movq $1,%rax movq -48(%rsi),%r15 + movq -40(%rsi),%r14 + movq -32(%rsi),%r13 + movq -24(%rsi),%r12 + movq -16(%rsi),%rbp + movq -8(%rsi),%rbx + leaq (%rsi),%rsp + L$from_epilogue: .byte 0xf3,0xc3 + .globl _bn_scatter5 .private_extern _bn_scatter5 diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/modes/aesni-gcm-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/modes/aesni-gcm-x86_64.S deleted file mode 100644 index 21d5ad67e0..0000000000 --- a/packager/third_party/boringssl/mac-x86_64/crypto/modes/aesni-gcm-x86_64.S +++ /dev/null @@ -1,19 +0,0 @@ -#if defined(__x86_64__) -.text - -.globl _aesni_gcm_encrypt -.private_extern _aesni_gcm_encrypt - -_aesni_gcm_encrypt: - xorl %eax,%eax - .byte 0xf3,0xc3 - - -.globl _aesni_gcm_decrypt -.private_extern _aesni_gcm_decrypt - -_aesni_gcm_decrypt: - xorl %eax,%eax - .byte 0xf3,0xc3 - -#endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/rc4/rc4-md5-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/rc4/rc4-md5-x86_64.S deleted file mode 100644 index 31ee7d26df..0000000000 --- a/packager/third_party/boringssl/mac-x86_64/crypto/rc4/rc4-md5-x86_64.S +++ /dev/null @@ -1,1262 +0,0 @@ -#if defined(__x86_64__) -.text -.p2align 4 - -.globl _rc4_md5_enc -.private_extern _rc4_md5_enc - -_rc4_md5_enc: - cmpq $0,%r9 - je L$abort - pushq %rbx - pushq %rbp - pushq %r12 - pushq %r13 - pushq %r14 - pushq %r15 - subq $40,%rsp -L$body: - movq %rcx,%r11 - movq %r9,%r12 - movq %rsi,%r13 - movq %rdx,%r14 - movq %r8,%r15 - xorq %rbp,%rbp - xorq %rcx,%rcx - - leaq 8(%rdi),%rdi - movb -8(%rdi),%bpl - movb -4(%rdi),%cl - - incb %bpl - subq %r13,%r14 - movl (%rdi,%rbp,4),%eax - addb %al,%cl - leaq (%rdi,%rbp,4),%rsi - shlq $6,%r12 - addq %r15,%r12 - movq %r12,16(%rsp) - - movq %r11,24(%rsp) - movl 0(%r11),%r8d - movl 4(%r11),%r9d - movl 8(%r11),%r10d - movl 12(%r11),%r11d - jmp L$oop - -.p2align 4 -L$oop: - movl %r8d,0(%rsp) - movl %r9d,4(%rsp) - movl %r10d,8(%rsp) - movl %r11d,%r12d - movl %r11d,12(%rsp) - pxor %xmm0,%xmm0 - movl (%rdi,%rcx,4),%edx - xorl %r10d,%r12d - movl %eax,(%rdi,%rcx,4) - andl %r9d,%r12d - addl 0(%r15),%r8d - addb %dl,%al - movl 4(%rsi),%ebx - addl $3614090360,%r8d - xorl %r11d,%r12d - movzbl %al,%eax - movl %edx,0(%rsi) - addl %r12d,%r8d - addb %bl,%cl - roll $7,%r8d - movl %r10d,%r12d - movd (%rdi,%rax,4),%xmm0 - - addl %r9d,%r8d - pxor %xmm1,%xmm1 - movl (%rdi,%rcx,4),%edx - xorl %r9d,%r12d - movl %ebx,(%rdi,%rcx,4) - andl %r8d,%r12d - addl 4(%r15),%r11d - addb %dl,%bl - movl 8(%rsi),%eax - addl $3905402710,%r11d - xorl %r10d,%r12d - movzbl %bl,%ebx - movl %edx,4(%rsi) - addl %r12d,%r11d - addb %al,%cl - roll $12,%r11d - movl %r9d,%r12d - movd (%rdi,%rbx,4),%xmm1 - - addl %r8d,%r11d - movl (%rdi,%rcx,4),%edx - xorl %r8d,%r12d - movl %eax,(%rdi,%rcx,4) - andl %r11d,%r12d - addl 8(%r15),%r10d - addb %dl,%al - movl 12(%rsi),%ebx - addl $606105819,%r10d - xorl %r9d,%r12d - movzbl %al,%eax - movl %edx,8(%rsi) - addl %r12d,%r10d - addb %bl,%cl - roll 
$17,%r10d - movl %r8d,%r12d - pinsrw $1,(%rdi,%rax,4),%xmm0 - - addl %r11d,%r10d - movl (%rdi,%rcx,4),%edx - xorl %r11d,%r12d - movl %ebx,(%rdi,%rcx,4) - andl %r10d,%r12d - addl 12(%r15),%r9d - addb %dl,%bl - movl 16(%rsi),%eax - addl $3250441966,%r9d - xorl %r8d,%r12d - movzbl %bl,%ebx - movl %edx,12(%rsi) - addl %r12d,%r9d - addb %al,%cl - roll $22,%r9d - movl %r11d,%r12d - pinsrw $1,(%rdi,%rbx,4),%xmm1 - - addl %r10d,%r9d - movl (%rdi,%rcx,4),%edx - xorl %r10d,%r12d - movl %eax,(%rdi,%rcx,4) - andl %r9d,%r12d - addl 16(%r15),%r8d - addb %dl,%al - movl 20(%rsi),%ebx - addl $4118548399,%r8d - xorl %r11d,%r12d - movzbl %al,%eax - movl %edx,16(%rsi) - addl %r12d,%r8d - addb %bl,%cl - roll $7,%r8d - movl %r10d,%r12d - pinsrw $2,(%rdi,%rax,4),%xmm0 - - addl %r9d,%r8d - movl (%rdi,%rcx,4),%edx - xorl %r9d,%r12d - movl %ebx,(%rdi,%rcx,4) - andl %r8d,%r12d - addl 20(%r15),%r11d - addb %dl,%bl - movl 24(%rsi),%eax - addl $1200080426,%r11d - xorl %r10d,%r12d - movzbl %bl,%ebx - movl %edx,20(%rsi) - addl %r12d,%r11d - addb %al,%cl - roll $12,%r11d - movl %r9d,%r12d - pinsrw $2,(%rdi,%rbx,4),%xmm1 - - addl %r8d,%r11d - movl (%rdi,%rcx,4),%edx - xorl %r8d,%r12d - movl %eax,(%rdi,%rcx,4) - andl %r11d,%r12d - addl 24(%r15),%r10d - addb %dl,%al - movl 28(%rsi),%ebx - addl $2821735955,%r10d - xorl %r9d,%r12d - movzbl %al,%eax - movl %edx,24(%rsi) - addl %r12d,%r10d - addb %bl,%cl - roll $17,%r10d - movl %r8d,%r12d - pinsrw $3,(%rdi,%rax,4),%xmm0 - - addl %r11d,%r10d - movl (%rdi,%rcx,4),%edx - xorl %r11d,%r12d - movl %ebx,(%rdi,%rcx,4) - andl %r10d,%r12d - addl 28(%r15),%r9d - addb %dl,%bl - movl 32(%rsi),%eax - addl $4249261313,%r9d - xorl %r8d,%r12d - movzbl %bl,%ebx - movl %edx,28(%rsi) - addl %r12d,%r9d - addb %al,%cl - roll $22,%r9d - movl %r11d,%r12d - pinsrw $3,(%rdi,%rbx,4),%xmm1 - - addl %r10d,%r9d - movl (%rdi,%rcx,4),%edx - xorl %r10d,%r12d - movl %eax,(%rdi,%rcx,4) - andl %r9d,%r12d - addl 32(%r15),%r8d - addb %dl,%al - movl 36(%rsi),%ebx - addl $1770035416,%r8d - xorl %r11d,%r12d - movzbl %al,%eax - movl %edx,32(%rsi) - addl %r12d,%r8d - addb %bl,%cl - roll $7,%r8d - movl %r10d,%r12d - pinsrw $4,(%rdi,%rax,4),%xmm0 - - addl %r9d,%r8d - movl (%rdi,%rcx,4),%edx - xorl %r9d,%r12d - movl %ebx,(%rdi,%rcx,4) - andl %r8d,%r12d - addl 36(%r15),%r11d - addb %dl,%bl - movl 40(%rsi),%eax - addl $2336552879,%r11d - xorl %r10d,%r12d - movzbl %bl,%ebx - movl %edx,36(%rsi) - addl %r12d,%r11d - addb %al,%cl - roll $12,%r11d - movl %r9d,%r12d - pinsrw $4,(%rdi,%rbx,4),%xmm1 - - addl %r8d,%r11d - movl (%rdi,%rcx,4),%edx - xorl %r8d,%r12d - movl %eax,(%rdi,%rcx,4) - andl %r11d,%r12d - addl 40(%r15),%r10d - addb %dl,%al - movl 44(%rsi),%ebx - addl $4294925233,%r10d - xorl %r9d,%r12d - movzbl %al,%eax - movl %edx,40(%rsi) - addl %r12d,%r10d - addb %bl,%cl - roll $17,%r10d - movl %r8d,%r12d - pinsrw $5,(%rdi,%rax,4),%xmm0 - - addl %r11d,%r10d - movl (%rdi,%rcx,4),%edx - xorl %r11d,%r12d - movl %ebx,(%rdi,%rcx,4) - andl %r10d,%r12d - addl 44(%r15),%r9d - addb %dl,%bl - movl 48(%rsi),%eax - addl $2304563134,%r9d - xorl %r8d,%r12d - movzbl %bl,%ebx - movl %edx,44(%rsi) - addl %r12d,%r9d - addb %al,%cl - roll $22,%r9d - movl %r11d,%r12d - pinsrw $5,(%rdi,%rbx,4),%xmm1 - - addl %r10d,%r9d - movl (%rdi,%rcx,4),%edx - xorl %r10d,%r12d - movl %eax,(%rdi,%rcx,4) - andl %r9d,%r12d - addl 48(%r15),%r8d - addb %dl,%al - movl 52(%rsi),%ebx - addl $1804603682,%r8d - xorl %r11d,%r12d - movzbl %al,%eax - movl %edx,48(%rsi) - addl %r12d,%r8d - addb %bl,%cl - roll $7,%r8d - movl %r10d,%r12d - pinsrw $6,(%rdi,%rax,4),%xmm0 - - addl 
%r9d,%r8d - movl (%rdi,%rcx,4),%edx - xorl %r9d,%r12d - movl %ebx,(%rdi,%rcx,4) - andl %r8d,%r12d - addl 52(%r15),%r11d - addb %dl,%bl - movl 56(%rsi),%eax - addl $4254626195,%r11d - xorl %r10d,%r12d - movzbl %bl,%ebx - movl %edx,52(%rsi) - addl %r12d,%r11d - addb %al,%cl - roll $12,%r11d - movl %r9d,%r12d - pinsrw $6,(%rdi,%rbx,4),%xmm1 - - addl %r8d,%r11d - movl (%rdi,%rcx,4),%edx - xorl %r8d,%r12d - movl %eax,(%rdi,%rcx,4) - andl %r11d,%r12d - addl 56(%r15),%r10d - addb %dl,%al - movl 60(%rsi),%ebx - addl $2792965006,%r10d - xorl %r9d,%r12d - movzbl %al,%eax - movl %edx,56(%rsi) - addl %r12d,%r10d - addb %bl,%cl - roll $17,%r10d - movl %r8d,%r12d - pinsrw $7,(%rdi,%rax,4),%xmm0 - - addl %r11d,%r10d - movdqu (%r13),%xmm2 - movl (%rdi,%rcx,4),%edx - xorl %r11d,%r12d - movl %ebx,(%rdi,%rcx,4) - andl %r10d,%r12d - addl 60(%r15),%r9d - addb %dl,%bl - movl 64(%rsi),%eax - addl $1236535329,%r9d - xorl %r8d,%r12d - movzbl %bl,%ebx - movl %edx,60(%rsi) - addl %r12d,%r9d - addb %al,%cl - roll $22,%r9d - movl %r10d,%r12d - pinsrw $7,(%rdi,%rbx,4),%xmm1 - - addl %r10d,%r9d - psllq $8,%xmm1 - pxor %xmm0,%xmm2 - pxor %xmm1,%xmm2 - pxor %xmm0,%xmm0 - movl (%rdi,%rcx,4),%edx - xorl %r9d,%r12d - movl %eax,(%rdi,%rcx,4) - andl %r11d,%r12d - addl 4(%r15),%r8d - addb %dl,%al - movl 68(%rsi),%ebx - addl $4129170786,%r8d - xorl %r10d,%r12d - movzbl %al,%eax - movl %edx,64(%rsi) - addl %r12d,%r8d - addb %bl,%cl - roll $5,%r8d - movl %r9d,%r12d - movd (%rdi,%rax,4),%xmm0 - - addl %r9d,%r8d - pxor %xmm1,%xmm1 - movl (%rdi,%rcx,4),%edx - xorl %r8d,%r12d - movl %ebx,(%rdi,%rcx,4) - andl %r10d,%r12d - addl 24(%r15),%r11d - addb %dl,%bl - movl 72(%rsi),%eax - addl $3225465664,%r11d - xorl %r9d,%r12d - movzbl %bl,%ebx - movl %edx,68(%rsi) - addl %r12d,%r11d - addb %al,%cl - roll $9,%r11d - movl %r8d,%r12d - movd (%rdi,%rbx,4),%xmm1 - - addl %r8d,%r11d - movl (%rdi,%rcx,4),%edx - xorl %r11d,%r12d - movl %eax,(%rdi,%rcx,4) - andl %r9d,%r12d - addl 44(%r15),%r10d - addb %dl,%al - movl 76(%rsi),%ebx - addl $643717713,%r10d - xorl %r8d,%r12d - movzbl %al,%eax - movl %edx,72(%rsi) - addl %r12d,%r10d - addb %bl,%cl - roll $14,%r10d - movl %r11d,%r12d - pinsrw $1,(%rdi,%rax,4),%xmm0 - - addl %r11d,%r10d - movl (%rdi,%rcx,4),%edx - xorl %r10d,%r12d - movl %ebx,(%rdi,%rcx,4) - andl %r8d,%r12d - addl 0(%r15),%r9d - addb %dl,%bl - movl 80(%rsi),%eax - addl $3921069994,%r9d - xorl %r11d,%r12d - movzbl %bl,%ebx - movl %edx,76(%rsi) - addl %r12d,%r9d - addb %al,%cl - roll $20,%r9d - movl %r10d,%r12d - pinsrw $1,(%rdi,%rbx,4),%xmm1 - - addl %r10d,%r9d - movl (%rdi,%rcx,4),%edx - xorl %r9d,%r12d - movl %eax,(%rdi,%rcx,4) - andl %r11d,%r12d - addl 20(%r15),%r8d - addb %dl,%al - movl 84(%rsi),%ebx - addl $3593408605,%r8d - xorl %r10d,%r12d - movzbl %al,%eax - movl %edx,80(%rsi) - addl %r12d,%r8d - addb %bl,%cl - roll $5,%r8d - movl %r9d,%r12d - pinsrw $2,(%rdi,%rax,4),%xmm0 - - addl %r9d,%r8d - movl (%rdi,%rcx,4),%edx - xorl %r8d,%r12d - movl %ebx,(%rdi,%rcx,4) - andl %r10d,%r12d - addl 40(%r15),%r11d - addb %dl,%bl - movl 88(%rsi),%eax - addl $38016083,%r11d - xorl %r9d,%r12d - movzbl %bl,%ebx - movl %edx,84(%rsi) - addl %r12d,%r11d - addb %al,%cl - roll $9,%r11d - movl %r8d,%r12d - pinsrw $2,(%rdi,%rbx,4),%xmm1 - - addl %r8d,%r11d - movl (%rdi,%rcx,4),%edx - xorl %r11d,%r12d - movl %eax,(%rdi,%rcx,4) - andl %r9d,%r12d - addl 60(%r15),%r10d - addb %dl,%al - movl 92(%rsi),%ebx - addl $3634488961,%r10d - xorl %r8d,%r12d - movzbl %al,%eax - movl %edx,88(%rsi) - addl %r12d,%r10d - addb %bl,%cl - roll $14,%r10d - movl %r11d,%r12d - pinsrw 
$3,(%rdi,%rax,4),%xmm0 - - addl %r11d,%r10d - movl (%rdi,%rcx,4),%edx - xorl %r10d,%r12d - movl %ebx,(%rdi,%rcx,4) - andl %r8d,%r12d - addl 16(%r15),%r9d - addb %dl,%bl - movl 96(%rsi),%eax - addl $3889429448,%r9d - xorl %r11d,%r12d - movzbl %bl,%ebx - movl %edx,92(%rsi) - addl %r12d,%r9d - addb %al,%cl - roll $20,%r9d - movl %r10d,%r12d - pinsrw $3,(%rdi,%rbx,4),%xmm1 - - addl %r10d,%r9d - movl (%rdi,%rcx,4),%edx - xorl %r9d,%r12d - movl %eax,(%rdi,%rcx,4) - andl %r11d,%r12d - addl 36(%r15),%r8d - addb %dl,%al - movl 100(%rsi),%ebx - addl $568446438,%r8d - xorl %r10d,%r12d - movzbl %al,%eax - movl %edx,96(%rsi) - addl %r12d,%r8d - addb %bl,%cl - roll $5,%r8d - movl %r9d,%r12d - pinsrw $4,(%rdi,%rax,4),%xmm0 - - addl %r9d,%r8d - movl (%rdi,%rcx,4),%edx - xorl %r8d,%r12d - movl %ebx,(%rdi,%rcx,4) - andl %r10d,%r12d - addl 56(%r15),%r11d - addb %dl,%bl - movl 104(%rsi),%eax - addl $3275163606,%r11d - xorl %r9d,%r12d - movzbl %bl,%ebx - movl %edx,100(%rsi) - addl %r12d,%r11d - addb %al,%cl - roll $9,%r11d - movl %r8d,%r12d - pinsrw $4,(%rdi,%rbx,4),%xmm1 - - addl %r8d,%r11d - movl (%rdi,%rcx,4),%edx - xorl %r11d,%r12d - movl %eax,(%rdi,%rcx,4) - andl %r9d,%r12d - addl 12(%r15),%r10d - addb %dl,%al - movl 108(%rsi),%ebx - addl $4107603335,%r10d - xorl %r8d,%r12d - movzbl %al,%eax - movl %edx,104(%rsi) - addl %r12d,%r10d - addb %bl,%cl - roll $14,%r10d - movl %r11d,%r12d - pinsrw $5,(%rdi,%rax,4),%xmm0 - - addl %r11d,%r10d - movl (%rdi,%rcx,4),%edx - xorl %r10d,%r12d - movl %ebx,(%rdi,%rcx,4) - andl %r8d,%r12d - addl 32(%r15),%r9d - addb %dl,%bl - movl 112(%rsi),%eax - addl $1163531501,%r9d - xorl %r11d,%r12d - movzbl %bl,%ebx - movl %edx,108(%rsi) - addl %r12d,%r9d - addb %al,%cl - roll $20,%r9d - movl %r10d,%r12d - pinsrw $5,(%rdi,%rbx,4),%xmm1 - - addl %r10d,%r9d - movl (%rdi,%rcx,4),%edx - xorl %r9d,%r12d - movl %eax,(%rdi,%rcx,4) - andl %r11d,%r12d - addl 52(%r15),%r8d - addb %dl,%al - movl 116(%rsi),%ebx - addl $2850285829,%r8d - xorl %r10d,%r12d - movzbl %al,%eax - movl %edx,112(%rsi) - addl %r12d,%r8d - addb %bl,%cl - roll $5,%r8d - movl %r9d,%r12d - pinsrw $6,(%rdi,%rax,4),%xmm0 - - addl %r9d,%r8d - movl (%rdi,%rcx,4),%edx - xorl %r8d,%r12d - movl %ebx,(%rdi,%rcx,4) - andl %r10d,%r12d - addl 8(%r15),%r11d - addb %dl,%bl - movl 120(%rsi),%eax - addl $4243563512,%r11d - xorl %r9d,%r12d - movzbl %bl,%ebx - movl %edx,116(%rsi) - addl %r12d,%r11d - addb %al,%cl - roll $9,%r11d - movl %r8d,%r12d - pinsrw $6,(%rdi,%rbx,4),%xmm1 - - addl %r8d,%r11d - movl (%rdi,%rcx,4),%edx - xorl %r11d,%r12d - movl %eax,(%rdi,%rcx,4) - andl %r9d,%r12d - addl 28(%r15),%r10d - addb %dl,%al - movl 124(%rsi),%ebx - addl $1735328473,%r10d - xorl %r8d,%r12d - movzbl %al,%eax - movl %edx,120(%rsi) - addl %r12d,%r10d - addb %bl,%cl - roll $14,%r10d - movl %r11d,%r12d - pinsrw $7,(%rdi,%rax,4),%xmm0 - - addl %r11d,%r10d - movdqu 16(%r13),%xmm3 - addb $32,%bpl - movl (%rdi,%rcx,4),%edx - xorl %r10d,%r12d - movl %ebx,(%rdi,%rcx,4) - andl %r8d,%r12d - addl 48(%r15),%r9d - addb %dl,%bl - movl 0(%rdi,%rbp,4),%eax - addl $2368359562,%r9d - xorl %r11d,%r12d - movzbl %bl,%ebx - movl %edx,124(%rsi) - addl %r12d,%r9d - addb %al,%cl - roll $20,%r9d - movl %r11d,%r12d - pinsrw $7,(%rdi,%rbx,4),%xmm1 - - addl %r10d,%r9d - movq %rcx,%rsi - xorq %rcx,%rcx - movb %sil,%cl - leaq (%rdi,%rbp,4),%rsi - psllq $8,%xmm1 - pxor %xmm0,%xmm3 - pxor %xmm1,%xmm3 - pxor %xmm0,%xmm0 - movl (%rdi,%rcx,4),%edx - xorl %r10d,%r12d - movl %eax,(%rdi,%rcx,4) - xorl %r9d,%r12d - addl 20(%r15),%r8d - addb %dl,%al - movl 4(%rsi),%ebx - addl 
$4294588738,%r8d - movzbl %al,%eax - addl %r12d,%r8d - movl %edx,0(%rsi) - addb %bl,%cl - roll $4,%r8d - movl %r10d,%r12d - movd (%rdi,%rax,4),%xmm0 - - addl %r9d,%r8d - pxor %xmm1,%xmm1 - movl (%rdi,%rcx,4),%edx - xorl %r9d,%r12d - movl %ebx,(%rdi,%rcx,4) - xorl %r8d,%r12d - addl 32(%r15),%r11d - addb %dl,%bl - movl 8(%rsi),%eax - addl $2272392833,%r11d - movzbl %bl,%ebx - addl %r12d,%r11d - movl %edx,4(%rsi) - addb %al,%cl - roll $11,%r11d - movl %r9d,%r12d - movd (%rdi,%rbx,4),%xmm1 - - addl %r8d,%r11d - movl (%rdi,%rcx,4),%edx - xorl %r8d,%r12d - movl %eax,(%rdi,%rcx,4) - xorl %r11d,%r12d - addl 44(%r15),%r10d - addb %dl,%al - movl 12(%rsi),%ebx - addl $1839030562,%r10d - movzbl %al,%eax - addl %r12d,%r10d - movl %edx,8(%rsi) - addb %bl,%cl - roll $16,%r10d - movl %r8d,%r12d - pinsrw $1,(%rdi,%rax,4),%xmm0 - - addl %r11d,%r10d - movl (%rdi,%rcx,4),%edx - xorl %r11d,%r12d - movl %ebx,(%rdi,%rcx,4) - xorl %r10d,%r12d - addl 56(%r15),%r9d - addb %dl,%bl - movl 16(%rsi),%eax - addl $4259657740,%r9d - movzbl %bl,%ebx - addl %r12d,%r9d - movl %edx,12(%rsi) - addb %al,%cl - roll $23,%r9d - movl %r11d,%r12d - pinsrw $1,(%rdi,%rbx,4),%xmm1 - - addl %r10d,%r9d - movl (%rdi,%rcx,4),%edx - xorl %r10d,%r12d - movl %eax,(%rdi,%rcx,4) - xorl %r9d,%r12d - addl 4(%r15),%r8d - addb %dl,%al - movl 20(%rsi),%ebx - addl $2763975236,%r8d - movzbl %al,%eax - addl %r12d,%r8d - movl %edx,16(%rsi) - addb %bl,%cl - roll $4,%r8d - movl %r10d,%r12d - pinsrw $2,(%rdi,%rax,4),%xmm0 - - addl %r9d,%r8d - movl (%rdi,%rcx,4),%edx - xorl %r9d,%r12d - movl %ebx,(%rdi,%rcx,4) - xorl %r8d,%r12d - addl 16(%r15),%r11d - addb %dl,%bl - movl 24(%rsi),%eax - addl $1272893353,%r11d - movzbl %bl,%ebx - addl %r12d,%r11d - movl %edx,20(%rsi) - addb %al,%cl - roll $11,%r11d - movl %r9d,%r12d - pinsrw $2,(%rdi,%rbx,4),%xmm1 - - addl %r8d,%r11d - movl (%rdi,%rcx,4),%edx - xorl %r8d,%r12d - movl %eax,(%rdi,%rcx,4) - xorl %r11d,%r12d - addl 28(%r15),%r10d - addb %dl,%al - movl 28(%rsi),%ebx - addl $4139469664,%r10d - movzbl %al,%eax - addl %r12d,%r10d - movl %edx,24(%rsi) - addb %bl,%cl - roll $16,%r10d - movl %r8d,%r12d - pinsrw $3,(%rdi,%rax,4),%xmm0 - - addl %r11d,%r10d - movl (%rdi,%rcx,4),%edx - xorl %r11d,%r12d - movl %ebx,(%rdi,%rcx,4) - xorl %r10d,%r12d - addl 40(%r15),%r9d - addb %dl,%bl - movl 32(%rsi),%eax - addl $3200236656,%r9d - movzbl %bl,%ebx - addl %r12d,%r9d - movl %edx,28(%rsi) - addb %al,%cl - roll $23,%r9d - movl %r11d,%r12d - pinsrw $3,(%rdi,%rbx,4),%xmm1 - - addl %r10d,%r9d - movl (%rdi,%rcx,4),%edx - xorl %r10d,%r12d - movl %eax,(%rdi,%rcx,4) - xorl %r9d,%r12d - addl 52(%r15),%r8d - addb %dl,%al - movl 36(%rsi),%ebx - addl $681279174,%r8d - movzbl %al,%eax - addl %r12d,%r8d - movl %edx,32(%rsi) - addb %bl,%cl - roll $4,%r8d - movl %r10d,%r12d - pinsrw $4,(%rdi,%rax,4),%xmm0 - - addl %r9d,%r8d - movl (%rdi,%rcx,4),%edx - xorl %r9d,%r12d - movl %ebx,(%rdi,%rcx,4) - xorl %r8d,%r12d - addl 0(%r15),%r11d - addb %dl,%bl - movl 40(%rsi),%eax - addl $3936430074,%r11d - movzbl %bl,%ebx - addl %r12d,%r11d - movl %edx,36(%rsi) - addb %al,%cl - roll $11,%r11d - movl %r9d,%r12d - pinsrw $4,(%rdi,%rbx,4),%xmm1 - - addl %r8d,%r11d - movl (%rdi,%rcx,4),%edx - xorl %r8d,%r12d - movl %eax,(%rdi,%rcx,4) - xorl %r11d,%r12d - addl 12(%r15),%r10d - addb %dl,%al - movl 44(%rsi),%ebx - addl $3572445317,%r10d - movzbl %al,%eax - addl %r12d,%r10d - movl %edx,40(%rsi) - addb %bl,%cl - roll $16,%r10d - movl %r8d,%r12d - pinsrw $5,(%rdi,%rax,4),%xmm0 - - addl %r11d,%r10d - movl (%rdi,%rcx,4),%edx - xorl %r11d,%r12d - movl %ebx,(%rdi,%rcx,4) - 
xorl %r10d,%r12d - addl 24(%r15),%r9d - addb %dl,%bl - movl 48(%rsi),%eax - addl $76029189,%r9d - movzbl %bl,%ebx - addl %r12d,%r9d - movl %edx,44(%rsi) - addb %al,%cl - roll $23,%r9d - movl %r11d,%r12d - pinsrw $5,(%rdi,%rbx,4),%xmm1 - - addl %r10d,%r9d - movl (%rdi,%rcx,4),%edx - xorl %r10d,%r12d - movl %eax,(%rdi,%rcx,4) - xorl %r9d,%r12d - addl 36(%r15),%r8d - addb %dl,%al - movl 52(%rsi),%ebx - addl $3654602809,%r8d - movzbl %al,%eax - addl %r12d,%r8d - movl %edx,48(%rsi) - addb %bl,%cl - roll $4,%r8d - movl %r10d,%r12d - pinsrw $6,(%rdi,%rax,4),%xmm0 - - addl %r9d,%r8d - movl (%rdi,%rcx,4),%edx - xorl %r9d,%r12d - movl %ebx,(%rdi,%rcx,4) - xorl %r8d,%r12d - addl 48(%r15),%r11d - addb %dl,%bl - movl 56(%rsi),%eax - addl $3873151461,%r11d - movzbl %bl,%ebx - addl %r12d,%r11d - movl %edx,52(%rsi) - addb %al,%cl - roll $11,%r11d - movl %r9d,%r12d - pinsrw $6,(%rdi,%rbx,4),%xmm1 - - addl %r8d,%r11d - movl (%rdi,%rcx,4),%edx - xorl %r8d,%r12d - movl %eax,(%rdi,%rcx,4) - xorl %r11d,%r12d - addl 60(%r15),%r10d - addb %dl,%al - movl 60(%rsi),%ebx - addl $530742520,%r10d - movzbl %al,%eax - addl %r12d,%r10d - movl %edx,56(%rsi) - addb %bl,%cl - roll $16,%r10d - movl %r8d,%r12d - pinsrw $7,(%rdi,%rax,4),%xmm0 - - addl %r11d,%r10d - movdqu 32(%r13),%xmm4 - movl (%rdi,%rcx,4),%edx - xorl %r11d,%r12d - movl %ebx,(%rdi,%rcx,4) - xorl %r10d,%r12d - addl 8(%r15),%r9d - addb %dl,%bl - movl 64(%rsi),%eax - addl $3299628645,%r9d - movzbl %bl,%ebx - addl %r12d,%r9d - movl %edx,60(%rsi) - addb %al,%cl - roll $23,%r9d - movl $-1,%r12d - pinsrw $7,(%rdi,%rbx,4),%xmm1 - - addl %r10d,%r9d - psllq $8,%xmm1 - pxor %xmm0,%xmm4 - pxor %xmm1,%xmm4 - pxor %xmm0,%xmm0 - movl (%rdi,%rcx,4),%edx - xorl %r11d,%r12d - movl %eax,(%rdi,%rcx,4) - orl %r9d,%r12d - addl 0(%r15),%r8d - addb %dl,%al - movl 68(%rsi),%ebx - addl $4096336452,%r8d - movzbl %al,%eax - xorl %r10d,%r12d - movl %edx,64(%rsi) - addl %r12d,%r8d - addb %bl,%cl - roll $6,%r8d - movl $-1,%r12d - movd (%rdi,%rax,4),%xmm0 - - addl %r9d,%r8d - pxor %xmm1,%xmm1 - movl (%rdi,%rcx,4),%edx - xorl %r10d,%r12d - movl %ebx,(%rdi,%rcx,4) - orl %r8d,%r12d - addl 28(%r15),%r11d - addb %dl,%bl - movl 72(%rsi),%eax - addl $1126891415,%r11d - movzbl %bl,%ebx - xorl %r9d,%r12d - movl %edx,68(%rsi) - addl %r12d,%r11d - addb %al,%cl - roll $10,%r11d - movl $-1,%r12d - movd (%rdi,%rbx,4),%xmm1 - - addl %r8d,%r11d - movl (%rdi,%rcx,4),%edx - xorl %r9d,%r12d - movl %eax,(%rdi,%rcx,4) - orl %r11d,%r12d - addl 56(%r15),%r10d - addb %dl,%al - movl 76(%rsi),%ebx - addl $2878612391,%r10d - movzbl %al,%eax - xorl %r8d,%r12d - movl %edx,72(%rsi) - addl %r12d,%r10d - addb %bl,%cl - roll $15,%r10d - movl $-1,%r12d - pinsrw $1,(%rdi,%rax,4),%xmm0 - - addl %r11d,%r10d - movl (%rdi,%rcx,4),%edx - xorl %r8d,%r12d - movl %ebx,(%rdi,%rcx,4) - orl %r10d,%r12d - addl 20(%r15),%r9d - addb %dl,%bl - movl 80(%rsi),%eax - addl $4237533241,%r9d - movzbl %bl,%ebx - xorl %r11d,%r12d - movl %edx,76(%rsi) - addl %r12d,%r9d - addb %al,%cl - roll $21,%r9d - movl $-1,%r12d - pinsrw $1,(%rdi,%rbx,4),%xmm1 - - addl %r10d,%r9d - movl (%rdi,%rcx,4),%edx - xorl %r11d,%r12d - movl %eax,(%rdi,%rcx,4) - orl %r9d,%r12d - addl 48(%r15),%r8d - addb %dl,%al - movl 84(%rsi),%ebx - addl $1700485571,%r8d - movzbl %al,%eax - xorl %r10d,%r12d - movl %edx,80(%rsi) - addl %r12d,%r8d - addb %bl,%cl - roll $6,%r8d - movl $-1,%r12d - pinsrw $2,(%rdi,%rax,4),%xmm0 - - addl %r9d,%r8d - movl (%rdi,%rcx,4),%edx - xorl %r10d,%r12d - movl %ebx,(%rdi,%rcx,4) - orl %r8d,%r12d - addl 12(%r15),%r11d - addb %dl,%bl - movl 88(%rsi),%eax - 
addl $2399980690,%r11d - movzbl %bl,%ebx - xorl %r9d,%r12d - movl %edx,84(%rsi) - addl %r12d,%r11d - addb %al,%cl - roll $10,%r11d - movl $-1,%r12d - pinsrw $2,(%rdi,%rbx,4),%xmm1 - - addl %r8d,%r11d - movl (%rdi,%rcx,4),%edx - xorl %r9d,%r12d - movl %eax,(%rdi,%rcx,4) - orl %r11d,%r12d - addl 40(%r15),%r10d - addb %dl,%al - movl 92(%rsi),%ebx - addl $4293915773,%r10d - movzbl %al,%eax - xorl %r8d,%r12d - movl %edx,88(%rsi) - addl %r12d,%r10d - addb %bl,%cl - roll $15,%r10d - movl $-1,%r12d - pinsrw $3,(%rdi,%rax,4),%xmm0 - - addl %r11d,%r10d - movl (%rdi,%rcx,4),%edx - xorl %r8d,%r12d - movl %ebx,(%rdi,%rcx,4) - orl %r10d,%r12d - addl 4(%r15),%r9d - addb %dl,%bl - movl 96(%rsi),%eax - addl $2240044497,%r9d - movzbl %bl,%ebx - xorl %r11d,%r12d - movl %edx,92(%rsi) - addl %r12d,%r9d - addb %al,%cl - roll $21,%r9d - movl $-1,%r12d - pinsrw $3,(%rdi,%rbx,4),%xmm1 - - addl %r10d,%r9d - movl (%rdi,%rcx,4),%edx - xorl %r11d,%r12d - movl %eax,(%rdi,%rcx,4) - orl %r9d,%r12d - addl 32(%r15),%r8d - addb %dl,%al - movl 100(%rsi),%ebx - addl $1873313359,%r8d - movzbl %al,%eax - xorl %r10d,%r12d - movl %edx,96(%rsi) - addl %r12d,%r8d - addb %bl,%cl - roll $6,%r8d - movl $-1,%r12d - pinsrw $4,(%rdi,%rax,4),%xmm0 - - addl %r9d,%r8d - movl (%rdi,%rcx,4),%edx - xorl %r10d,%r12d - movl %ebx,(%rdi,%rcx,4) - orl %r8d,%r12d - addl 60(%r15),%r11d - addb %dl,%bl - movl 104(%rsi),%eax - addl $4264355552,%r11d - movzbl %bl,%ebx - xorl %r9d,%r12d - movl %edx,100(%rsi) - addl %r12d,%r11d - addb %al,%cl - roll $10,%r11d - movl $-1,%r12d - pinsrw $4,(%rdi,%rbx,4),%xmm1 - - addl %r8d,%r11d - movl (%rdi,%rcx,4),%edx - xorl %r9d,%r12d - movl %eax,(%rdi,%rcx,4) - orl %r11d,%r12d - addl 24(%r15),%r10d - addb %dl,%al - movl 108(%rsi),%ebx - addl $2734768916,%r10d - movzbl %al,%eax - xorl %r8d,%r12d - movl %edx,104(%rsi) - addl %r12d,%r10d - addb %bl,%cl - roll $15,%r10d - movl $-1,%r12d - pinsrw $5,(%rdi,%rax,4),%xmm0 - - addl %r11d,%r10d - movl (%rdi,%rcx,4),%edx - xorl %r8d,%r12d - movl %ebx,(%rdi,%rcx,4) - orl %r10d,%r12d - addl 52(%r15),%r9d - addb %dl,%bl - movl 112(%rsi),%eax - addl $1309151649,%r9d - movzbl %bl,%ebx - xorl %r11d,%r12d - movl %edx,108(%rsi) - addl %r12d,%r9d - addb %al,%cl - roll $21,%r9d - movl $-1,%r12d - pinsrw $5,(%rdi,%rbx,4),%xmm1 - - addl %r10d,%r9d - movl (%rdi,%rcx,4),%edx - xorl %r11d,%r12d - movl %eax,(%rdi,%rcx,4) - orl %r9d,%r12d - addl 16(%r15),%r8d - addb %dl,%al - movl 116(%rsi),%ebx - addl $4149444226,%r8d - movzbl %al,%eax - xorl %r10d,%r12d - movl %edx,112(%rsi) - addl %r12d,%r8d - addb %bl,%cl - roll $6,%r8d - movl $-1,%r12d - pinsrw $6,(%rdi,%rax,4),%xmm0 - - addl %r9d,%r8d - movl (%rdi,%rcx,4),%edx - xorl %r10d,%r12d - movl %ebx,(%rdi,%rcx,4) - orl %r8d,%r12d - addl 44(%r15),%r11d - addb %dl,%bl - movl 120(%rsi),%eax - addl $3174756917,%r11d - movzbl %bl,%ebx - xorl %r9d,%r12d - movl %edx,116(%rsi) - addl %r12d,%r11d - addb %al,%cl - roll $10,%r11d - movl $-1,%r12d - pinsrw $6,(%rdi,%rbx,4),%xmm1 - - addl %r8d,%r11d - movl (%rdi,%rcx,4),%edx - xorl %r9d,%r12d - movl %eax,(%rdi,%rcx,4) - orl %r11d,%r12d - addl 8(%r15),%r10d - addb %dl,%al - movl 124(%rsi),%ebx - addl $718787259,%r10d - movzbl %al,%eax - xorl %r8d,%r12d - movl %edx,120(%rsi) - addl %r12d,%r10d - addb %bl,%cl - roll $15,%r10d - movl $-1,%r12d - pinsrw $7,(%rdi,%rax,4),%xmm0 - - addl %r11d,%r10d - movdqu 48(%r13),%xmm5 - addb $32,%bpl - movl (%rdi,%rcx,4),%edx - xorl %r8d,%r12d - movl %ebx,(%rdi,%rcx,4) - orl %r10d,%r12d - addl 36(%r15),%r9d - addb %dl,%bl - movl 0(%rdi,%rbp,4),%eax - addl $3951481745,%r9d - movzbl 
%bl,%ebx - xorl %r11d,%r12d - movl %edx,124(%rsi) - addl %r12d,%r9d - addb %al,%cl - roll $21,%r9d - movl $-1,%r12d - pinsrw $7,(%rdi,%rbx,4),%xmm1 - - addl %r10d,%r9d - movq %rbp,%rsi - xorq %rbp,%rbp - movb %sil,%bpl - movq %rcx,%rsi - xorq %rcx,%rcx - movb %sil,%cl - leaq (%rdi,%rbp,4),%rsi - psllq $8,%xmm1 - pxor %xmm0,%xmm5 - pxor %xmm1,%xmm5 - addl 0(%rsp),%r8d - addl 4(%rsp),%r9d - addl 8(%rsp),%r10d - addl 12(%rsp),%r11d - - movdqu %xmm2,(%r14,%r13,1) - movdqu %xmm3,16(%r14,%r13,1) - movdqu %xmm4,32(%r14,%r13,1) - movdqu %xmm5,48(%r14,%r13,1) - leaq 64(%r15),%r15 - leaq 64(%r13),%r13 - cmpq 16(%rsp),%r15 - jb L$oop - - movq 24(%rsp),%r12 - subb %al,%cl - movl %r8d,0(%r12) - movl %r9d,4(%r12) - movl %r10d,8(%r12) - movl %r11d,12(%r12) - subb $1,%bpl - movl %ebp,-8(%rdi) - movl %ecx,-4(%rdi) - - movq 40(%rsp),%r15 - movq 48(%rsp),%r14 - movq 56(%rsp),%r13 - movq 64(%rsp),%r12 - movq 72(%rsp),%rbp - movq 80(%rsp),%rbx - leaq 88(%rsp),%rsp -L$epilogue: -L$abort: - .byte 0xf3,0xc3 - -#endif diff --git a/packager/third_party/boringssl/mac-x86_64/crypto/rc4/rc4-x86_64.S b/packager/third_party/boringssl/mac-x86_64/crypto/rc4/rc4-x86_64.S deleted file mode 100644 index 780818476c..0000000000 --- a/packager/third_party/boringssl/mac-x86_64/crypto/rc4/rc4-x86_64.S +++ /dev/null @@ -1,595 +0,0 @@ -#if defined(__x86_64__) -.text - - -.globl _asm_RC4 -.private_extern _asm_RC4 - -.p2align 4 -_asm_RC4: - orq %rsi,%rsi - jne L$entry - .byte 0xf3,0xc3 -L$entry: - pushq %rbx - pushq %r12 - pushq %r13 -L$prologue: - movq %rsi,%r11 - movq %rdx,%r12 - movq %rcx,%r13 - xorq %r10,%r10 - xorq %rcx,%rcx - - leaq 8(%rdi),%rdi - movb -8(%rdi),%r10b - movb -4(%rdi),%cl - cmpl $-1,256(%rdi) - je L$RC4_CHAR - movl _OPENSSL_ia32cap_P(%rip),%r8d - xorq %rbx,%rbx - incb %r10b - subq %r10,%rbx - subq %r12,%r13 - movl (%rdi,%r10,4),%eax - testq $-16,%r11 - jz L$loop1 - btl $30,%r8d - jc L$intel - andq $7,%rbx - leaq 1(%r10),%rsi - jz L$oop8 - subq %rbx,%r11 -L$oop8_warmup: - addb %al,%cl - movl (%rdi,%rcx,4),%edx - movl %eax,(%rdi,%rcx,4) - movl %edx,(%rdi,%r10,4) - addb %dl,%al - incb %r10b - movl (%rdi,%rax,4),%edx - movl (%rdi,%r10,4),%eax - xorb (%r12),%dl - movb %dl,(%r12,%r13,1) - leaq 1(%r12),%r12 - decq %rbx - jnz L$oop8_warmup - - leaq 1(%r10),%rsi - jmp L$oop8 -.p2align 4 -L$oop8: - addb %al,%cl - movl (%rdi,%rcx,4),%edx - movl %eax,(%rdi,%rcx,4) - movl 0(%rdi,%rsi,4),%ebx - rorq $8,%r8 - movl %edx,0(%rdi,%r10,4) - addb %al,%dl - movb (%rdi,%rdx,4),%r8b - addb %bl,%cl - movl (%rdi,%rcx,4),%edx - movl %ebx,(%rdi,%rcx,4) - movl 4(%rdi,%rsi,4),%eax - rorq $8,%r8 - movl %edx,4(%rdi,%r10,4) - addb %bl,%dl - movb (%rdi,%rdx,4),%r8b - addb %al,%cl - movl (%rdi,%rcx,4),%edx - movl %eax,(%rdi,%rcx,4) - movl 8(%rdi,%rsi,4),%ebx - rorq $8,%r8 - movl %edx,8(%rdi,%r10,4) - addb %al,%dl - movb (%rdi,%rdx,4),%r8b - addb %bl,%cl - movl (%rdi,%rcx,4),%edx - movl %ebx,(%rdi,%rcx,4) - movl 12(%rdi,%rsi,4),%eax - rorq $8,%r8 - movl %edx,12(%rdi,%r10,4) - addb %bl,%dl - movb (%rdi,%rdx,4),%r8b - addb %al,%cl - movl (%rdi,%rcx,4),%edx - movl %eax,(%rdi,%rcx,4) - movl 16(%rdi,%rsi,4),%ebx - rorq $8,%r8 - movl %edx,16(%rdi,%r10,4) - addb %al,%dl - movb (%rdi,%rdx,4),%r8b - addb %bl,%cl - movl (%rdi,%rcx,4),%edx - movl %ebx,(%rdi,%rcx,4) - movl 20(%rdi,%rsi,4),%eax - rorq $8,%r8 - movl %edx,20(%rdi,%r10,4) - addb %bl,%dl - movb (%rdi,%rdx,4),%r8b - addb %al,%cl - movl (%rdi,%rcx,4),%edx - movl %eax,(%rdi,%rcx,4) - movl 24(%rdi,%rsi,4),%ebx - rorq $8,%r8 - movl %edx,24(%rdi,%r10,4) - addb %al,%dl - movb (%rdi,%rdx,4),%r8b - addb 
$8,%sil - addb %bl,%cl - movl (%rdi,%rcx,4),%edx - movl %ebx,(%rdi,%rcx,4) - movl -4(%rdi,%rsi,4),%eax - rorq $8,%r8 - movl %edx,28(%rdi,%r10,4) - addb %bl,%dl - movb (%rdi,%rdx,4),%r8b - addb $8,%r10b - rorq $8,%r8 - subq $8,%r11 - - xorq (%r12),%r8 - movq %r8,(%r12,%r13,1) - leaq 8(%r12),%r12 - - testq $-8,%r11 - jnz L$oop8 - cmpq $0,%r11 - jne L$loop1 - jmp L$exit - -.p2align 4 -L$intel: - testq $-32,%r11 - jz L$loop1 - andq $15,%rbx - jz L$oop16_is_hot - subq %rbx,%r11 -L$oop16_warmup: - addb %al,%cl - movl (%rdi,%rcx,4),%edx - movl %eax,(%rdi,%rcx,4) - movl %edx,(%rdi,%r10,4) - addb %dl,%al - incb %r10b - movl (%rdi,%rax,4),%edx - movl (%rdi,%r10,4),%eax - xorb (%r12),%dl - movb %dl,(%r12,%r13,1) - leaq 1(%r12),%r12 - decq %rbx - jnz L$oop16_warmup - - movq %rcx,%rbx - xorq %rcx,%rcx - movb %bl,%cl - -L$oop16_is_hot: - leaq (%rdi,%r10,4),%rsi - addb %al,%cl - movl (%rdi,%rcx,4),%edx - pxor %xmm0,%xmm0 - movl %eax,(%rdi,%rcx,4) - addb %dl,%al - movl 4(%rsi),%ebx - movzbl %al,%eax - movl %edx,0(%rsi) - addb %bl,%cl - pinsrw $0,(%rdi,%rax,4),%xmm0 - jmp L$oop16_enter -.p2align 4 -L$oop16: - addb %al,%cl - movl (%rdi,%rcx,4),%edx - pxor %xmm0,%xmm2 - psllq $8,%xmm1 - pxor %xmm0,%xmm0 - movl %eax,(%rdi,%rcx,4) - addb %dl,%al - movl 4(%rsi),%ebx - movzbl %al,%eax - movl %edx,0(%rsi) - pxor %xmm1,%xmm2 - addb %bl,%cl - pinsrw $0,(%rdi,%rax,4),%xmm0 - movdqu %xmm2,(%r12,%r13,1) - leaq 16(%r12),%r12 -L$oop16_enter: - movl (%rdi,%rcx,4),%edx - pxor %xmm1,%xmm1 - movl %ebx,(%rdi,%rcx,4) - addb %dl,%bl - movl 8(%rsi),%eax - movzbl %bl,%ebx - movl %edx,4(%rsi) - addb %al,%cl - pinsrw $0,(%rdi,%rbx,4),%xmm1 - movl (%rdi,%rcx,4),%edx - movl %eax,(%rdi,%rcx,4) - addb %dl,%al - movl 12(%rsi),%ebx - movzbl %al,%eax - movl %edx,8(%rsi) - addb %bl,%cl - pinsrw $1,(%rdi,%rax,4),%xmm0 - movl (%rdi,%rcx,4),%edx - movl %ebx,(%rdi,%rcx,4) - addb %dl,%bl - movl 16(%rsi),%eax - movzbl %bl,%ebx - movl %edx,12(%rsi) - addb %al,%cl - pinsrw $1,(%rdi,%rbx,4),%xmm1 - movl (%rdi,%rcx,4),%edx - movl %eax,(%rdi,%rcx,4) - addb %dl,%al - movl 20(%rsi),%ebx - movzbl %al,%eax - movl %edx,16(%rsi) - addb %bl,%cl - pinsrw $2,(%rdi,%rax,4),%xmm0 - movl (%rdi,%rcx,4),%edx - movl %ebx,(%rdi,%rcx,4) - addb %dl,%bl - movl 24(%rsi),%eax - movzbl %bl,%ebx - movl %edx,20(%rsi) - addb %al,%cl - pinsrw $2,(%rdi,%rbx,4),%xmm1 - movl (%rdi,%rcx,4),%edx - movl %eax,(%rdi,%rcx,4) - addb %dl,%al - movl 28(%rsi),%ebx - movzbl %al,%eax - movl %edx,24(%rsi) - addb %bl,%cl - pinsrw $3,(%rdi,%rax,4),%xmm0 - movl (%rdi,%rcx,4),%edx - movl %ebx,(%rdi,%rcx,4) - addb %dl,%bl - movl 32(%rsi),%eax - movzbl %bl,%ebx - movl %edx,28(%rsi) - addb %al,%cl - pinsrw $3,(%rdi,%rbx,4),%xmm1 - movl (%rdi,%rcx,4),%edx - movl %eax,(%rdi,%rcx,4) - addb %dl,%al - movl 36(%rsi),%ebx - movzbl %al,%eax - movl %edx,32(%rsi) - addb %bl,%cl - pinsrw $4,(%rdi,%rax,4),%xmm0 - movl (%rdi,%rcx,4),%edx - movl %ebx,(%rdi,%rcx,4) - addb %dl,%bl - movl 40(%rsi),%eax - movzbl %bl,%ebx - movl %edx,36(%rsi) - addb %al,%cl - pinsrw $4,(%rdi,%rbx,4),%xmm1 - movl (%rdi,%rcx,4),%edx - movl %eax,(%rdi,%rcx,4) - addb %dl,%al - movl 44(%rsi),%ebx - movzbl %al,%eax - movl %edx,40(%rsi) - addb %bl,%cl - pinsrw $5,(%rdi,%rax,4),%xmm0 - movl (%rdi,%rcx,4),%edx - movl %ebx,(%rdi,%rcx,4) - addb %dl,%bl - movl 48(%rsi),%eax - movzbl %bl,%ebx - movl %edx,44(%rsi) - addb %al,%cl - pinsrw $5,(%rdi,%rbx,4),%xmm1 - movl (%rdi,%rcx,4),%edx - movl %eax,(%rdi,%rcx,4) - addb %dl,%al - movl 52(%rsi),%ebx - movzbl %al,%eax - movl %edx,48(%rsi) - addb %bl,%cl - pinsrw $6,(%rdi,%rax,4),%xmm0 - movl 
(%rdi,%rcx,4),%edx - movl %ebx,(%rdi,%rcx,4) - addb %dl,%bl - movl 56(%rsi),%eax - movzbl %bl,%ebx - movl %edx,52(%rsi) - addb %al,%cl - pinsrw $6,(%rdi,%rbx,4),%xmm1 - movl (%rdi,%rcx,4),%edx - movl %eax,(%rdi,%rcx,4) - addb %dl,%al - movl 60(%rsi),%ebx - movzbl %al,%eax - movl %edx,56(%rsi) - addb %bl,%cl - pinsrw $7,(%rdi,%rax,4),%xmm0 - addb $16,%r10b - movdqu (%r12),%xmm2 - movl (%rdi,%rcx,4),%edx - movl %ebx,(%rdi,%rcx,4) - addb %dl,%bl - movzbl %bl,%ebx - movl %edx,60(%rsi) - leaq (%rdi,%r10,4),%rsi - pinsrw $7,(%rdi,%rbx,4),%xmm1 - movl (%rsi),%eax - movq %rcx,%rbx - xorq %rcx,%rcx - subq $16,%r11 - movb %bl,%cl - testq $-16,%r11 - jnz L$oop16 - - psllq $8,%xmm1 - pxor %xmm0,%xmm2 - pxor %xmm1,%xmm2 - movdqu %xmm2,(%r12,%r13,1) - leaq 16(%r12),%r12 - - cmpq $0,%r11 - jne L$loop1 - jmp L$exit - -.p2align 4 -L$loop1: - addb %al,%cl - movl (%rdi,%rcx,4),%edx - movl %eax,(%rdi,%rcx,4) - movl %edx,(%rdi,%r10,4) - addb %dl,%al - incb %r10b - movl (%rdi,%rax,4),%edx - movl (%rdi,%r10,4),%eax - xorb (%r12),%dl - movb %dl,(%r12,%r13,1) - leaq 1(%r12),%r12 - decq %r11 - jnz L$loop1 - jmp L$exit - -.p2align 4 -L$RC4_CHAR: - addb $1,%r10b - movzbl (%rdi,%r10,1),%eax - testq $-8,%r11 - jz L$cloop1 - jmp L$cloop8 -.p2align 4 -L$cloop8: - movl (%r12),%r8d - movl 4(%r12),%r9d - addb %al,%cl - leaq 1(%r10),%rsi - movzbl (%rdi,%rcx,1),%edx - movzbl %sil,%esi - movzbl (%rdi,%rsi,1),%ebx - movb %al,(%rdi,%rcx,1) - cmpq %rsi,%rcx - movb %dl,(%rdi,%r10,1) - jne L$cmov0 - movq %rax,%rbx -L$cmov0: - addb %al,%dl - xorb (%rdi,%rdx,1),%r8b - rorl $8,%r8d - addb %bl,%cl - leaq 1(%rsi),%r10 - movzbl (%rdi,%rcx,1),%edx - movzbl %r10b,%r10d - movzbl (%rdi,%r10,1),%eax - movb %bl,(%rdi,%rcx,1) - cmpq %r10,%rcx - movb %dl,(%rdi,%rsi,1) - jne L$cmov1 - movq %rbx,%rax -L$cmov1: - addb %bl,%dl - xorb (%rdi,%rdx,1),%r8b - rorl $8,%r8d - addb %al,%cl - leaq 1(%r10),%rsi - movzbl (%rdi,%rcx,1),%edx - movzbl %sil,%esi - movzbl (%rdi,%rsi,1),%ebx - movb %al,(%rdi,%rcx,1) - cmpq %rsi,%rcx - movb %dl,(%rdi,%r10,1) - jne L$cmov2 - movq %rax,%rbx -L$cmov2: - addb %al,%dl - xorb (%rdi,%rdx,1),%r8b - rorl $8,%r8d - addb %bl,%cl - leaq 1(%rsi),%r10 - movzbl (%rdi,%rcx,1),%edx - movzbl %r10b,%r10d - movzbl (%rdi,%r10,1),%eax - movb %bl,(%rdi,%rcx,1) - cmpq %r10,%rcx - movb %dl,(%rdi,%rsi,1) - jne L$cmov3 - movq %rbx,%rax -L$cmov3: - addb %bl,%dl - xorb (%rdi,%rdx,1),%r8b - rorl $8,%r8d - addb %al,%cl - leaq 1(%r10),%rsi - movzbl (%rdi,%rcx,1),%edx - movzbl %sil,%esi - movzbl (%rdi,%rsi,1),%ebx - movb %al,(%rdi,%rcx,1) - cmpq %rsi,%rcx - movb %dl,(%rdi,%r10,1) - jne L$cmov4 - movq %rax,%rbx -L$cmov4: - addb %al,%dl - xorb (%rdi,%rdx,1),%r9b - rorl $8,%r9d - addb %bl,%cl - leaq 1(%rsi),%r10 - movzbl (%rdi,%rcx,1),%edx - movzbl %r10b,%r10d - movzbl (%rdi,%r10,1),%eax - movb %bl,(%rdi,%rcx,1) - cmpq %r10,%rcx - movb %dl,(%rdi,%rsi,1) - jne L$cmov5 - movq %rbx,%rax -L$cmov5: - addb %bl,%dl - xorb (%rdi,%rdx,1),%r9b - rorl $8,%r9d - addb %al,%cl - leaq 1(%r10),%rsi - movzbl (%rdi,%rcx,1),%edx - movzbl %sil,%esi - movzbl (%rdi,%rsi,1),%ebx - movb %al,(%rdi,%rcx,1) - cmpq %rsi,%rcx - movb %dl,(%rdi,%r10,1) - jne L$cmov6 - movq %rax,%rbx -L$cmov6: - addb %al,%dl - xorb (%rdi,%rdx,1),%r9b - rorl $8,%r9d - addb %bl,%cl - leaq 1(%rsi),%r10 - movzbl (%rdi,%rcx,1),%edx - movzbl %r10b,%r10d - movzbl (%rdi,%r10,1),%eax - movb %bl,(%rdi,%rcx,1) - cmpq %r10,%rcx - movb %dl,(%rdi,%rsi,1) - jne L$cmov7 - movq %rbx,%rax -L$cmov7: - addb %bl,%dl - xorb (%rdi,%rdx,1),%r9b - rorl $8,%r9d - leaq -8(%r11),%r11 - movl %r8d,(%r13) - leaq 8(%r12),%r12 - 
movl %r9d,4(%r13) - leaq 8(%r13),%r13 - - testq $-8,%r11 - jnz L$cloop8 - cmpq $0,%r11 - jne L$cloop1 - jmp L$exit -.p2align 4 -L$cloop1: - addb %al,%cl - movzbl %cl,%ecx - movzbl (%rdi,%rcx,1),%edx - movb %al,(%rdi,%rcx,1) - movb %dl,(%rdi,%r10,1) - addb %al,%dl - addb $1,%r10b - movzbl %dl,%edx - movzbl %r10b,%r10d - movzbl (%rdi,%rdx,1),%edx - movzbl (%rdi,%r10,1),%eax - xorb (%r12),%dl - leaq 1(%r12),%r12 - movb %dl,(%r13) - leaq 1(%r13),%r13 - subq $1,%r11 - jnz L$cloop1 - jmp L$exit - -.p2align 4 -L$exit: - subb $1,%r10b - movl %r10d,-8(%rdi) - movl %ecx,-4(%rdi) - - movq (%rsp),%r13 - movq 8(%rsp),%r12 - movq 16(%rsp),%rbx - addq $24,%rsp -L$epilogue: - .byte 0xf3,0xc3 - -.globl _asm_RC4_set_key -.private_extern _asm_RC4_set_key - -.p2align 4 -_asm_RC4_set_key: - leaq 8(%rdi),%rdi - leaq (%rdx,%rsi,1),%rdx - negq %rsi - movq %rsi,%rcx - xorl %eax,%eax - xorq %r9,%r9 - xorq %r10,%r10 - xorq %r11,%r11 - - movl _OPENSSL_ia32cap_P(%rip),%r8d - btl $20,%r8d - jc L$c1stloop - jmp L$w1stloop - -.p2align 4 -L$w1stloop: - movl %eax,(%rdi,%rax,4) - addb $1,%al - jnc L$w1stloop - - xorq %r9,%r9 - xorq %r8,%r8 -.p2align 4 -L$w2ndloop: - movl (%rdi,%r9,4),%r10d - addb (%rdx,%rsi,1),%r8b - addb %r10b,%r8b - addq $1,%rsi - movl (%rdi,%r8,4),%r11d - cmovzq %rcx,%rsi - movl %r10d,(%rdi,%r8,4) - movl %r11d,(%rdi,%r9,4) - addb $1,%r9b - jnc L$w2ndloop - jmp L$exit_key - -.p2align 4 -L$c1stloop: - movb %al,(%rdi,%rax,1) - addb $1,%al - jnc L$c1stloop - - xorq %r9,%r9 - xorq %r8,%r8 -.p2align 4 -L$c2ndloop: - movb (%rdi,%r9,1),%r10b - addb (%rdx,%rsi,1),%r8b - addb %r10b,%r8b - addq $1,%rsi - movb (%rdi,%r8,1),%r11b - jnz L$cnowrap - movq %rcx,%rsi -L$cnowrap: - movb %r10b,(%rdi,%r8,1) - movb %r11b,(%rdi,%r9,1) - addb $1,%r9b - jnc L$c2ndloop - movl $-1,256(%rdi) - -.p2align 4 -L$exit_key: - xorl %eax,%eax - movl %eax,-8(%rdi) - movl %eax,-4(%rdi) - .byte 0xf3,0xc3 - -#endif diff --git a/packager/third_party/boringssl/roll_boringssl.py b/packager/third_party/boringssl/roll_boringssl.py index 13fe24c775..41c2ed1e50 100755 --- a/packager/third_party/boringssl/roll_boringssl.py +++ b/packager/third_party/boringssl/roll_boringssl.py @@ -13,13 +13,14 @@ import sys SCRIPT_PATH = os.path.abspath(__file__) -SRC_PATH = os.path.dirname(os.path.dirname(os.path.dirname(SCRIPT_PATH))) +SRC_PATH = os.path.dirname( + os.path.dirname(os.path.dirname(os.path.dirname(SCRIPT_PATH)))) DEPS_PATH = os.path.join(SRC_PATH, 'DEPS') -BORINGSSL_PATH = os.path.join(SRC_PATH, 'third_party', 'boringssl') +BORINGSSL_PATH = os.path.join(SRC_PATH, 'packager', 'third_party', 'boringssl') BORINGSSL_SRC_PATH = os.path.join(BORINGSSL_PATH, 'src') if not os.path.isfile(DEPS_PATH) or not os.path.isdir(BORINGSSL_SRC_PATH): - raise Exception('Could not find Chromium checkout') + raise Exception('Could not find packager checkout') # Pull OS_ARCH_COMBOS out of the BoringSSL script. sys.path.append(os.path.join(BORINGSSL_SRC_PATH, 'util')) @@ -63,7 +64,7 @@ def main(): return 1 if not IsPristine(SRC_PATH): - print >>sys.stderr, 'Chromium checkout not pristine.' + print >>sys.stderr, 'Packager checkout not pristine.' return 0 if not IsPristine(BORINGSSL_SRC_PATH): print >>sys.stderr, 'BoringSSL checkout not pristine.' 
diff --git a/packager/third_party/boringssl/win-x86/crypto/aes/aes-586.asm b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/aes-586.asm similarity index 100% rename from packager/third_party/boringssl/win-x86/crypto/aes/aes-586.asm rename to packager/third_party/boringssl/win-x86/crypto/fipsmodule/aes-586.asm diff --git a/packager/third_party/boringssl/win-x86/crypto/aes/aesni-x86.asm b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/aesni-x86.asm similarity index 100% rename from packager/third_party/boringssl/win-x86/crypto/aes/aesni-x86.asm rename to packager/third_party/boringssl/win-x86/crypto/fipsmodule/aesni-x86.asm diff --git a/packager/third_party/boringssl/win-x86/crypto/bn/bn-586.asm b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/bn-586.asm similarity index 100% rename from packager/third_party/boringssl/win-x86/crypto/bn/bn-586.asm rename to packager/third_party/boringssl/win-x86/crypto/fipsmodule/bn-586.asm diff --git a/packager/third_party/boringssl/win-x86/crypto/bn/co-586.asm b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/co-586.asm similarity index 100% rename from packager/third_party/boringssl/win-x86/crypto/bn/co-586.asm rename to packager/third_party/boringssl/win-x86/crypto/fipsmodule/co-586.asm diff --git a/packager/third_party/boringssl/win-x86/crypto/modes/ghash-x86.asm b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/ghash-x86.asm similarity index 79% rename from packager/third_party/boringssl/win-x86/crypto/modes/ghash-x86.asm rename to packager/third_party/boringssl/win-x86/crypto/fipsmodule/ghash-x86.asm index eb493aca63..1d350d6a7f 100644 --- a/packager/third_party/boringssl/win-x86/crypto/modes/ghash-x86.asm +++ b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/ghash-x86.asm @@ -14,205 +14,6 @@ section .text code align=64 %else section .text code %endif -global _gcm_gmult_4bit_x86 -align 16 -_gcm_gmult_4bit_x86: -L$_gcm_gmult_4bit_x86_begin: - push ebp - push ebx - push esi - push edi - sub esp,84 - mov edi,DWORD [104+esp] - mov esi,DWORD [108+esp] - mov ebp,DWORD [edi] - mov edx,DWORD [4+edi] - mov ecx,DWORD [8+edi] - mov ebx,DWORD [12+edi] - mov DWORD [16+esp],0 - mov DWORD [20+esp],471859200 - mov DWORD [24+esp],943718400 - mov DWORD [28+esp],610271232 - mov DWORD [32+esp],1887436800 - mov DWORD [36+esp],1822425088 - mov DWORD [40+esp],1220542464 - mov DWORD [44+esp],1423966208 - mov DWORD [48+esp],3774873600 - mov DWORD [52+esp],4246732800 - mov DWORD [56+esp],3644850176 - mov DWORD [60+esp],3311403008 - mov DWORD [64+esp],2441084928 - mov DWORD [68+esp],2376073216 - mov DWORD [72+esp],2847932416 - mov DWORD [76+esp],3051356160 - mov DWORD [esp],ebp - mov DWORD [4+esp],edx - mov DWORD [8+esp],ecx - mov DWORD [12+esp],ebx - shr ebx,20 - and ebx,240 - mov ebp,DWORD [4+ebx*1+esi] - mov edx,DWORD [ebx*1+esi] - mov ecx,DWORD [12+ebx*1+esi] - mov ebx,DWORD [8+ebx*1+esi] - xor eax,eax - mov edi,15 - jmp NEAR L$000x86_loop -align 16 -L$000x86_loop: - mov al,bl - shrd ebx,ecx,4 - and al,15 - shrd ecx,edx,4 - shrd edx,ebp,4 - shr ebp,4 - xor ebp,DWORD [16+eax*4+esp] - mov al,BYTE [edi*1+esp] - and al,240 - xor ebx,DWORD [8+eax*1+esi] - xor ecx,DWORD [12+eax*1+esi] - xor edx,DWORD [eax*1+esi] - xor ebp,DWORD [4+eax*1+esi] - dec edi - js NEAR L$001x86_break - mov al,bl - shrd ebx,ecx,4 - and al,15 - shrd ecx,edx,4 - shrd edx,ebp,4 - shr ebp,4 - xor ebp,DWORD [16+eax*4+esp] - mov al,BYTE [edi*1+esp] - shl al,4 - xor ebx,DWORD [8+eax*1+esi] - xor ecx,DWORD [12+eax*1+esi] - xor edx,DWORD [eax*1+esi] - xor 
ebp,DWORD [4+eax*1+esi] - jmp NEAR L$000x86_loop -align 16 -L$001x86_break: - bswap ebx - bswap ecx - bswap edx - bswap ebp - mov edi,DWORD [104+esp] - mov DWORD [12+edi],ebx - mov DWORD [8+edi],ecx - mov DWORD [4+edi],edx - mov DWORD [edi],ebp - add esp,84 - pop edi - pop esi - pop ebx - pop ebp - ret -global _gcm_ghash_4bit_x86 -align 16 -_gcm_ghash_4bit_x86: -L$_gcm_ghash_4bit_x86_begin: - push ebp - push ebx - push esi - push edi - sub esp,84 - mov ebx,DWORD [104+esp] - mov esi,DWORD [108+esp] - mov edi,DWORD [112+esp] - mov ecx,DWORD [116+esp] - add ecx,edi - mov DWORD [116+esp],ecx - mov ebp,DWORD [ebx] - mov edx,DWORD [4+ebx] - mov ecx,DWORD [8+ebx] - mov ebx,DWORD [12+ebx] - mov DWORD [16+esp],0 - mov DWORD [20+esp],471859200 - mov DWORD [24+esp],943718400 - mov DWORD [28+esp],610271232 - mov DWORD [32+esp],1887436800 - mov DWORD [36+esp],1822425088 - mov DWORD [40+esp],1220542464 - mov DWORD [44+esp],1423966208 - mov DWORD [48+esp],3774873600 - mov DWORD [52+esp],4246732800 - mov DWORD [56+esp],3644850176 - mov DWORD [60+esp],3311403008 - mov DWORD [64+esp],2441084928 - mov DWORD [68+esp],2376073216 - mov DWORD [72+esp],2847932416 - mov DWORD [76+esp],3051356160 -align 16 -L$002x86_outer_loop: - xor ebx,DWORD [12+edi] - xor ecx,DWORD [8+edi] - xor edx,DWORD [4+edi] - xor ebp,DWORD [edi] - mov DWORD [12+esp],ebx - mov DWORD [8+esp],ecx - mov DWORD [4+esp],edx - mov DWORD [esp],ebp - shr ebx,20 - and ebx,240 - mov ebp,DWORD [4+ebx*1+esi] - mov edx,DWORD [ebx*1+esi] - mov ecx,DWORD [12+ebx*1+esi] - mov ebx,DWORD [8+ebx*1+esi] - xor eax,eax - mov edi,15 - jmp NEAR L$003x86_loop -align 16 -L$003x86_loop: - mov al,bl - shrd ebx,ecx,4 - and al,15 - shrd ecx,edx,4 - shrd edx,ebp,4 - shr ebp,4 - xor ebp,DWORD [16+eax*4+esp] - mov al,BYTE [edi*1+esp] - and al,240 - xor ebx,DWORD [8+eax*1+esi] - xor ecx,DWORD [12+eax*1+esi] - xor edx,DWORD [eax*1+esi] - xor ebp,DWORD [4+eax*1+esi] - dec edi - js NEAR L$004x86_break - mov al,bl - shrd ebx,ecx,4 - and al,15 - shrd ecx,edx,4 - shrd edx,ebp,4 - shr ebp,4 - xor ebp,DWORD [16+eax*4+esp] - mov al,BYTE [edi*1+esp] - shl al,4 - xor ebx,DWORD [8+eax*1+esi] - xor ecx,DWORD [12+eax*1+esi] - xor edx,DWORD [eax*1+esi] - xor ebp,DWORD [4+eax*1+esi] - jmp NEAR L$003x86_loop -align 16 -L$004x86_break: - bswap ebx - bswap ecx - bswap edx - bswap ebp - mov edi,DWORD [112+esp] - lea edi,[16+edi] - cmp edi,DWORD [116+esp] - mov DWORD [112+esp],edi - jb NEAR L$002x86_outer_loop - mov edi,DWORD [104+esp] - mov DWORD [12+edi],ebx - mov DWORD [8+edi],ecx - mov DWORD [4+edi],edx - mov DWORD [edi],ebp - add esp,84 - pop edi - pop esi - pop ebx - pop ebp - ret global _gcm_gmult_4bit_mmx align 16 _gcm_gmult_4bit_mmx: @@ -223,10 +24,10 @@ L$_gcm_gmult_4bit_mmx_begin: push edi mov edi,DWORD [20+esp] mov esi,DWORD [24+esp] - call L$005pic_point -L$005pic_point: + call L$000pic_point +L$000pic_point: pop eax - lea eax,[(L$rem_4bit-L$005pic_point)+eax] + lea eax,[(L$rem_4bit-L$000pic_point)+eax] movzx ebx,BYTE [15+edi] xor ecx,ecx mov edx,ebx @@ -237,9 +38,9 @@ L$005pic_point: movq mm0,[8+ecx*1+esi] movq mm1,[ecx*1+esi] movd ebx,mm0 - jmp NEAR L$006mmx_loop + jmp NEAR L$001mmx_loop align 16 -L$006mmx_loop: +L$001mmx_loop: psrlq mm0,4 and ebx,15 movq mm2,mm1 @@ -253,7 +54,7 @@ L$006mmx_loop: pxor mm1,[edx*1+esi] mov edx,ecx pxor mm0,mm2 - js NEAR L$007mmx_break + js NEAR L$002mmx_break shl cl,4 and ebx,15 psrlq mm0,4 @@ -266,9 +67,9 @@ L$006mmx_loop: movd ebx,mm0 pxor mm1,[ecx*1+esi] pxor mm0,mm2 - jmp NEAR L$006mmx_loop + jmp NEAR L$001mmx_loop align 16 -L$007mmx_break: 
+L$002mmx_break: shl cl,4 and ebx,15 psrlq mm0,4 @@ -323,10 +124,10 @@ L$_gcm_ghash_4bit_mmx_begin: mov ecx,DWORD [28+esp] mov edx,DWORD [32+esp] mov ebp,esp - call L$008pic_point -L$008pic_point: + call L$003pic_point +L$003pic_point: pop esi - lea esi,[(L$rem_8bit-L$008pic_point)+esi] + lea esi,[(L$rem_8bit-L$003pic_point)+esi] sub esp,544 and esp,-64 sub esp,16 @@ -565,7 +366,7 @@ L$008pic_point: mov ebx,DWORD [8+eax] mov edx,DWORD [12+eax] align 16 -L$009outer: +L$004outer: xor edx,DWORD [12+ecx] xor ebx,DWORD [8+ecx] pxor mm6,[ecx] @@ -900,7 +701,7 @@ L$009outer: pshufw mm6,mm6,27 bswap ebx cmp ecx,DWORD [552+esp] - jne NEAR L$009outer + jne NEAR L$004outer mov eax,DWORD [544+esp] mov DWORD [12+eax],edx mov DWORD [8+eax],ebx @@ -918,10 +719,10 @@ _gcm_init_clmul: L$_gcm_init_clmul_begin: mov edx,DWORD [4+esp] mov eax,DWORD [8+esp] - call L$010pic -L$010pic: + call L$005pic +L$005pic: pop ecx - lea ecx,[(L$bswap-L$010pic)+ecx] + lea ecx,[(L$bswap-L$005pic)+ecx] movdqu xmm2,[eax] pshufd xmm2,xmm2,78 pshufd xmm4,xmm2,255 @@ -985,10 +786,10 @@ _gcm_gmult_clmul: L$_gcm_gmult_clmul_begin: mov eax,DWORD [4+esp] mov edx,DWORD [8+esp] - call L$011pic -L$011pic: + call L$006pic +L$006pic: pop ecx - lea ecx,[(L$bswap-L$011pic)+ecx] + lea ecx,[(L$bswap-L$006pic)+ecx] movdqu xmm0,[eax] movdqa xmm5,[ecx] movups xmm2,[edx] @@ -1042,16 +843,16 @@ L$_gcm_ghash_clmul_begin: mov edx,DWORD [24+esp] mov esi,DWORD [28+esp] mov ebx,DWORD [32+esp] - call L$012pic -L$012pic: + call L$007pic +L$007pic: pop ecx - lea ecx,[(L$bswap-L$012pic)+ecx] + lea ecx,[(L$bswap-L$007pic)+ecx] movdqu xmm0,[eax] movdqa xmm5,[ecx] movdqu xmm2,[edx] db 102,15,56,0,197 sub ebx,16 - jz NEAR L$013odd_tail + jz NEAR L$008odd_tail movdqu xmm3,[esi] movdqu xmm6,[16+esi] db 102,15,56,0,221 @@ -1068,10 +869,10 @@ db 102,15,58,68,221,0 movups xmm2,[16+edx] nop sub ebx,32 - jbe NEAR L$014even_tail - jmp NEAR L$015mod_loop + jbe NEAR L$009even_tail + jmp NEAR L$010mod_loop align 32 -L$015mod_loop: +L$010mod_loop: pshufd xmm4,xmm0,78 movdqa xmm1,xmm0 pxor xmm4,xmm0 @@ -1126,8 +927,8 @@ db 102,15,58,68,250,17 db 102,15,58,68,221,0 lea esi,[32+esi] sub ebx,32 - ja NEAR L$015mod_loop -L$014even_tail: + ja NEAR L$010mod_loop +L$009even_tail: pshufd xmm4,xmm0,78 movdqa xmm1,xmm0 pxor xmm4,xmm0 @@ -1166,9 +967,9 @@ db 102,15,58,68,229,16 psrlq xmm0,1 pxor xmm0,xmm1 test ebx,ebx - jnz NEAR L$016done + jnz NEAR L$011done movups xmm2,[edx] -L$013odd_tail: +L$008odd_tail: movdqu xmm3,[esi] db 102,15,56,0,221 pxor xmm0,xmm3 @@ -1207,7 +1008,7 @@ db 102,15,58,68,220,0 pxor xmm0,xmm4 psrlq xmm0,1 pxor xmm0,xmm1 -L$016done: +L$011done: db 102,15,56,0,197 movdqu [eax],xmm0 pop edi diff --git a/packager/third_party/boringssl/win-x86/crypto/md5/md5-586.asm b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/md5-586.asm similarity index 100% rename from packager/third_party/boringssl/win-x86/crypto/md5/md5-586.asm rename to packager/third_party/boringssl/win-x86/crypto/fipsmodule/md5-586.asm diff --git a/packager/third_party/boringssl/win-x86/crypto/sha/sha1-586.asm b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/sha1-586.asm similarity index 100% rename from packager/third_party/boringssl/win-x86/crypto/sha/sha1-586.asm rename to packager/third_party/boringssl/win-x86/crypto/fipsmodule/sha1-586.asm diff --git a/packager/third_party/boringssl/win-x86/crypto/sha/sha256-586.asm b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/sha256-586.asm similarity index 100% rename from 
packager/third_party/boringssl/win-x86/crypto/sha/sha256-586.asm rename to packager/third_party/boringssl/win-x86/crypto/fipsmodule/sha256-586.asm diff --git a/packager/third_party/boringssl/win-x86/crypto/sha/sha512-586.asm b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/sha512-586.asm similarity index 100% rename from packager/third_party/boringssl/win-x86/crypto/sha/sha512-586.asm rename to packager/third_party/boringssl/win-x86/crypto/fipsmodule/sha512-586.asm diff --git a/packager/third_party/boringssl/win-x86/crypto/aes/vpaes-x86.asm b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/vpaes-x86.asm similarity index 100% rename from packager/third_party/boringssl/win-x86/crypto/aes/vpaes-x86.asm rename to packager/third_party/boringssl/win-x86/crypto/fipsmodule/vpaes-x86.asm diff --git a/packager/third_party/boringssl/win-x86/crypto/bn/x86-mont.asm b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/x86-mont.asm similarity index 86% rename from packager/third_party/boringssl/win-x86/crypto/bn/x86-mont.asm rename to packager/third_party/boringssl/win-x86/crypto/fipsmodule/x86-mont.asm index de7b949927..b1a4d59429 100644 --- a/packager/third_party/boringssl/win-x86/crypto/bn/x86-mont.asm +++ b/packager/third_party/boringssl/win-x86/crypto/fipsmodule/x86-mont.asm @@ -29,36 +29,51 @@ L$_bn_mul_mont_begin: jl NEAR L$000just_leave lea esi,[20+esp] lea edx,[24+esp] - mov ebp,esp add edi,2 neg edi - lea esp,[edi*4+esp-32] + lea ebp,[edi*4+esp-32] neg edi - mov eax,esp + mov eax,ebp sub eax,edx and eax,2047 - sub esp,eax - xor edx,esp + sub ebp,eax + xor edx,ebp and edx,2048 xor edx,2048 - sub esp,edx - and esp,-64 + sub ebp,edx + and ebp,-64 + mov eax,esp + sub eax,ebp + and eax,-4096 + mov edx,esp + lea esp,[eax*1+ebp] + mov eax,DWORD [esp] + cmp esp,ebp + ja NEAR L$001page_walk + jmp NEAR L$002page_walk_done +align 16 +L$001page_walk: + lea esp,[esp-4096] + mov eax,DWORD [esp] + cmp esp,ebp + ja NEAR L$001page_walk +L$002page_walk_done: mov eax,DWORD [esi] mov ebx,DWORD [4+esi] mov ecx,DWORD [8+esi] - mov edx,DWORD [12+esi] + mov ebp,DWORD [12+esi] mov esi,DWORD [16+esi] mov esi,DWORD [esi] mov DWORD [4+esp],eax mov DWORD [8+esp],ebx mov DWORD [12+esp],ecx - mov DWORD [16+esp],edx + mov DWORD [16+esp],ebp mov DWORD [20+esp],esi lea ebx,[edi-3] - mov DWORD [24+esp],ebp + mov DWORD [24+esp],edx lea eax,[_OPENSSL_ia32cap_P] bt DWORD [eax],26 - jnc NEAR L$001non_sse2 + jnc NEAR L$003non_sse2 mov eax,-1 movd mm7,eax mov esi,DWORD [8+esp] @@ -82,7 +97,7 @@ L$_bn_mul_mont_begin: psrlq mm3,32 inc ecx align 16 -L$0021st: +L$0041st: pmuludq mm0,mm4 pmuludq mm1,mm5 paddq mm2,mm0 @@ -97,7 +112,7 @@ L$0021st: psrlq mm3,32 lea ecx,[1+ecx] cmp ecx,ebx - jl NEAR L$0021st + jl NEAR L$0041st pmuludq mm0,mm4 pmuludq mm1,mm5 paddq mm2,mm0 @@ -111,7 +126,7 @@ L$0021st: paddq mm3,mm2 movq [32+ebx*4+esp],mm3 inc edx -L$003outer: +L$005outer: xor ecx,ecx movd mm4,DWORD [edx*4+edi] movd mm5,DWORD [esi] @@ -133,7 +148,7 @@ L$003outer: paddq mm2,mm6 inc ecx dec ebx -L$004inner: +L$006inner: pmuludq mm0,mm4 pmuludq mm1,mm5 paddq mm2,mm0 @@ -150,7 +165,7 @@ L$004inner: paddq mm2,mm6 dec ebx lea ecx,[1+ecx] - jnz NEAR L$004inner + jnz NEAR L$006inner mov ebx,ecx pmuludq mm0,mm4 pmuludq mm1,mm5 @@ -168,11 +183,11 @@ L$004inner: movq [32+ebx*4+esp],mm3 lea edx,[1+edx] cmp edx,ebx - jle NEAR L$003outer + jle NEAR L$005outer emms - jmp NEAR L$005common_tail + jmp NEAR L$007common_tail align 16 -L$001non_sse2: +L$003non_sse2: mov esi,DWORD [8+esp] lea ebp,[1+ebx] mov edi,DWORD [12+esp] @@ -183,12 
+198,12 @@ L$001non_sse2: lea eax,[4+ebx*4+edi] or ebp,edx mov edi,DWORD [edi] - jz NEAR L$006bn_sqr_mont + jz NEAR L$008bn_sqr_mont mov DWORD [28+esp],eax mov eax,DWORD [esi] xor edx,edx align 16 -L$007mull: +L$009mull: mov ebp,edx mul edi add ebp,eax @@ -197,7 +212,7 @@ L$007mull: mov eax,DWORD [ecx*4+esi] cmp ecx,ebx mov DWORD [28+ecx*4+esp],ebp - jl NEAR L$007mull + jl NEAR L$009mull mov ebp,edx mul edi mov edi,DWORD [20+esp] @@ -215,9 +230,9 @@ L$007mull: mov eax,DWORD [4+esi] adc edx,0 inc ecx - jmp NEAR L$0082ndmadd + jmp NEAR L$0102ndmadd align 16 -L$0091stmadd: +L$0111stmadd: mov ebp,edx mul edi add ebp,DWORD [32+ecx*4+esp] @@ -228,7 +243,7 @@ L$0091stmadd: adc edx,0 cmp ecx,ebx mov DWORD [28+ecx*4+esp],ebp - jl NEAR L$0091stmadd + jl NEAR L$0111stmadd mov ebp,edx mul edi add eax,DWORD [32+ebx*4+esp] @@ -251,7 +266,7 @@ L$0091stmadd: adc edx,0 mov ecx,1 align 16 -L$0082ndmadd: +L$0102ndmadd: mov ebp,edx mul edi add ebp,DWORD [32+ecx*4+esp] @@ -262,7 +277,7 @@ L$0082ndmadd: adc edx,0 cmp ecx,ebx mov DWORD [24+ecx*4+esp],ebp - jl NEAR L$0082ndmadd + jl NEAR L$0102ndmadd mov ebp,edx mul edi add ebp,DWORD [32+ebx*4+esp] @@ -278,16 +293,16 @@ L$0082ndmadd: mov DWORD [32+ebx*4+esp],edx cmp ecx,DWORD [28+esp] mov DWORD [36+ebx*4+esp],eax - je NEAR L$005common_tail + je NEAR L$007common_tail mov edi,DWORD [ecx] mov esi,DWORD [8+esp] mov DWORD [12+esp],ecx xor ecx,ecx xor edx,edx mov eax,DWORD [esi] - jmp NEAR L$0091stmadd + jmp NEAR L$0111stmadd align 16 -L$006bn_sqr_mont: +L$008bn_sqr_mont: mov DWORD [esp],ebx mov DWORD [12+esp],ecx mov eax,edi @@ -298,7 +313,7 @@ L$006bn_sqr_mont: and ebx,1 inc ecx align 16 -L$010sqr: +L$012sqr: mov eax,DWORD [ecx*4+esi] mov ebp,edx mul edi @@ -310,7 +325,7 @@ L$010sqr: cmp ecx,DWORD [esp] mov ebx,eax mov DWORD [28+ecx*4+esp],ebp - jl NEAR L$010sqr + jl NEAR L$012sqr mov eax,DWORD [ecx*4+esi] mov ebp,edx mul edi @@ -334,7 +349,7 @@ L$010sqr: mov eax,DWORD [4+esi] mov ecx,1 align 16 -L$0113rdmadd: +L$0133rdmadd: mov ebp,edx mul edi add ebp,DWORD [32+ecx*4+esp] @@ -353,7 +368,7 @@ L$0113rdmadd: adc edx,0 cmp ecx,ebx mov DWORD [24+ecx*4+esp],ebp - jl NEAR L$0113rdmadd + jl NEAR L$0133rdmadd mov ebp,edx mul edi add ebp,DWORD [32+ebx*4+esp] @@ -369,7 +384,7 @@ L$0113rdmadd: mov DWORD [32+ebx*4+esp],edx cmp ecx,ebx mov DWORD [36+ebx*4+esp],eax - je NEAR L$005common_tail + je NEAR L$007common_tail mov edi,DWORD [4+ecx*4+esi] lea ecx,[1+ecx] mov eax,edi @@ -381,12 +396,12 @@ L$0113rdmadd: xor ebp,ebp cmp ecx,ebx lea ecx,[1+ecx] - je NEAR L$012sqrlast + je NEAR L$014sqrlast mov ebx,edx shr edx,1 and ebx,1 align 16 -L$013sqradd: +L$015sqradd: mov eax,DWORD [ecx*4+esi] mov ebp,edx mul edi @@ -402,13 +417,13 @@ L$013sqradd: cmp ecx,DWORD [esp] mov DWORD [28+ecx*4+esp],ebp mov ebx,eax - jle NEAR L$013sqradd + jle NEAR L$015sqradd mov ebp,edx add edx,edx shr ebp,31 add edx,ebx adc ebp,0 -L$012sqrlast: +L$014sqrlast: mov edi,DWORD [20+esp] mov esi,DWORD [16+esp] imul edi,DWORD [32+esp] @@ -423,9 +438,9 @@ L$012sqrlast: adc edx,0 mov ecx,1 mov eax,DWORD [4+esi] - jmp NEAR L$0113rdmadd + jmp NEAR L$0133rdmadd align 16 -L$005common_tail: +L$007common_tail: mov ebp,DWORD [16+esp] mov edi,DWORD [4+esp] lea esi,[32+esp] @@ -433,25 +448,26 @@ L$005common_tail: mov ecx,ebx xor edx,edx align 16 -L$014sub: +L$016sub: sbb eax,DWORD [edx*4+ebp] mov DWORD [edx*4+edi],eax dec ecx mov eax,DWORD [4+edx*4+esi] lea edx,[1+edx] - jge NEAR L$014sub + jge NEAR L$016sub sbb eax,0 + and esi,eax + not eax + mov ebp,edi + and ebp,eax + or esi,ebp align 16 -L$015copy: - mov edx,DWORD 
[ebx*4+esi] - mov ebp,DWORD [ebx*4+edi] - xor edx,ebp - and edx,eax - xor edx,ebp - mov DWORD [ebx*4+esi],ecx - mov DWORD [ebx*4+edi],edx +L$017copy: + mov eax,DWORD [ebx*4+esi] + mov DWORD [ebx*4+edi],eax + mov DWORD [32+ebx*4+esp],ecx dec ebx - jge NEAR L$015copy + jge NEAR L$017copy mov esp,DWORD [24+esp] mov eax,1 L$000just_leave: diff --git a/packager/third_party/boringssl/win-x86/crypto/rc4/rc4-586.asm b/packager/third_party/boringssl/win-x86/crypto/rc4/rc4-586.asm deleted file mode 100644 index 0bab2bec85..0000000000 --- a/packager/third_party/boringssl/win-x86/crypto/rc4/rc4-586.asm +++ /dev/null @@ -1,353 +0,0 @@ -%ifidn __OUTPUT_FORMAT__,obj -section code use32 class=code align=64 -%elifidn __OUTPUT_FORMAT__,win32 -%ifdef __YASM_VERSION_ID__ -%if __YASM_VERSION_ID__ < 01010000h -%error yasm version 1.1.0 or later needed. -%endif -; Yasm automatically includes .00 and complains about redefining it. -; https://www.tortall.net/projects/yasm/manual/html/objfmt-win32-safeseh.html -%else -$@feat.00 equ 1 -%endif -section .text code align=64 -%else -section .text code -%endif -;extern _OPENSSL_ia32cap_P -global _asm_RC4 -align 16 -_asm_RC4: -L$_asm_RC4_begin: - push ebp - push ebx - push esi - push edi - mov edi,DWORD [20+esp] - mov edx,DWORD [24+esp] - mov esi,DWORD [28+esp] - mov ebp,DWORD [32+esp] - xor eax,eax - xor ebx,ebx - cmp edx,0 - je NEAR L$000abort - mov al,BYTE [edi] - mov bl,BYTE [4+edi] - add edi,8 - lea ecx,[edx*1+esi] - sub ebp,esi - mov DWORD [24+esp],ecx - inc al - cmp DWORD [256+edi],-1 - je NEAR L$001RC4_CHAR - mov ecx,DWORD [eax*4+edi] - and edx,-4 - jz NEAR L$002loop1 - mov DWORD [32+esp],ebp - test edx,-8 - jz NEAR L$003go4loop4 - lea ebp,[_OPENSSL_ia32cap_P] - bt DWORD [ebp],26 - jnc NEAR L$003go4loop4 - mov ebp,DWORD [32+esp] - and edx,-8 - lea edx,[edx*1+esi-8] - mov DWORD [edi-4],edx - add bl,cl - mov edx,DWORD [ebx*4+edi] - mov DWORD [ebx*4+edi],ecx - mov DWORD [eax*4+edi],edx - inc eax - add edx,ecx - movzx eax,al - movzx edx,dl - movq mm0,[esi] - mov ecx,DWORD [eax*4+edi] - movd mm2,DWORD [edx*4+edi] - jmp NEAR L$004loop_mmx_enter -align 16 -L$005loop_mmx: - add bl,cl - psllq mm1,56 - mov edx,DWORD [ebx*4+edi] - mov DWORD [ebx*4+edi],ecx - mov DWORD [eax*4+edi],edx - inc eax - add edx,ecx - movzx eax,al - movzx edx,dl - pxor mm2,mm1 - movq mm0,[esi] - movq [esi*1+ebp-8],mm2 - mov ecx,DWORD [eax*4+edi] - movd mm2,DWORD [edx*4+edi] -L$004loop_mmx_enter: - add bl,cl - mov edx,DWORD [ebx*4+edi] - mov DWORD [ebx*4+edi],ecx - mov DWORD [eax*4+edi],edx - inc eax - add edx,ecx - movzx eax,al - movzx edx,dl - pxor mm2,mm0 - mov ecx,DWORD [eax*4+edi] - movd mm1,DWORD [edx*4+edi] - add bl,cl - psllq mm1,8 - mov edx,DWORD [ebx*4+edi] - mov DWORD [ebx*4+edi],ecx - mov DWORD [eax*4+edi],edx - inc eax - add edx,ecx - movzx eax,al - movzx edx,dl - pxor mm2,mm1 - mov ecx,DWORD [eax*4+edi] - movd mm1,DWORD [edx*4+edi] - add bl,cl - psllq mm1,16 - mov edx,DWORD [ebx*4+edi] - mov DWORD [ebx*4+edi],ecx - mov DWORD [eax*4+edi],edx - inc eax - add edx,ecx - movzx eax,al - movzx edx,dl - pxor mm2,mm1 - mov ecx,DWORD [eax*4+edi] - movd mm1,DWORD [edx*4+edi] - add bl,cl - psllq mm1,24 - mov edx,DWORD [ebx*4+edi] - mov DWORD [ebx*4+edi],ecx - mov DWORD [eax*4+edi],edx - inc eax - add edx,ecx - movzx eax,al - movzx edx,dl - pxor mm2,mm1 - mov ecx,DWORD [eax*4+edi] - movd mm1,DWORD [edx*4+edi] - add bl,cl - psllq mm1,32 - mov edx,DWORD [ebx*4+edi] - mov DWORD [ebx*4+edi],ecx - mov DWORD [eax*4+edi],edx - inc eax - add edx,ecx - movzx eax,al - movzx edx,dl - pxor mm2,mm1 - mov 
ecx,DWORD [eax*4+edi] - movd mm1,DWORD [edx*4+edi] - add bl,cl - psllq mm1,40 - mov edx,DWORD [ebx*4+edi] - mov DWORD [ebx*4+edi],ecx - mov DWORD [eax*4+edi],edx - inc eax - add edx,ecx - movzx eax,al - movzx edx,dl - pxor mm2,mm1 - mov ecx,DWORD [eax*4+edi] - movd mm1,DWORD [edx*4+edi] - add bl,cl - psllq mm1,48 - mov edx,DWORD [ebx*4+edi] - mov DWORD [ebx*4+edi],ecx - mov DWORD [eax*4+edi],edx - inc eax - add edx,ecx - movzx eax,al - movzx edx,dl - pxor mm2,mm1 - mov ecx,DWORD [eax*4+edi] - movd mm1,DWORD [edx*4+edi] - mov edx,ebx - xor ebx,ebx - mov bl,dl - cmp esi,DWORD [edi-4] - lea esi,[8+esi] - jb NEAR L$005loop_mmx - psllq mm1,56 - pxor mm2,mm1 - movq [esi*1+ebp-8],mm2 - emms - cmp esi,DWORD [24+esp] - je NEAR L$006done - jmp NEAR L$002loop1 -align 16 -L$003go4loop4: - lea edx,[edx*1+esi-4] - mov DWORD [28+esp],edx -L$007loop4: - add bl,cl - mov edx,DWORD [ebx*4+edi] - mov DWORD [ebx*4+edi],ecx - mov DWORD [eax*4+edi],edx - add edx,ecx - inc al - and edx,255 - mov ecx,DWORD [eax*4+edi] - mov ebp,DWORD [edx*4+edi] - add bl,cl - mov edx,DWORD [ebx*4+edi] - mov DWORD [ebx*4+edi],ecx - mov DWORD [eax*4+edi],edx - add edx,ecx - inc al - and edx,255 - ror ebp,8 - mov ecx,DWORD [eax*4+edi] - or ebp,DWORD [edx*4+edi] - add bl,cl - mov edx,DWORD [ebx*4+edi] - mov DWORD [ebx*4+edi],ecx - mov DWORD [eax*4+edi],edx - add edx,ecx - inc al - and edx,255 - ror ebp,8 - mov ecx,DWORD [eax*4+edi] - or ebp,DWORD [edx*4+edi] - add bl,cl - mov edx,DWORD [ebx*4+edi] - mov DWORD [ebx*4+edi],ecx - mov DWORD [eax*4+edi],edx - add edx,ecx - inc al - and edx,255 - ror ebp,8 - mov ecx,DWORD [32+esp] - or ebp,DWORD [edx*4+edi] - ror ebp,8 - xor ebp,DWORD [esi] - cmp esi,DWORD [28+esp] - mov DWORD [esi*1+ecx],ebp - lea esi,[4+esi] - mov ecx,DWORD [eax*4+edi] - jb NEAR L$007loop4 - cmp esi,DWORD [24+esp] - je NEAR L$006done - mov ebp,DWORD [32+esp] -align 16 -L$002loop1: - add bl,cl - mov edx,DWORD [ebx*4+edi] - mov DWORD [ebx*4+edi],ecx - mov DWORD [eax*4+edi],edx - add edx,ecx - inc al - and edx,255 - mov edx,DWORD [edx*4+edi] - xor dl,BYTE [esi] - lea esi,[1+esi] - mov ecx,DWORD [eax*4+edi] - cmp esi,DWORD [24+esp] - mov BYTE [esi*1+ebp-1],dl - jb NEAR L$002loop1 - jmp NEAR L$006done -align 16 -L$001RC4_CHAR: - movzx ecx,BYTE [eax*1+edi] -L$008cloop1: - add bl,cl - movzx edx,BYTE [ebx*1+edi] - mov BYTE [ebx*1+edi],cl - mov BYTE [eax*1+edi],dl - add dl,cl - movzx edx,BYTE [edx*1+edi] - add al,1 - xor dl,BYTE [esi] - lea esi,[1+esi] - movzx ecx,BYTE [eax*1+edi] - cmp esi,DWORD [24+esp] - mov BYTE [esi*1+ebp-1],dl - jb NEAR L$008cloop1 -L$006done: - dec al - mov DWORD [edi-4],ebx - mov BYTE [edi-8],al -L$000abort: - pop edi - pop esi - pop ebx - pop ebp - ret -global _asm_RC4_set_key -align 16 -_asm_RC4_set_key: -L$_asm_RC4_set_key_begin: - push ebp - push ebx - push esi - push edi - mov edi,DWORD [20+esp] - mov ebp,DWORD [24+esp] - mov esi,DWORD [28+esp] - lea edx,[_OPENSSL_ia32cap_P] - lea edi,[8+edi] - lea esi,[ebp*1+esi] - neg ebp - xor eax,eax - mov DWORD [edi-4],ebp - bt DWORD [edx],20 - jc NEAR L$009c1stloop -align 16 -L$010w1stloop: - mov DWORD [eax*4+edi],eax - add al,1 - jnc NEAR L$010w1stloop - xor ecx,ecx - xor edx,edx -align 16 -L$011w2ndloop: - mov eax,DWORD [ecx*4+edi] - add dl,BYTE [ebp*1+esi] - add dl,al - add ebp,1 - mov ebx,DWORD [edx*4+edi] - jnz NEAR L$012wnowrap - mov ebp,DWORD [edi-4] -L$012wnowrap: - mov DWORD [edx*4+edi],eax - mov DWORD [ecx*4+edi],ebx - add cl,1 - jnc NEAR L$011w2ndloop - jmp NEAR L$013exit -align 16 -L$009c1stloop: - mov BYTE [eax*1+edi],al - add al,1 - jnc NEAR 
L$009c1stloop - xor ecx,ecx - xor edx,edx - xor ebx,ebx -align 16 -L$014c2ndloop: - mov al,BYTE [ecx*1+edi] - add dl,BYTE [ebp*1+esi] - add dl,al - add ebp,1 - mov bl,BYTE [edx*1+edi] - jnz NEAR L$015cnowrap - mov ebp,DWORD [edi-4] -L$015cnowrap: - mov BYTE [edx*1+edi],al - mov BYTE [ecx*1+edi],bl - add cl,1 - jnc NEAR L$014c2ndloop - mov DWORD [256+edi],-1 -L$013exit: - xor eax,eax - mov DWORD [edi-8],eax - mov DWORD [edi-4],eax - pop edi - pop esi - pop ebx - pop ebp - ret -segment .bss -common _OPENSSL_ia32cap_P 16 diff --git a/packager/third_party/boringssl/win-x86_64/crypto/bn/rsaz-avx2.asm b/packager/third_party/boringssl/win-x86_64/crypto/bn/rsaz-avx2.asm deleted file mode 100644 index 45d0fd4632..0000000000 --- a/packager/third_party/boringssl/win-x86_64/crypto/bn/rsaz-avx2.asm +++ /dev/null @@ -1,30 +0,0 @@ -default rel -%define XMMWORD -%define YMMWORD -%define ZMMWORD -section .text code align=64 - - -global rsaz_avx2_eligible - -rsaz_avx2_eligible: - xor eax,eax - DB 0F3h,0C3h ;repret - - -global rsaz_1024_sqr_avx2 -global rsaz_1024_mul_avx2 -global rsaz_1024_norm2red_avx2 -global rsaz_1024_red2norm_avx2 -global rsaz_1024_scatter5_avx2 -global rsaz_1024_gather5_avx2 - -rsaz_1024_sqr_avx2: -rsaz_1024_mul_avx2: -rsaz_1024_norm2red_avx2: -rsaz_1024_red2norm_avx2: -rsaz_1024_scatter5_avx2: -rsaz_1024_gather5_avx2: -DB 0x0f,0x0b - DB 0F3h,0C3h ;repret - diff --git a/packager/third_party/boringssl/win-x86_64/crypto/bn/rsaz-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/bn/rsaz-x86_64.asm deleted file mode 100644 index 72ec505289..0000000000 --- a/packager/third_party/boringssl/win-x86_64/crypto/bn/rsaz-x86_64.asm +++ /dev/null @@ -1,1495 +0,0 @@ -default rel -%define XMMWORD -%define YMMWORD -%define ZMMWORD -section .text code align=64 - - -EXTERN OPENSSL_ia32cap_P - -global rsaz_512_sqr - -ALIGN 32 -rsaz_512_sqr: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_rsaz_512_sqr: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - - - push rbx - push rbp - push r12 - push r13 - push r14 - push r15 - - sub rsp,128+24 -$L$sqr_body: - mov rbp,rdx - mov rdx,QWORD[rsi] - mov rax,QWORD[8+rsi] - mov QWORD[128+rsp],rcx - jmp NEAR $L$oop_sqr - -ALIGN 32 -$L$oop_sqr: - mov DWORD[((128+8))+rsp],r8d - - mov rbx,rdx - mul rdx - mov r8,rax - mov rax,QWORD[16+rsi] - mov r9,rdx - - mul rbx - add r9,rax - mov rax,QWORD[24+rsi] - mov r10,rdx - adc r10,0 - - mul rbx - add r10,rax - mov rax,QWORD[32+rsi] - mov r11,rdx - adc r11,0 - - mul rbx - add r11,rax - mov rax,QWORD[40+rsi] - mov r12,rdx - adc r12,0 - - mul rbx - add r12,rax - mov rax,QWORD[48+rsi] - mov r13,rdx - adc r13,0 - - mul rbx - add r13,rax - mov rax,QWORD[56+rsi] - mov r14,rdx - adc r14,0 - - mul rbx - add r14,rax - mov rax,rbx - mov r15,rdx - adc r15,0 - - add r8,r8 - mov rcx,r9 - adc r9,r9 - - mul rax - mov QWORD[rsp],rax - add r8,rdx - adc r9,0 - - mov QWORD[8+rsp],r8 - shr rcx,63 - - - mov r8,QWORD[8+rsi] - mov rax,QWORD[16+rsi] - mul r8 - add r10,rax - mov rax,QWORD[24+rsi] - mov rbx,rdx - adc rbx,0 - - mul r8 - add r11,rax - mov rax,QWORD[32+rsi] - adc rdx,0 - add r11,rbx - mov rbx,rdx - adc rbx,0 - - mul r8 - add r12,rax - mov rax,QWORD[40+rsi] - adc rdx,0 - add r12,rbx - mov rbx,rdx - adc rbx,0 - - mul r8 - add r13,rax - mov rax,QWORD[48+rsi] - adc rdx,0 - add r13,rbx - mov rbx,rdx - adc rbx,0 - - mul r8 - add r14,rax - mov rax,QWORD[56+rsi] - adc rdx,0 - add r14,rbx - mov rbx,rdx - adc rbx,0 - - mul r8 - add r15,rax - mov rax,r8 - adc rdx,0 - add 
r15,rbx - mov r8,rdx - mov rdx,r10 - adc r8,0 - - add rdx,rdx - lea r10,[r10*2+rcx] - mov rbx,r11 - adc r11,r11 - - mul rax - add r9,rax - adc r10,rdx - adc r11,0 - - mov QWORD[16+rsp],r9 - mov QWORD[24+rsp],r10 - shr rbx,63 - - - mov r9,QWORD[16+rsi] - mov rax,QWORD[24+rsi] - mul r9 - add r12,rax - mov rax,QWORD[32+rsi] - mov rcx,rdx - adc rcx,0 - - mul r9 - add r13,rax - mov rax,QWORD[40+rsi] - adc rdx,0 - add r13,rcx - mov rcx,rdx - adc rcx,0 - - mul r9 - add r14,rax - mov rax,QWORD[48+rsi] - adc rdx,0 - add r14,rcx - mov rcx,rdx - adc rcx,0 - - mul r9 - mov r10,r12 - lea r12,[r12*2+rbx] - add r15,rax - mov rax,QWORD[56+rsi] - adc rdx,0 - add r15,rcx - mov rcx,rdx - adc rcx,0 - - mul r9 - shr r10,63 - add r8,rax - mov rax,r9 - adc rdx,0 - add r8,rcx - mov r9,rdx - adc r9,0 - - mov rcx,r13 - lea r13,[r13*2+r10] - - mul rax - add r11,rax - adc r12,rdx - adc r13,0 - - mov QWORD[32+rsp],r11 - mov QWORD[40+rsp],r12 - shr rcx,63 - - - mov r10,QWORD[24+rsi] - mov rax,QWORD[32+rsi] - mul r10 - add r14,rax - mov rax,QWORD[40+rsi] - mov rbx,rdx - adc rbx,0 - - mul r10 - add r15,rax - mov rax,QWORD[48+rsi] - adc rdx,0 - add r15,rbx - mov rbx,rdx - adc rbx,0 - - mul r10 - mov r12,r14 - lea r14,[r14*2+rcx] - add r8,rax - mov rax,QWORD[56+rsi] - adc rdx,0 - add r8,rbx - mov rbx,rdx - adc rbx,0 - - mul r10 - shr r12,63 - add r9,rax - mov rax,r10 - adc rdx,0 - add r9,rbx - mov r10,rdx - adc r10,0 - - mov rbx,r15 - lea r15,[r15*2+r12] - - mul rax - add r13,rax - adc r14,rdx - adc r15,0 - - mov QWORD[48+rsp],r13 - mov QWORD[56+rsp],r14 - shr rbx,63 - - - mov r11,QWORD[32+rsi] - mov rax,QWORD[40+rsi] - mul r11 - add r8,rax - mov rax,QWORD[48+rsi] - mov rcx,rdx - adc rcx,0 - - mul r11 - add r9,rax - mov rax,QWORD[56+rsi] - adc rdx,0 - mov r12,r8 - lea r8,[r8*2+rbx] - add r9,rcx - mov rcx,rdx - adc rcx,0 - - mul r11 - shr r12,63 - add r10,rax - mov rax,r11 - adc rdx,0 - add r10,rcx - mov r11,rdx - adc r11,0 - - mov rcx,r9 - lea r9,[r9*2+r12] - - mul rax - add r15,rax - adc r8,rdx - adc r9,0 - - mov QWORD[64+rsp],r15 - mov QWORD[72+rsp],r8 - shr rcx,63 - - - mov r12,QWORD[40+rsi] - mov rax,QWORD[48+rsi] - mul r12 - add r10,rax - mov rax,QWORD[56+rsi] - mov rbx,rdx - adc rbx,0 - - mul r12 - add r11,rax - mov rax,r12 - mov r15,r10 - lea r10,[r10*2+rcx] - adc rdx,0 - shr r15,63 - add r11,rbx - mov r12,rdx - adc r12,0 - - mov rbx,r11 - lea r11,[r11*2+r15] - - mul rax - add r9,rax - adc r10,rdx - adc r11,0 - - mov QWORD[80+rsp],r9 - mov QWORD[88+rsp],r10 - - - mov r13,QWORD[48+rsi] - mov rax,QWORD[56+rsi] - mul r13 - add r12,rax - mov rax,r13 - mov r13,rdx - adc r13,0 - - xor r14,r14 - shl rbx,1 - adc r12,r12 - adc r13,r13 - adc r14,r14 - - mul rax - add r11,rax - adc r12,rdx - adc r13,0 - - mov QWORD[96+rsp],r11 - mov QWORD[104+rsp],r12 - - - mov rax,QWORD[56+rsi] - mul rax - add r13,rax - adc rdx,0 - - add r14,rdx - - mov QWORD[112+rsp],r13 - mov QWORD[120+rsp],r14 - - mov r8,QWORD[rsp] - mov r9,QWORD[8+rsp] - mov r10,QWORD[16+rsp] - mov r11,QWORD[24+rsp] - mov r12,QWORD[32+rsp] - mov r13,QWORD[40+rsp] - mov r14,QWORD[48+rsp] - mov r15,QWORD[56+rsp] - - call __rsaz_512_reduce - - add r8,QWORD[64+rsp] - adc r9,QWORD[72+rsp] - adc r10,QWORD[80+rsp] - adc r11,QWORD[88+rsp] - adc r12,QWORD[96+rsp] - adc r13,QWORD[104+rsp] - adc r14,QWORD[112+rsp] - adc r15,QWORD[120+rsp] - sbb rcx,rcx - - call __rsaz_512_subtract - - mov rdx,r8 - mov rax,r9 - mov r8d,DWORD[((128+8))+rsp] - mov rsi,rdi - - dec r8d - jnz NEAR $L$oop_sqr - - lea rax,[((128+24+48))+rsp] - mov r15,QWORD[((-48))+rax] - mov r14,QWORD[((-40))+rax] - mov 
r13,QWORD[((-32))+rax] - mov r12,QWORD[((-24))+rax] - mov rbp,QWORD[((-16))+rax] - mov rbx,QWORD[((-8))+rax] - lea rsp,[rax] -$L$sqr_epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret -$L$SEH_end_rsaz_512_sqr: -global rsaz_512_mul - -ALIGN 32 -rsaz_512_mul: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_rsaz_512_mul: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - - - push rbx - push rbp - push r12 - push r13 - push r14 - push r15 - - sub rsp,128+24 -$L$mul_body: -DB 102,72,15,110,199 -DB 102,72,15,110,201 - mov QWORD[128+rsp],r8 - mov rbx,QWORD[rdx] - mov rbp,rdx - call __rsaz_512_mul - -DB 102,72,15,126,199 -DB 102,72,15,126,205 - - mov r8,QWORD[rsp] - mov r9,QWORD[8+rsp] - mov r10,QWORD[16+rsp] - mov r11,QWORD[24+rsp] - mov r12,QWORD[32+rsp] - mov r13,QWORD[40+rsp] - mov r14,QWORD[48+rsp] - mov r15,QWORD[56+rsp] - - call __rsaz_512_reduce - add r8,QWORD[64+rsp] - adc r9,QWORD[72+rsp] - adc r10,QWORD[80+rsp] - adc r11,QWORD[88+rsp] - adc r12,QWORD[96+rsp] - adc r13,QWORD[104+rsp] - adc r14,QWORD[112+rsp] - adc r15,QWORD[120+rsp] - sbb rcx,rcx - - call __rsaz_512_subtract - - lea rax,[((128+24+48))+rsp] - mov r15,QWORD[((-48))+rax] - mov r14,QWORD[((-40))+rax] - mov r13,QWORD[((-32))+rax] - mov r12,QWORD[((-24))+rax] - mov rbp,QWORD[((-16))+rax] - mov rbx,QWORD[((-8))+rax] - lea rsp,[rax] -$L$mul_epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret -$L$SEH_end_rsaz_512_mul: -global rsaz_512_mul_gather4 - -ALIGN 32 -rsaz_512_mul_gather4: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_rsaz_512_mul_gather4: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - mov r9,QWORD[48+rsp] - - - push rbx - push rbp - push r12 - push r13 - push r14 - push r15 - - sub rsp,328 - movaps XMMWORD[160+rsp],xmm6 - movaps XMMWORD[176+rsp],xmm7 - movaps XMMWORD[192+rsp],xmm8 - movaps XMMWORD[208+rsp],xmm9 - movaps XMMWORD[224+rsp],xmm10 - movaps XMMWORD[240+rsp],xmm11 - movaps XMMWORD[256+rsp],xmm12 - movaps XMMWORD[272+rsp],xmm13 - movaps XMMWORD[288+rsp],xmm14 - movaps XMMWORD[304+rsp],xmm15 -$L$mul_gather4_body: - movd xmm8,r9d - movdqa xmm1,XMMWORD[(($L$inc+16))] - movdqa xmm0,XMMWORD[$L$inc] - - pshufd xmm8,xmm8,0 - movdqa xmm7,xmm1 - movdqa xmm2,xmm1 - paddd xmm1,xmm0 - pcmpeqd xmm0,xmm8 - movdqa xmm3,xmm7 - paddd xmm2,xmm1 - pcmpeqd xmm1,xmm8 - movdqa xmm4,xmm7 - paddd xmm3,xmm2 - pcmpeqd xmm2,xmm8 - movdqa xmm5,xmm7 - paddd xmm4,xmm3 - pcmpeqd xmm3,xmm8 - movdqa xmm6,xmm7 - paddd xmm5,xmm4 - pcmpeqd xmm4,xmm8 - paddd xmm6,xmm5 - pcmpeqd xmm5,xmm8 - paddd xmm7,xmm6 - pcmpeqd xmm6,xmm8 - pcmpeqd xmm7,xmm8 - - movdqa xmm8,XMMWORD[rdx] - movdqa xmm9,XMMWORD[16+rdx] - movdqa xmm10,XMMWORD[32+rdx] - movdqa xmm11,XMMWORD[48+rdx] - pand xmm8,xmm0 - movdqa xmm12,XMMWORD[64+rdx] - pand xmm9,xmm1 - movdqa xmm13,XMMWORD[80+rdx] - pand xmm10,xmm2 - movdqa xmm14,XMMWORD[96+rdx] - pand xmm11,xmm3 - movdqa xmm15,XMMWORD[112+rdx] - lea rbp,[128+rdx] - pand xmm12,xmm4 - pand xmm13,xmm5 - pand xmm14,xmm6 - pand xmm15,xmm7 - por xmm8,xmm10 - por xmm9,xmm11 - por xmm8,xmm12 - por xmm9,xmm13 - por xmm8,xmm14 - por xmm9,xmm15 - - por xmm8,xmm9 - pshufd xmm9,xmm8,0x4e - por xmm8,xmm9 -DB 102,76,15,126,195 - - mov QWORD[128+rsp],r8 - mov QWORD[((128+8))+rsp],rdi - mov QWORD[((128+16))+rsp],rcx - - mov rax,QWORD[rsi] - mov rcx,QWORD[8+rsi] - mul rbx - mov QWORD[rsp],rax - mov rax,rcx - mov r8,rdx - - 
mul rbx - add r8,rax - mov rax,QWORD[16+rsi] - mov r9,rdx - adc r9,0 - - mul rbx - add r9,rax - mov rax,QWORD[24+rsi] - mov r10,rdx - adc r10,0 - - mul rbx - add r10,rax - mov rax,QWORD[32+rsi] - mov r11,rdx - adc r11,0 - - mul rbx - add r11,rax - mov rax,QWORD[40+rsi] - mov r12,rdx - adc r12,0 - - mul rbx - add r12,rax - mov rax,QWORD[48+rsi] - mov r13,rdx - adc r13,0 - - mul rbx - add r13,rax - mov rax,QWORD[56+rsi] - mov r14,rdx - adc r14,0 - - mul rbx - add r14,rax - mov rax,QWORD[rsi] - mov r15,rdx - adc r15,0 - - lea rdi,[8+rsp] - mov ecx,7 - jmp NEAR $L$oop_mul_gather - -ALIGN 32 -$L$oop_mul_gather: - movdqa xmm8,XMMWORD[rbp] - movdqa xmm9,XMMWORD[16+rbp] - movdqa xmm10,XMMWORD[32+rbp] - movdqa xmm11,XMMWORD[48+rbp] - pand xmm8,xmm0 - movdqa xmm12,XMMWORD[64+rbp] - pand xmm9,xmm1 - movdqa xmm13,XMMWORD[80+rbp] - pand xmm10,xmm2 - movdqa xmm14,XMMWORD[96+rbp] - pand xmm11,xmm3 - movdqa xmm15,XMMWORD[112+rbp] - lea rbp,[128+rbp] - pand xmm12,xmm4 - pand xmm13,xmm5 - pand xmm14,xmm6 - pand xmm15,xmm7 - por xmm8,xmm10 - por xmm9,xmm11 - por xmm8,xmm12 - por xmm9,xmm13 - por xmm8,xmm14 - por xmm9,xmm15 - - por xmm8,xmm9 - pshufd xmm9,xmm8,0x4e - por xmm8,xmm9 -DB 102,76,15,126,195 - - mul rbx - add r8,rax - mov rax,QWORD[8+rsi] - mov QWORD[rdi],r8 - mov r8,rdx - adc r8,0 - - mul rbx - add r9,rax - mov rax,QWORD[16+rsi] - adc rdx,0 - add r8,r9 - mov r9,rdx - adc r9,0 - - mul rbx - add r10,rax - mov rax,QWORD[24+rsi] - adc rdx,0 - add r9,r10 - mov r10,rdx - adc r10,0 - - mul rbx - add r11,rax - mov rax,QWORD[32+rsi] - adc rdx,0 - add r10,r11 - mov r11,rdx - adc r11,0 - - mul rbx - add r12,rax - mov rax,QWORD[40+rsi] - adc rdx,0 - add r11,r12 - mov r12,rdx - adc r12,0 - - mul rbx - add r13,rax - mov rax,QWORD[48+rsi] - adc rdx,0 - add r12,r13 - mov r13,rdx - adc r13,0 - - mul rbx - add r14,rax - mov rax,QWORD[56+rsi] - adc rdx,0 - add r13,r14 - mov r14,rdx - adc r14,0 - - mul rbx - add r15,rax - mov rax,QWORD[rsi] - adc rdx,0 - add r14,r15 - mov r15,rdx - adc r15,0 - - lea rdi,[8+rdi] - - dec ecx - jnz NEAR $L$oop_mul_gather - - mov QWORD[rdi],r8 - mov QWORD[8+rdi],r9 - mov QWORD[16+rdi],r10 - mov QWORD[24+rdi],r11 - mov QWORD[32+rdi],r12 - mov QWORD[40+rdi],r13 - mov QWORD[48+rdi],r14 - mov QWORD[56+rdi],r15 - - mov rdi,QWORD[((128+8))+rsp] - mov rbp,QWORD[((128+16))+rsp] - - mov r8,QWORD[rsp] - mov r9,QWORD[8+rsp] - mov r10,QWORD[16+rsp] - mov r11,QWORD[24+rsp] - mov r12,QWORD[32+rsp] - mov r13,QWORD[40+rsp] - mov r14,QWORD[48+rsp] - mov r15,QWORD[56+rsp] - - call __rsaz_512_reduce - add r8,QWORD[64+rsp] - adc r9,QWORD[72+rsp] - adc r10,QWORD[80+rsp] - adc r11,QWORD[88+rsp] - adc r12,QWORD[96+rsp] - adc r13,QWORD[104+rsp] - adc r14,QWORD[112+rsp] - adc r15,QWORD[120+rsp] - sbb rcx,rcx - - call __rsaz_512_subtract - - lea rax,[((128+24+48))+rsp] - movaps xmm6,XMMWORD[((160-200))+rax] - movaps xmm7,XMMWORD[((176-200))+rax] - movaps xmm8,XMMWORD[((192-200))+rax] - movaps xmm9,XMMWORD[((208-200))+rax] - movaps xmm10,XMMWORD[((224-200))+rax] - movaps xmm11,XMMWORD[((240-200))+rax] - movaps xmm12,XMMWORD[((256-200))+rax] - movaps xmm13,XMMWORD[((272-200))+rax] - movaps xmm14,XMMWORD[((288-200))+rax] - movaps xmm15,XMMWORD[((304-200))+rax] - lea rax,[176+rax] - mov r15,QWORD[((-48))+rax] - mov r14,QWORD[((-40))+rax] - mov r13,QWORD[((-32))+rax] - mov r12,QWORD[((-24))+rax] - mov rbp,QWORD[((-16))+rax] - mov rbx,QWORD[((-8))+rax] - lea rsp,[rax] -$L$mul_gather4_epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret -$L$SEH_end_rsaz_512_mul_gather4: -global 
rsaz_512_mul_scatter4 - -ALIGN 32 -rsaz_512_mul_scatter4: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_rsaz_512_mul_scatter4: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - mov r9,QWORD[48+rsp] - - - push rbx - push rbp - push r12 - push r13 - push r14 - push r15 - - mov r9d,r9d - sub rsp,128+24 -$L$mul_scatter4_body: - lea r8,[r9*8+r8] -DB 102,72,15,110,199 -DB 102,72,15,110,202 -DB 102,73,15,110,208 - mov QWORD[128+rsp],rcx - - mov rbp,rdi - mov rbx,QWORD[rdi] - call __rsaz_512_mul - -DB 102,72,15,126,199 -DB 102,72,15,126,205 - - mov r8,QWORD[rsp] - mov r9,QWORD[8+rsp] - mov r10,QWORD[16+rsp] - mov r11,QWORD[24+rsp] - mov r12,QWORD[32+rsp] - mov r13,QWORD[40+rsp] - mov r14,QWORD[48+rsp] - mov r15,QWORD[56+rsp] - - call __rsaz_512_reduce - add r8,QWORD[64+rsp] - adc r9,QWORD[72+rsp] - adc r10,QWORD[80+rsp] - adc r11,QWORD[88+rsp] - adc r12,QWORD[96+rsp] - adc r13,QWORD[104+rsp] - adc r14,QWORD[112+rsp] - adc r15,QWORD[120+rsp] -DB 102,72,15,126,214 - sbb rcx,rcx - - call __rsaz_512_subtract - - mov QWORD[rsi],r8 - mov QWORD[128+rsi],r9 - mov QWORD[256+rsi],r10 - mov QWORD[384+rsi],r11 - mov QWORD[512+rsi],r12 - mov QWORD[640+rsi],r13 - mov QWORD[768+rsi],r14 - mov QWORD[896+rsi],r15 - - lea rax,[((128+24+48))+rsp] - mov r15,QWORD[((-48))+rax] - mov r14,QWORD[((-40))+rax] - mov r13,QWORD[((-32))+rax] - mov r12,QWORD[((-24))+rax] - mov rbp,QWORD[((-16))+rax] - mov rbx,QWORD[((-8))+rax] - lea rsp,[rax] -$L$mul_scatter4_epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret -$L$SEH_end_rsaz_512_mul_scatter4: -global rsaz_512_mul_by_one - -ALIGN 32 -rsaz_512_mul_by_one: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_rsaz_512_mul_by_one: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - - - push rbx - push rbp - push r12 - push r13 - push r14 - push r15 - - sub rsp,128+24 -$L$mul_by_one_body: - mov rbp,rdx - mov QWORD[128+rsp],rcx - - mov r8,QWORD[rsi] - pxor xmm0,xmm0 - mov r9,QWORD[8+rsi] - mov r10,QWORD[16+rsi] - mov r11,QWORD[24+rsi] - mov r12,QWORD[32+rsi] - mov r13,QWORD[40+rsi] - mov r14,QWORD[48+rsi] - mov r15,QWORD[56+rsi] - - movdqa XMMWORD[rsp],xmm0 - movdqa XMMWORD[16+rsp],xmm0 - movdqa XMMWORD[32+rsp],xmm0 - movdqa XMMWORD[48+rsp],xmm0 - movdqa XMMWORD[64+rsp],xmm0 - movdqa XMMWORD[80+rsp],xmm0 - movdqa XMMWORD[96+rsp],xmm0 - call __rsaz_512_reduce - mov QWORD[rdi],r8 - mov QWORD[8+rdi],r9 - mov QWORD[16+rdi],r10 - mov QWORD[24+rdi],r11 - mov QWORD[32+rdi],r12 - mov QWORD[40+rdi],r13 - mov QWORD[48+rdi],r14 - mov QWORD[56+rdi],r15 - - lea rax,[((128+24+48))+rsp] - mov r15,QWORD[((-48))+rax] - mov r14,QWORD[((-40))+rax] - mov r13,QWORD[((-32))+rax] - mov r12,QWORD[((-24))+rax] - mov rbp,QWORD[((-16))+rax] - mov rbx,QWORD[((-8))+rax] - lea rsp,[rax] -$L$mul_by_one_epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret -$L$SEH_end_rsaz_512_mul_by_one: - -ALIGN 32 -__rsaz_512_reduce: - mov rbx,r8 - imul rbx,QWORD[((128+8))+rsp] - mov rax,QWORD[rbp] - mov ecx,8 - jmp NEAR $L$reduction_loop - -ALIGN 32 -$L$reduction_loop: - mul rbx - mov rax,QWORD[8+rbp] - neg r8 - mov r8,rdx - adc r8,0 - - mul rbx - add r9,rax - mov rax,QWORD[16+rbp] - adc rdx,0 - add r8,r9 - mov r9,rdx - adc r9,0 - - mul rbx - add r10,rax - mov rax,QWORD[24+rbp] - adc rdx,0 - add r9,r10 - mov r10,rdx - adc r10,0 - - mul rbx - add r11,rax - mov rax,QWORD[32+rbp] - adc rdx,0 - add r10,r11 - mov 
rsi,QWORD[((128+8))+rsp] - - - adc rdx,0 - mov r11,rdx - - mul rbx - add r12,rax - mov rax,QWORD[40+rbp] - adc rdx,0 - imul rsi,r8 - add r11,r12 - mov r12,rdx - adc r12,0 - - mul rbx - add r13,rax - mov rax,QWORD[48+rbp] - adc rdx,0 - add r12,r13 - mov r13,rdx - adc r13,0 - - mul rbx - add r14,rax - mov rax,QWORD[56+rbp] - adc rdx,0 - add r13,r14 - mov r14,rdx - adc r14,0 - - mul rbx - mov rbx,rsi - add r15,rax - mov rax,QWORD[rbp] - adc rdx,0 - add r14,r15 - mov r15,rdx - adc r15,0 - - dec ecx - jne NEAR $L$reduction_loop - - DB 0F3h,0C3h ;repret - - -ALIGN 32 -__rsaz_512_subtract: - mov QWORD[rdi],r8 - mov QWORD[8+rdi],r9 - mov QWORD[16+rdi],r10 - mov QWORD[24+rdi],r11 - mov QWORD[32+rdi],r12 - mov QWORD[40+rdi],r13 - mov QWORD[48+rdi],r14 - mov QWORD[56+rdi],r15 - - mov r8,QWORD[rbp] - mov r9,QWORD[8+rbp] - neg r8 - not r9 - and r8,rcx - mov r10,QWORD[16+rbp] - and r9,rcx - not r10 - mov r11,QWORD[24+rbp] - and r10,rcx - not r11 - mov r12,QWORD[32+rbp] - and r11,rcx - not r12 - mov r13,QWORD[40+rbp] - and r12,rcx - not r13 - mov r14,QWORD[48+rbp] - and r13,rcx - not r14 - mov r15,QWORD[56+rbp] - and r14,rcx - not r15 - and r15,rcx - - add r8,QWORD[rdi] - adc r9,QWORD[8+rdi] - adc r10,QWORD[16+rdi] - adc r11,QWORD[24+rdi] - adc r12,QWORD[32+rdi] - adc r13,QWORD[40+rdi] - adc r14,QWORD[48+rdi] - adc r15,QWORD[56+rdi] - - mov QWORD[rdi],r8 - mov QWORD[8+rdi],r9 - mov QWORD[16+rdi],r10 - mov QWORD[24+rdi],r11 - mov QWORD[32+rdi],r12 - mov QWORD[40+rdi],r13 - mov QWORD[48+rdi],r14 - mov QWORD[56+rdi],r15 - - DB 0F3h,0C3h ;repret - - -ALIGN 32 -__rsaz_512_mul: - lea rdi,[8+rsp] - - mov rax,QWORD[rsi] - mul rbx - mov QWORD[rdi],rax - mov rax,QWORD[8+rsi] - mov r8,rdx - - mul rbx - add r8,rax - mov rax,QWORD[16+rsi] - mov r9,rdx - adc r9,0 - - mul rbx - add r9,rax - mov rax,QWORD[24+rsi] - mov r10,rdx - adc r10,0 - - mul rbx - add r10,rax - mov rax,QWORD[32+rsi] - mov r11,rdx - adc r11,0 - - mul rbx - add r11,rax - mov rax,QWORD[40+rsi] - mov r12,rdx - adc r12,0 - - mul rbx - add r12,rax - mov rax,QWORD[48+rsi] - mov r13,rdx - adc r13,0 - - mul rbx - add r13,rax - mov rax,QWORD[56+rsi] - mov r14,rdx - adc r14,0 - - mul rbx - add r14,rax - mov rax,QWORD[rsi] - mov r15,rdx - adc r15,0 - - lea rbp,[8+rbp] - lea rdi,[8+rdi] - - mov ecx,7 - jmp NEAR $L$oop_mul - -ALIGN 32 -$L$oop_mul: - mov rbx,QWORD[rbp] - mul rbx - add r8,rax - mov rax,QWORD[8+rsi] - mov QWORD[rdi],r8 - mov r8,rdx - adc r8,0 - - mul rbx - add r9,rax - mov rax,QWORD[16+rsi] - adc rdx,0 - add r8,r9 - mov r9,rdx - adc r9,0 - - mul rbx - add r10,rax - mov rax,QWORD[24+rsi] - adc rdx,0 - add r9,r10 - mov r10,rdx - adc r10,0 - - mul rbx - add r11,rax - mov rax,QWORD[32+rsi] - adc rdx,0 - add r10,r11 - mov r11,rdx - adc r11,0 - - mul rbx - add r12,rax - mov rax,QWORD[40+rsi] - adc rdx,0 - add r11,r12 - mov r12,rdx - adc r12,0 - - mul rbx - add r13,rax - mov rax,QWORD[48+rsi] - adc rdx,0 - add r12,r13 - mov r13,rdx - adc r13,0 - - mul rbx - add r14,rax - mov rax,QWORD[56+rsi] - adc rdx,0 - add r13,r14 - mov r14,rdx - lea rbp,[8+rbp] - adc r14,0 - - mul rbx - add r15,rax - mov rax,QWORD[rsi] - adc rdx,0 - add r14,r15 - mov r15,rdx - adc r15,0 - - lea rdi,[8+rdi] - - dec ecx - jnz NEAR $L$oop_mul - - mov QWORD[rdi],r8 - mov QWORD[8+rdi],r9 - mov QWORD[16+rdi],r10 - mov QWORD[24+rdi],r11 - mov QWORD[32+rdi],r12 - mov QWORD[40+rdi],r13 - mov QWORD[48+rdi],r14 - mov QWORD[56+rdi],r15 - - DB 0F3h,0C3h ;repret - -global rsaz_512_scatter4 - -ALIGN 16 -rsaz_512_scatter4: - lea rcx,[r8*8+rcx] - mov r9d,8 - jmp NEAR $L$oop_scatter -ALIGN 16 
-$L$oop_scatter: - mov rax,QWORD[rdx] - lea rdx,[8+rdx] - mov QWORD[rcx],rax - lea rcx,[128+rcx] - dec r9d - jnz NEAR $L$oop_scatter - DB 0F3h,0C3h ;repret - - -global rsaz_512_gather4 - -ALIGN 16 -rsaz_512_gather4: -$L$SEH_begin_rsaz_512_gather4: -DB 0x48,0x81,0xec,0xa8,0x00,0x00,0x00 -DB 0x0f,0x29,0x34,0x24 -DB 0x0f,0x29,0x7c,0x24,0x10 -DB 0x44,0x0f,0x29,0x44,0x24,0x20 -DB 0x44,0x0f,0x29,0x4c,0x24,0x30 -DB 0x44,0x0f,0x29,0x54,0x24,0x40 -DB 0x44,0x0f,0x29,0x5c,0x24,0x50 -DB 0x44,0x0f,0x29,0x64,0x24,0x60 -DB 0x44,0x0f,0x29,0x6c,0x24,0x70 -DB 0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0 -DB 0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0 - movd xmm8,r8d - movdqa xmm1,XMMWORD[(($L$inc+16))] - movdqa xmm0,XMMWORD[$L$inc] - - pshufd xmm8,xmm8,0 - movdqa xmm7,xmm1 - movdqa xmm2,xmm1 - paddd xmm1,xmm0 - pcmpeqd xmm0,xmm8 - movdqa xmm3,xmm7 - paddd xmm2,xmm1 - pcmpeqd xmm1,xmm8 - movdqa xmm4,xmm7 - paddd xmm3,xmm2 - pcmpeqd xmm2,xmm8 - movdqa xmm5,xmm7 - paddd xmm4,xmm3 - pcmpeqd xmm3,xmm8 - movdqa xmm6,xmm7 - paddd xmm5,xmm4 - pcmpeqd xmm4,xmm8 - paddd xmm6,xmm5 - pcmpeqd xmm5,xmm8 - paddd xmm7,xmm6 - pcmpeqd xmm6,xmm8 - pcmpeqd xmm7,xmm8 - mov r9d,8 - jmp NEAR $L$oop_gather -ALIGN 16 -$L$oop_gather: - movdqa xmm8,XMMWORD[rdx] - movdqa xmm9,XMMWORD[16+rdx] - movdqa xmm10,XMMWORD[32+rdx] - movdqa xmm11,XMMWORD[48+rdx] - pand xmm8,xmm0 - movdqa xmm12,XMMWORD[64+rdx] - pand xmm9,xmm1 - movdqa xmm13,XMMWORD[80+rdx] - pand xmm10,xmm2 - movdqa xmm14,XMMWORD[96+rdx] - pand xmm11,xmm3 - movdqa xmm15,XMMWORD[112+rdx] - lea rdx,[128+rdx] - pand xmm12,xmm4 - pand xmm13,xmm5 - pand xmm14,xmm6 - pand xmm15,xmm7 - por xmm8,xmm10 - por xmm9,xmm11 - por xmm8,xmm12 - por xmm9,xmm13 - por xmm8,xmm14 - por xmm9,xmm15 - - por xmm8,xmm9 - pshufd xmm9,xmm8,0x4e - por xmm8,xmm9 - movq QWORD[rcx],xmm8 - lea rcx,[8+rcx] - dec r9d - jnz NEAR $L$oop_gather - movaps xmm6,XMMWORD[rsp] - movaps xmm7,XMMWORD[16+rsp] - movaps xmm8,XMMWORD[32+rsp] - movaps xmm9,XMMWORD[48+rsp] - movaps xmm10,XMMWORD[64+rsp] - movaps xmm11,XMMWORD[80+rsp] - movaps xmm12,XMMWORD[96+rsp] - movaps xmm13,XMMWORD[112+rsp] - movaps xmm14,XMMWORD[128+rsp] - movaps xmm15,XMMWORD[144+rsp] - add rsp,0xa8 - DB 0F3h,0C3h ;repret -$L$SEH_end_rsaz_512_gather4: - - -ALIGN 64 -$L$inc: - DD 0,0,1,1 - DD 2,2,2,2 -EXTERN __imp_RtlVirtualUnwind - -ALIGN 16 -se_handler: - push rsi - push rdi - push rbx - push rbp - push r12 - push r13 - push r14 - push r15 - pushfq - sub rsp,64 - - mov rax,QWORD[120+r8] - mov rbx,QWORD[248+r8] - - mov rsi,QWORD[8+r9] - mov r11,QWORD[56+r9] - - mov r10d,DWORD[r11] - lea r10,[r10*1+rsi] - cmp rbx,r10 - jb NEAR $L$common_seh_tail - - mov rax,QWORD[152+r8] - - mov r10d,DWORD[4+r11] - lea r10,[r10*1+rsi] - cmp rbx,r10 - jae NEAR $L$common_seh_tail - - lea rax,[((128+24+48))+rax] - - lea rbx,[$L$mul_gather4_epilogue] - cmp rbx,r10 - jne NEAR $L$se_not_in_mul_gather4 - - lea rax,[176+rax] - - lea rsi,[((-48-168))+rax] - lea rdi,[512+r8] - mov ecx,20 - DD 0xa548f3fc - -$L$se_not_in_mul_gather4: - mov rbx,QWORD[((-8))+rax] - mov rbp,QWORD[((-16))+rax] - mov r12,QWORD[((-24))+rax] - mov r13,QWORD[((-32))+rax] - mov r14,QWORD[((-40))+rax] - mov r15,QWORD[((-48))+rax] - mov QWORD[144+r8],rbx - mov QWORD[160+r8],rbp - mov QWORD[216+r8],r12 - mov QWORD[224+r8],r13 - mov QWORD[232+r8],r14 - mov QWORD[240+r8],r15 - -$L$common_seh_tail: - mov rdi,QWORD[8+rax] - mov rsi,QWORD[16+rax] - mov QWORD[152+r8],rax - mov QWORD[168+r8],rsi - mov QWORD[176+r8],rdi - - mov rdi,QWORD[40+r9] - mov rsi,r8 - mov ecx,154 - DD 0xa548f3fc - - mov rsi,r9 - xor rcx,rcx - mov 
rdx,QWORD[8+rsi] - mov r8,QWORD[rsi] - mov r9,QWORD[16+rsi] - mov r10,QWORD[40+rsi] - lea r11,[56+rsi] - lea r12,[24+rsi] - mov QWORD[32+rsp],r10 - mov QWORD[40+rsp],r11 - mov QWORD[48+rsp],r12 - mov QWORD[56+rsp],rcx - call QWORD[__imp_RtlVirtualUnwind] - - mov eax,1 - add rsp,64 - popfq - pop r15 - pop r14 - pop r13 - pop r12 - pop rbp - pop rbx - pop rdi - pop rsi - DB 0F3h,0C3h ;repret - - -section .pdata rdata align=4 -ALIGN 4 - DD $L$SEH_begin_rsaz_512_sqr wrt ..imagebase - DD $L$SEH_end_rsaz_512_sqr wrt ..imagebase - DD $L$SEH_info_rsaz_512_sqr wrt ..imagebase - - DD $L$SEH_begin_rsaz_512_mul wrt ..imagebase - DD $L$SEH_end_rsaz_512_mul wrt ..imagebase - DD $L$SEH_info_rsaz_512_mul wrt ..imagebase - - DD $L$SEH_begin_rsaz_512_mul_gather4 wrt ..imagebase - DD $L$SEH_end_rsaz_512_mul_gather4 wrt ..imagebase - DD $L$SEH_info_rsaz_512_mul_gather4 wrt ..imagebase - - DD $L$SEH_begin_rsaz_512_mul_scatter4 wrt ..imagebase - DD $L$SEH_end_rsaz_512_mul_scatter4 wrt ..imagebase - DD $L$SEH_info_rsaz_512_mul_scatter4 wrt ..imagebase - - DD $L$SEH_begin_rsaz_512_mul_by_one wrt ..imagebase - DD $L$SEH_end_rsaz_512_mul_by_one wrt ..imagebase - DD $L$SEH_info_rsaz_512_mul_by_one wrt ..imagebase - - DD $L$SEH_begin_rsaz_512_gather4 wrt ..imagebase - DD $L$SEH_end_rsaz_512_gather4 wrt ..imagebase - DD $L$SEH_info_rsaz_512_gather4 wrt ..imagebase - -section .xdata rdata align=8 -ALIGN 8 -$L$SEH_info_rsaz_512_sqr: -DB 9,0,0,0 - DD se_handler wrt ..imagebase - DD $L$sqr_body wrt ..imagebase,$L$sqr_epilogue wrt ..imagebase -$L$SEH_info_rsaz_512_mul: -DB 9,0,0,0 - DD se_handler wrt ..imagebase - DD $L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase -$L$SEH_info_rsaz_512_mul_gather4: -DB 9,0,0,0 - DD se_handler wrt ..imagebase - DD $L$mul_gather4_body wrt ..imagebase,$L$mul_gather4_epilogue wrt ..imagebase -$L$SEH_info_rsaz_512_mul_scatter4: -DB 9,0,0,0 - DD se_handler wrt ..imagebase - DD $L$mul_scatter4_body wrt ..imagebase,$L$mul_scatter4_epilogue wrt ..imagebase -$L$SEH_info_rsaz_512_mul_by_one: -DB 9,0,0,0 - DD se_handler wrt ..imagebase - DD $L$mul_by_one_body wrt ..imagebase,$L$mul_by_one_epilogue wrt ..imagebase -$L$SEH_info_rsaz_512_gather4: -DB 0x01,0x46,0x16,0x00 -DB 0x46,0xf8,0x09,0x00 -DB 0x3d,0xe8,0x08,0x00 -DB 0x34,0xd8,0x07,0x00 -DB 0x2e,0xc8,0x06,0x00 -DB 0x28,0xb8,0x05,0x00 -DB 0x22,0xa8,0x04,0x00 -DB 0x1c,0x98,0x03,0x00 -DB 0x16,0x88,0x02,0x00 -DB 0x10,0x78,0x01,0x00 -DB 0x0b,0x68,0x00,0x00 -DB 0x07,0x01,0x15,0x00 diff --git a/packager/third_party/boringssl/win-x86_64/crypto/chacha/chacha-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/chacha/chacha-x86_64.asm index 0ecbe95682..cb36246891 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/chacha/chacha-x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/chacha/chacha-x86_64.asm @@ -27,6 +27,15 @@ DB 0x3,0x0,0x1,0x2,0x7,0x4,0x5,0x6,0xb,0x8,0x9,0xa,0xf,0xc,0xd,0xe $L$sigma: DB 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107 DB 0 +ALIGN 64 +$L$zeroz: + DD 0,0,0,0,1,0,0,0,2,0,0,0,3,0,0,0 +$L$fourz: + DD 4,0,0,0,4,0,0,0,4,0,0,0,4,0,0,0 +$L$incz: + DD 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 +$L$sixteen: + DD 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 DB 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54 DB 95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32 DB 98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115 @@ -59,6 +68,7 @@ $L$SEH_begin_ChaCha20_ctr32: push r14 push r15 sub rsp,64+24 +$L$ctr32_body: movdqu xmm1,XMMWORD[rcx] @@ -296,13 +306,14 @@ $L$oop_tail: jnz NEAR 
$L$oop_tail $L$done: - add rsp,64+24 - pop r15 - pop r14 - pop r13 - pop r12 - pop rbp - pop rbx + lea rsi,[((64+24+48))+rsp] + mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] $L$no_data: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] @@ -323,20 +334,15 @@ $L$SEH_begin_ChaCha20_ssse3: $L$ChaCha20_ssse3: + mov r9,rsp cmp rdx,128 ja NEAR $L$ChaCha20_4x $L$do_sse3_after_all: - push rbx - push rbp - push r12 - push r13 - push r14 - push r15 - - sub rsp,64+72 - movaps XMMWORD[(64+32)+rsp],xmm6 - movaps XMMWORD[(64+48)+rsp],xmm7 + sub rsp,64+40 + movaps XMMWORD[(-40)+r9],xmm6 + movaps XMMWORD[(-24)+r9],xmm7 +$L$ssse3_body: movdqa xmm0,XMMWORD[$L$sigma] movdqu xmm1,XMMWORD[rcx] movdqu xmm2,XMMWORD[16+rcx] @@ -348,7 +354,7 @@ $L$do_sse3_after_all: movdqa XMMWORD[16+rsp],xmm1 movdqa XMMWORD[32+rsp],xmm2 movdqa XMMWORD[48+rsp],xmm3 - mov ebp,10 + mov r8,10 jmp NEAR $L$oop_ssse3 ALIGN 32 @@ -358,7 +364,7 @@ $L$oop_outer_ssse3: movdqa xmm1,XMMWORD[16+rsp] movdqa xmm2,XMMWORD[32+rsp] paddd xmm3,XMMWORD[48+rsp] - mov ebp,10 + mov r8,10 movdqa XMMWORD[48+rsp],xmm3 jmp NEAR $L$oop_ssse3 @@ -407,7 +413,7 @@ DB 102,15,56,0,223 pshufd xmm2,xmm2,78 pshufd xmm1,xmm1,147 pshufd xmm3,xmm3,57 - dec ebp + dec r8 jnz NEAR $L$oop_ssse3 paddd xmm0,XMMWORD[rsp] paddd xmm1,XMMWORD[16+rsp] @@ -444,27 +450,22 @@ $L$tail_ssse3: movdqa XMMWORD[16+rsp],xmm1 movdqa XMMWORD[32+rsp],xmm2 movdqa XMMWORD[48+rsp],xmm3 - xor rbx,rbx + xor r8,r8 $L$oop_tail_ssse3: - movzx eax,BYTE[rbx*1+rsi] - movzx ecx,BYTE[rbx*1+rsp] - lea rbx,[1+rbx] + movzx eax,BYTE[r8*1+rsi] + movzx ecx,BYTE[r8*1+rsp] + lea r8,[1+r8] xor eax,ecx - mov BYTE[((-1))+rbx*1+rdi],al + mov BYTE[((-1))+r8*1+rdi],al dec rdx jnz NEAR $L$oop_tail_ssse3 $L$done_ssse3: - movaps xmm6,XMMWORD[((64+32))+rsp] - movaps xmm7,XMMWORD[((64+48))+rsp] - add rsp,64+72 - pop r15 - pop r14 - pop r13 - pop r12 - pop rbp - pop rbx + movaps xmm6,XMMWORD[((-40))+r9] + movaps xmm7,XMMWORD[((-24))+r9] + lea rsp,[r9] +$L$ssse3_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret @@ -484,6 +485,7 @@ $L$SEH_begin_ChaCha20_4x: $L$ChaCha20_4x: + mov r9,rsp mov r11,r10 shr r10,32 test r10,32 @@ -496,18 +498,18 @@ $L$ChaCha20_4x: je NEAR $L$do_sse3_after_all $L$proceed4x: - lea r11,[((-120))+rsp] - sub rsp,0x148+160 - movaps XMMWORD[(-48)+r11],xmm6 - movaps XMMWORD[(-32)+r11],xmm7 - movaps XMMWORD[(-16)+r11],xmm8 - movaps XMMWORD[r11],xmm9 - movaps XMMWORD[16+r11],xmm10 - movaps XMMWORD[32+r11],xmm11 - movaps XMMWORD[48+r11],xmm12 - movaps XMMWORD[64+r11],xmm13 - movaps XMMWORD[80+r11],xmm14 - movaps XMMWORD[96+r11],xmm15 + sub rsp,0x140+168 + movaps XMMWORD[(-168)+r9],xmm6 + movaps XMMWORD[(-152)+r9],xmm7 + movaps XMMWORD[(-136)+r9],xmm8 + movaps XMMWORD[(-120)+r9],xmm9 + movaps XMMWORD[(-104)+r9],xmm10 + movaps XMMWORD[(-88)+r9],xmm11 + movaps XMMWORD[(-72)+r9],xmm12 + movaps XMMWORD[(-56)+r9],xmm13 + movaps XMMWORD[(-40)+r9],xmm14 + movaps XMMWORD[(-24)+r9],xmm15 +$L$4x_body: movdqa xmm11,XMMWORD[$L$sigma] movdqu xmm15,XMMWORD[rcx] movdqu xmm7,XMMWORD[16+rcx] @@ -1034,18 +1036,18 @@ $L$oop_tail4x: jnz NEAR $L$oop_tail4x $L$done4x: - lea r11,[((320+48))+rsp] - movaps xmm6,XMMWORD[((-48))+r11] - movaps xmm7,XMMWORD[((-32))+r11] - movaps xmm8,XMMWORD[((-16))+r11] - movaps xmm9,XMMWORD[r11] - movaps xmm10,XMMWORD[16+r11] - movaps xmm11,XMMWORD[32+r11] - movaps xmm12,XMMWORD[48+r11] - movaps xmm13,XMMWORD[64+r11] - 
movaps xmm14,XMMWORD[80+r11] - movaps xmm15,XMMWORD[96+r11] - add rsp,0x148+160 + movaps xmm6,XMMWORD[((-168))+r9] + movaps xmm7,XMMWORD[((-152))+r9] + movaps xmm8,XMMWORD[((-136))+r9] + movaps xmm9,XMMWORD[((-120))+r9] + movaps xmm10,XMMWORD[((-104))+r9] + movaps xmm11,XMMWORD[((-88))+r9] + movaps xmm12,XMMWORD[((-72))+r9] + movaps xmm13,XMMWORD[((-56))+r9] + movaps xmm14,XMMWORD[((-40))+r9] + movaps xmm15,XMMWORD[((-24))+r9] + lea rsp,[r9] +$L$4x_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret @@ -1065,22 +1067,21 @@ $L$SEH_begin_ChaCha20_8x: $L$ChaCha20_8x: - mov r10,rsp - sub rsp,0x280+176 + mov r9,rsp + sub rsp,0x280+168 and rsp,-32 - lea r11,[((656+48))+rsp] - movaps XMMWORD[(-48)+r11],xmm6 - movaps XMMWORD[(-32)+r11],xmm7 - movaps XMMWORD[(-16)+r11],xmm8 - movaps XMMWORD[r11],xmm9 - movaps XMMWORD[16+r11],xmm10 - movaps XMMWORD[32+r11],xmm11 - movaps XMMWORD[48+r11],xmm12 - movaps XMMWORD[64+r11],xmm13 - movaps XMMWORD[80+r11],xmm14 - movaps XMMWORD[96+r11],xmm15 + movaps XMMWORD[(-168)+r9],xmm6 + movaps XMMWORD[(-152)+r9],xmm7 + movaps XMMWORD[(-136)+r9],xmm8 + movaps XMMWORD[(-120)+r9],xmm9 + movaps XMMWORD[(-104)+r9],xmm10 + movaps XMMWORD[(-88)+r9],xmm11 + movaps XMMWORD[(-72)+r9],xmm12 + movaps XMMWORD[(-56)+r9],xmm13 + movaps XMMWORD[(-40)+r9],xmm14 + movaps XMMWORD[(-24)+r9],xmm15 +$L$8x_body: vzeroupper - mov QWORD[640+rsp],r10 @@ -1091,10 +1092,10 @@ $L$ChaCha20_8x: - vbroadcasti128 ymm11,YMMWORD[$L$sigma] - vbroadcasti128 ymm3,YMMWORD[rcx] - vbroadcasti128 ymm15,YMMWORD[16+rcx] - vbroadcasti128 ymm7,YMMWORD[r8] + vbroadcasti128 ymm11,XMMWORD[$L$sigma] + vbroadcasti128 ymm3,XMMWORD[rcx] + vbroadcasti128 ymm15,XMMWORD[16+rcx] + vbroadcasti128 ymm7,XMMWORD[r8] lea rcx,[256+rsp] lea rax,[512+rsp] lea r10,[$L$rot16] @@ -1161,7 +1162,7 @@ $L$oop_outer8x: $L$oop_enter8x: vmovdqa YMMWORD[64+rsp],ymm14 vmovdqa YMMWORD[96+rsp],ymm15 - vbroadcasti128 ymm15,YMMWORD[r10] + vbroadcasti128 ymm15,XMMWORD[r10] vmovdqa YMMWORD[(512-512)+rax],ymm4 mov eax,10 jmp NEAR $L$oop8x @@ -1179,7 +1180,7 @@ $L$oop8x: vpslld ymm14,ymm0,12 vpsrld ymm0,ymm0,20 vpor ymm0,ymm14,ymm0 - vbroadcasti128 ymm14,YMMWORD[r11] + vbroadcasti128 ymm14,XMMWORD[r11] vpaddd ymm13,ymm13,ymm5 vpxor ymm1,ymm13,ymm1 vpslld ymm15,ymm1,12 @@ -1196,7 +1197,7 @@ $L$oop8x: vpslld ymm15,ymm0,7 vpsrld ymm0,ymm0,25 vpor ymm0,ymm15,ymm0 - vbroadcasti128 ymm15,YMMWORD[r10] + vbroadcasti128 ymm15,XMMWORD[r10] vpaddd ymm13,ymm13,ymm5 vpxor ymm1,ymm13,ymm1 vpslld ymm14,ymm1,7 @@ -1217,7 +1218,7 @@ $L$oop8x: vpslld ymm14,ymm2,12 vpsrld ymm2,ymm2,20 vpor ymm2,ymm14,ymm2 - vbroadcasti128 ymm14,YMMWORD[r11] + vbroadcasti128 ymm14,XMMWORD[r11] vpaddd ymm13,ymm13,ymm7 vpxor ymm3,ymm13,ymm3 vpslld ymm15,ymm3,12 @@ -1234,7 +1235,7 @@ $L$oop8x: vpslld ymm15,ymm2,7 vpsrld ymm2,ymm2,25 vpor ymm2,ymm15,ymm2 - vbroadcasti128 ymm15,YMMWORD[r10] + vbroadcasti128 ymm15,XMMWORD[r10] vpaddd ymm13,ymm13,ymm7 vpxor ymm3,ymm13,ymm3 vpslld ymm14,ymm3,7 @@ -1251,7 +1252,7 @@ $L$oop8x: vpslld ymm14,ymm1,12 vpsrld ymm1,ymm1,20 vpor ymm1,ymm14,ymm1 - vbroadcasti128 ymm14,YMMWORD[r11] + vbroadcasti128 ymm14,XMMWORD[r11] vpaddd ymm13,ymm13,ymm4 vpxor ymm2,ymm13,ymm2 vpslld ymm15,ymm2,12 @@ -1268,7 +1269,7 @@ $L$oop8x: vpslld ymm15,ymm1,7 vpsrld ymm1,ymm1,25 vpor ymm1,ymm15,ymm1 - vbroadcasti128 ymm15,YMMWORD[r10] + vbroadcasti128 ymm15,XMMWORD[r10] vpaddd ymm13,ymm13,ymm4 vpxor ymm2,ymm13,ymm2 vpslld ymm14,ymm2,7 @@ -1289,7 +1290,7 @@ $L$oop8x: vpslld ymm14,ymm3,12 vpsrld ymm3,ymm3,20 vpor ymm3,ymm14,ymm3 - vbroadcasti128 
ymm14,YMMWORD[r11] + vbroadcasti128 ymm14,XMMWORD[r11] vpaddd ymm13,ymm13,ymm6 vpxor ymm0,ymm13,ymm0 vpslld ymm15,ymm0,12 @@ -1306,7 +1307,7 @@ $L$oop8x: vpslld ymm15,ymm3,7 vpsrld ymm3,ymm3,25 vpor ymm3,ymm15,ymm3 - vbroadcasti128 ymm15,YMMWORD[r10] + vbroadcasti128 ymm15,XMMWORD[r10] vpaddd ymm13,ymm13,ymm6 vpxor ymm0,ymm13,ymm0 vpslld ymm14,ymm0,7 @@ -1671,19 +1672,220 @@ $L$oop_tail8x: $L$done8x: vzeroall - lea r11,[((656+48))+rsp] - movaps xmm6,XMMWORD[((-48))+r11] - movaps xmm7,XMMWORD[((-32))+r11] - movaps xmm8,XMMWORD[((-16))+r11] - movaps xmm9,XMMWORD[r11] - movaps xmm10,XMMWORD[16+r11] - movaps xmm11,XMMWORD[32+r11] - movaps xmm12,XMMWORD[48+r11] - movaps xmm13,XMMWORD[64+r11] - movaps xmm14,XMMWORD[80+r11] - movaps xmm15,XMMWORD[96+r11] - mov rsp,QWORD[640+rsp] + movaps xmm6,XMMWORD[((-168))+r9] + movaps xmm7,XMMWORD[((-152))+r9] + movaps xmm8,XMMWORD[((-136))+r9] + movaps xmm9,XMMWORD[((-120))+r9] + movaps xmm10,XMMWORD[((-104))+r9] + movaps xmm11,XMMWORD[((-88))+r9] + movaps xmm12,XMMWORD[((-72))+r9] + movaps xmm13,XMMWORD[((-56))+r9] + movaps xmm14,XMMWORD[((-40))+r9] + movaps xmm15,XMMWORD[((-24))+r9] + lea rsp,[r9] +$L$8x_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret $L$SEH_end_ChaCha20_8x: +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + lea r10,[$L$ctr32_body] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + lea r10,[$L$no_data] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea rax,[((64+24+48))+rax] + + mov rbx,QWORD[((-8))+rax] + mov rbp,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + +$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + DB 0F3h,0C3h ;repret + + + +ALIGN 16 +ssse3_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[192+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea rsi,[((-40))+rax] + lea rdi,[512+r8] + mov ecx,4 + DD 0xa548f3fc + + jmp NEAR $L$common_seh_tail + + + +ALIGN 16 +full_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR 
$L$common_seh_tail + + mov rax,QWORD[192+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea rsi,[((-168))+rax] + lea rdi,[512+r8] + mov ecx,20 + DD 0xa548f3fc + + jmp NEAR $L$common_seh_tail + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_ChaCha20_ctr32 wrt ..imagebase + DD $L$SEH_end_ChaCha20_ctr32 wrt ..imagebase + DD $L$SEH_info_ChaCha20_ctr32 wrt ..imagebase + + DD $L$SEH_begin_ChaCha20_ssse3 wrt ..imagebase + DD $L$SEH_end_ChaCha20_ssse3 wrt ..imagebase + DD $L$SEH_info_ChaCha20_ssse3 wrt ..imagebase + + DD $L$SEH_begin_ChaCha20_4x wrt ..imagebase + DD $L$SEH_end_ChaCha20_4x wrt ..imagebase + DD $L$SEH_info_ChaCha20_4x wrt ..imagebase + DD $L$SEH_begin_ChaCha20_8x wrt ..imagebase + DD $L$SEH_end_ChaCha20_8x wrt ..imagebase + DD $L$SEH_info_ChaCha20_8x wrt ..imagebase +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_ChaCha20_ctr32: +DB 9,0,0,0 + DD se_handler wrt ..imagebase + +$L$SEH_info_ChaCha20_ssse3: +DB 9,0,0,0 + DD ssse3_handler wrt ..imagebase + DD $L$ssse3_body wrt ..imagebase,$L$ssse3_epilogue wrt ..imagebase + +$L$SEH_info_ChaCha20_4x: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$4x_body wrt ..imagebase,$L$4x_epilogue wrt ..imagebase +$L$SEH_info_ChaCha20_8x: +DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$8x_body wrt ..imagebase,$L$8x_epilogue wrt ..imagebase diff --git a/packager/third_party/boringssl/win-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.asm new file mode 100644 index 0000000000..56dc2060a4 --- /dev/null +++ b/packager/third_party/boringssl/win-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.asm @@ -0,0 +1,3270 @@ +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +section .data data align=8 + + +ALIGN 16 +one: + DQ 1,0 +two: + DQ 2,0 +three: + DQ 3,0 +four: + DQ 4,0 +five: + DQ 5,0 +six: + DQ 6,0 +seven: + DQ 7,0 +eight: + DQ 8,0 + +OR_MASK: + DD 0x00000000,0x00000000,0x00000000,0x80000000 +poly: + DQ 0x1,0xc200000000000000 +mask: + DD 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d +con1: + DD 1,1,1,1 +con2: + DD 0x1b,0x1b,0x1b,0x1b +con3: +DB -1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7 +and_mask: + DD 0,0xffffffff,0xffffffff,0xffffffff +section .text code align=64 + + +ALIGN 16 +GFMUL: + + vpclmulqdq xmm2,xmm0,xmm1,0x00 + vpclmulqdq xmm5,xmm0,xmm1,0x11 + vpclmulqdq xmm3,xmm0,xmm1,0x10 + vpclmulqdq xmm4,xmm0,xmm1,0x01 + vpxor xmm3,xmm3,xmm4 + vpslldq xmm4,xmm3,8 + vpsrldq xmm3,xmm3,8 + vpxor xmm2,xmm2,xmm4 + vpxor xmm5,xmm5,xmm3 + + vpclmulqdq xmm3,xmm2,XMMWORD[poly],0x10 + vpshufd xmm4,xmm2,78 + vpxor xmm2,xmm3,xmm4 + + vpclmulqdq xmm3,xmm2,XMMWORD[poly],0x10 + vpshufd xmm4,xmm2,78 + vpxor xmm2,xmm3,xmm4 + + vpxor xmm0,xmm2,xmm5 + DB 0F3h,0C3h ;repret + + +global aesgcmsiv_htable_init + +ALIGN 16 +aesgcmsiv_htable_init: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aesgcmsiv_htable_init: + mov rdi,rcx + mov rsi,rdx + + + + vmovdqa xmm0,XMMWORD[rsi] + vmovdqa xmm1,xmm0 + vmovdqa XMMWORD[rdi],xmm0 + call GFMUL + vmovdqa XMMWORD[16+rdi],xmm0 + call GFMUL + vmovdqa XMMWORD[32+rdi],xmm0 + call GFMUL + vmovdqa XMMWORD[48+rdi],xmm0 + call GFMUL + vmovdqa XMMWORD[64+rdi],xmm0 + call GFMUL + vmovdqa XMMWORD[80+rdi],xmm0 + call GFMUL + vmovdqa XMMWORD[96+rdi],xmm0 + call GFMUL + vmovdqa XMMWORD[112+rdi],xmm0 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_aesgcmsiv_htable_init: +global 
aesgcmsiv_htable6_init + +ALIGN 16 +aesgcmsiv_htable6_init: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aesgcmsiv_htable6_init: + mov rdi,rcx + mov rsi,rdx + + + + vmovdqa xmm0,XMMWORD[rsi] + vmovdqa xmm1,xmm0 + vmovdqa XMMWORD[rdi],xmm0 + call GFMUL + vmovdqa XMMWORD[16+rdi],xmm0 + call GFMUL + vmovdqa XMMWORD[32+rdi],xmm0 + call GFMUL + vmovdqa XMMWORD[48+rdi],xmm0 + call GFMUL + vmovdqa XMMWORD[64+rdi],xmm0 + call GFMUL + vmovdqa XMMWORD[80+rdi],xmm0 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_aesgcmsiv_htable6_init: +global aesgcmsiv_htable_polyval + +ALIGN 16 +aesgcmsiv_htable_polyval: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aesgcmsiv_htable_polyval: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + test rdx,rdx + jnz NEAR $L$htable_polyval_start + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$htable_polyval_start: + vzeroall + + + + mov r11,rdx + and r11,127 + + jz NEAR $L$htable_polyval_no_prefix + + vpxor xmm9,xmm9,xmm9 + vmovdqa xmm1,XMMWORD[rcx] + sub rdx,r11 + + sub r11,16 + + + vmovdqu xmm0,XMMWORD[rsi] + vpxor xmm0,xmm0,xmm1 + + vpclmulqdq xmm5,xmm0,XMMWORD[r11*1+rdi],0x01 + vpclmulqdq xmm3,xmm0,XMMWORD[r11*1+rdi],0x00 + vpclmulqdq xmm4,xmm0,XMMWORD[r11*1+rdi],0x11 + vpclmulqdq xmm6,xmm0,XMMWORD[r11*1+rdi],0x10 + vpxor xmm5,xmm5,xmm6 + + lea rsi,[16+rsi] + test r11,r11 + jnz NEAR $L$htable_polyval_prefix_loop + jmp NEAR $L$htable_polyval_prefix_complete + + +ALIGN 64 +$L$htable_polyval_prefix_loop: + sub r11,16 + + vmovdqu xmm0,XMMWORD[rsi] + + vpclmulqdq xmm6,xmm0,XMMWORD[r11*1+rdi],0x00 + vpxor xmm3,xmm3,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[r11*1+rdi],0x11 + vpxor xmm4,xmm4,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[r11*1+rdi],0x01 + vpxor xmm5,xmm5,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[r11*1+rdi],0x10 + vpxor xmm5,xmm5,xmm6 + + test r11,r11 + + lea rsi,[16+rsi] + + jnz NEAR $L$htable_polyval_prefix_loop + +$L$htable_polyval_prefix_complete: + vpsrldq xmm6,xmm5,8 + vpslldq xmm5,xmm5,8 + + vpxor xmm9,xmm4,xmm6 + vpxor xmm1,xmm3,xmm5 + + jmp NEAR $L$htable_polyval_main_loop + +$L$htable_polyval_no_prefix: + + + + + vpxor xmm1,xmm1,xmm1 + vmovdqa xmm9,XMMWORD[rcx] + +ALIGN 64 +$L$htable_polyval_main_loop: + sub rdx,0x80 + jb NEAR $L$htable_polyval_out + + vmovdqu xmm0,XMMWORD[112+rsi] + + vpclmulqdq xmm5,xmm0,XMMWORD[rdi],0x01 + vpclmulqdq xmm3,xmm0,XMMWORD[rdi],0x00 + vpclmulqdq xmm4,xmm0,XMMWORD[rdi],0x11 + vpclmulqdq xmm6,xmm0,XMMWORD[rdi],0x10 + vpxor xmm5,xmm5,xmm6 + + + vmovdqu xmm0,XMMWORD[96+rsi] + vpclmulqdq xmm6,xmm0,XMMWORD[16+rdi],0x01 + vpxor xmm5,xmm5,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[16+rdi],0x00 + vpxor xmm3,xmm3,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[16+rdi],0x11 + vpxor xmm4,xmm4,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[16+rdi],0x10 + vpxor xmm5,xmm5,xmm6 + + + + vmovdqu xmm0,XMMWORD[80+rsi] + + vpclmulqdq xmm7,xmm1,XMMWORD[poly],0x10 + vpalignr xmm1,xmm1,xmm1,8 + + vpclmulqdq xmm6,xmm0,XMMWORD[32+rdi],0x01 + vpxor xmm5,xmm5,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[32+rdi],0x00 + vpxor xmm3,xmm3,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[32+rdi],0x11 + vpxor xmm4,xmm4,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[32+rdi],0x10 + vpxor xmm5,xmm5,xmm6 + + + vpxor xmm1,xmm1,xmm7 + + vmovdqu xmm0,XMMWORD[64+rsi] + + vpclmulqdq xmm6,xmm0,XMMWORD[48+rdi],0x01 + vpxor xmm5,xmm5,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[48+rdi],0x00 + vpxor xmm3,xmm3,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[48+rdi],0x11 + vpxor 
xmm4,xmm4,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[48+rdi],0x10 + vpxor xmm5,xmm5,xmm6 + + + vmovdqu xmm0,XMMWORD[48+rsi] + + vpclmulqdq xmm7,xmm1,XMMWORD[poly],0x10 + vpalignr xmm1,xmm1,xmm1,8 + + vpclmulqdq xmm6,xmm0,XMMWORD[64+rdi],0x01 + vpxor xmm5,xmm5,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[64+rdi],0x00 + vpxor xmm3,xmm3,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[64+rdi],0x11 + vpxor xmm4,xmm4,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[64+rdi],0x10 + vpxor xmm5,xmm5,xmm6 + + + vpxor xmm1,xmm1,xmm7 + + vmovdqu xmm0,XMMWORD[32+rsi] + + vpclmulqdq xmm6,xmm0,XMMWORD[80+rdi],0x01 + vpxor xmm5,xmm5,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[80+rdi],0x00 + vpxor xmm3,xmm3,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[80+rdi],0x11 + vpxor xmm4,xmm4,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[80+rdi],0x10 + vpxor xmm5,xmm5,xmm6 + + + vpxor xmm1,xmm1,xmm9 + + vmovdqu xmm0,XMMWORD[16+rsi] + + vpclmulqdq xmm6,xmm0,XMMWORD[96+rdi],0x01 + vpxor xmm5,xmm5,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[96+rdi],0x00 + vpxor xmm3,xmm3,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[96+rdi],0x11 + vpxor xmm4,xmm4,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[96+rdi],0x10 + vpxor xmm5,xmm5,xmm6 + + + vmovdqu xmm0,XMMWORD[rsi] + vpxor xmm0,xmm0,xmm1 + + vpclmulqdq xmm6,xmm0,XMMWORD[112+rdi],0x01 + vpxor xmm5,xmm5,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[112+rdi],0x00 + vpxor xmm3,xmm3,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[112+rdi],0x11 + vpxor xmm4,xmm4,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[112+rdi],0x10 + vpxor xmm5,xmm5,xmm6 + + + vpsrldq xmm6,xmm5,8 + vpslldq xmm5,xmm5,8 + + vpxor xmm9,xmm4,xmm6 + vpxor xmm1,xmm3,xmm5 + + lea rsi,[128+rsi] + jmp NEAR $L$htable_polyval_main_loop + + + +$L$htable_polyval_out: + vpclmulqdq xmm6,xmm1,XMMWORD[poly],0x10 + vpalignr xmm1,xmm1,xmm1,8 + vpxor xmm1,xmm1,xmm6 + + vpclmulqdq xmm6,xmm1,XMMWORD[poly],0x10 + vpalignr xmm1,xmm1,xmm1,8 + vpxor xmm1,xmm1,xmm6 + vpxor xmm1,xmm1,xmm9 + + vmovdqu XMMWORD[rcx],xmm1 + vzeroupper + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_aesgcmsiv_htable_polyval: +global aesgcmsiv_polyval_horner + +ALIGN 16 +aesgcmsiv_polyval_horner: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aesgcmsiv_polyval_horner: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + test rcx,rcx + jnz NEAR $L$polyval_horner_start + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$polyval_horner_start: + + + + xor r10,r10 + shl rcx,4 + + vmovdqa xmm1,XMMWORD[rsi] + vmovdqa xmm0,XMMWORD[rdi] + +$L$polyval_horner_loop: + vpxor xmm0,xmm0,XMMWORD[r10*1+rdx] + call GFMUL + + add r10,16 + cmp rcx,r10 + jne NEAR $L$polyval_horner_loop + + + vmovdqa XMMWORD[rdi],xmm0 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_aesgcmsiv_polyval_horner: +global aes128gcmsiv_aes_ks + +ALIGN 16 +aes128gcmsiv_aes_ks: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aes128gcmsiv_aes_ks: + mov rdi,rcx + mov rsi,rdx + + + + vmovdqu xmm1,XMMWORD[rdi] + vmovdqa XMMWORD[rsi],xmm1 + + vmovdqa xmm0,XMMWORD[con1] + vmovdqa xmm15,XMMWORD[mask] + + mov rax,8 + +$L$ks128_loop: + add rsi,16 + sub rax,1 + vpshufb xmm2,xmm1,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpslldq xmm3,xmm1,4 + vpxor xmm1,xmm1,xmm3 + vpslldq xmm3,xmm3,4 + vpxor xmm1,xmm1,xmm3 + vpslldq xmm3,xmm3,4 + vpxor xmm1,xmm1,xmm3 + vpxor xmm1,xmm1,xmm2 + vmovdqa XMMWORD[rsi],xmm1 + jne NEAR $L$ks128_loop + + vmovdqa xmm0,XMMWORD[con2] + vpshufb xmm2,xmm1,xmm15 + 
vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpslldq xmm3,xmm1,4 + vpxor xmm1,xmm1,xmm3 + vpslldq xmm3,xmm3,4 + vpxor xmm1,xmm1,xmm3 + vpslldq xmm3,xmm3,4 + vpxor xmm1,xmm1,xmm3 + vpxor xmm1,xmm1,xmm2 + vmovdqa XMMWORD[16+rsi],xmm1 + + vpshufb xmm2,xmm1,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslldq xmm3,xmm1,4 + vpxor xmm1,xmm1,xmm3 + vpslldq xmm3,xmm3,4 + vpxor xmm1,xmm1,xmm3 + vpslldq xmm3,xmm3,4 + vpxor xmm1,xmm1,xmm3 + vpxor xmm1,xmm1,xmm2 + vmovdqa XMMWORD[32+rsi],xmm1 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_aes128gcmsiv_aes_ks: +global aes256gcmsiv_aes_ks + +ALIGN 16 +aes256gcmsiv_aes_ks: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aes256gcmsiv_aes_ks: + mov rdi,rcx + mov rsi,rdx + + + + vmovdqu xmm1,XMMWORD[rdi] + vmovdqu xmm3,XMMWORD[16+rdi] + vmovdqa XMMWORD[rsi],xmm1 + vmovdqa XMMWORD[16+rsi],xmm3 + vmovdqa xmm0,XMMWORD[con1] + vmovdqa xmm15,XMMWORD[mask] + vpxor xmm14,xmm14,xmm14 + mov rax,6 + +$L$ks256_loop: + add rsi,32 + sub rax,1 + vpshufb xmm2,xmm3,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpsllq xmm4,xmm1,32 + vpxor xmm1,xmm1,xmm4 + vpshufb xmm4,xmm1,XMMWORD[con3] + vpxor xmm1,xmm1,xmm4 + vpxor xmm1,xmm1,xmm2 + vmovdqa XMMWORD[rsi],xmm1 + vpshufd xmm2,xmm1,0xff + vaesenclast xmm2,xmm2,xmm14 + vpsllq xmm4,xmm3,32 + vpxor xmm3,xmm3,xmm4 + vpshufb xmm4,xmm3,XMMWORD[con3] + vpxor xmm3,xmm3,xmm4 + vpxor xmm3,xmm3,xmm2 + vmovdqa XMMWORD[16+rsi],xmm3 + jne NEAR $L$ks256_loop + + vpshufb xmm2,xmm3,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpsllq xmm4,xmm1,32 + vpxor xmm1,xmm1,xmm4 + vpshufb xmm4,xmm1,XMMWORD[con3] + vpxor xmm1,xmm1,xmm4 + vpxor xmm1,xmm1,xmm2 + vmovdqa XMMWORD[32+rsi],xmm1 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +global aes128gcmsiv_aes_ks_enc_x1 + +ALIGN 16 +aes128gcmsiv_aes_ks_enc_x1: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aes128gcmsiv_aes_ks_enc_x1: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + vmovdqa xmm1,XMMWORD[rcx] + vmovdqa xmm4,XMMWORD[rdi] + + vmovdqa XMMWORD[rdx],xmm1 + vpxor xmm4,xmm4,xmm1 + + vmovdqa xmm0,XMMWORD[con1] + vmovdqa xmm15,XMMWORD[mask] + + vpshufb xmm2,xmm1,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpsllq xmm3,xmm1,32 + vpxor xmm1,xmm1,xmm3 + vpshufb xmm3,xmm1,XMMWORD[con3] + vpxor xmm1,xmm1,xmm3 + vpxor xmm1,xmm1,xmm2 + + vaesenc xmm4,xmm4,xmm1 + vmovdqa XMMWORD[16+rdx],xmm1 + + vpshufb xmm2,xmm1,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpsllq xmm3,xmm1,32 + vpxor xmm1,xmm1,xmm3 + vpshufb xmm3,xmm1,XMMWORD[con3] + vpxor xmm1,xmm1,xmm3 + vpxor xmm1,xmm1,xmm2 + + vaesenc xmm4,xmm4,xmm1 + vmovdqa XMMWORD[32+rdx],xmm1 + + vpshufb xmm2,xmm1,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpsllq xmm3,xmm1,32 + vpxor xmm1,xmm1,xmm3 + vpshufb xmm3,xmm1,XMMWORD[con3] + vpxor xmm1,xmm1,xmm3 + vpxor xmm1,xmm1,xmm2 + + vaesenc xmm4,xmm4,xmm1 + vmovdqa XMMWORD[48+rdx],xmm1 + + vpshufb xmm2,xmm1,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpsllq xmm3,xmm1,32 + vpxor xmm1,xmm1,xmm3 + vpshufb xmm3,xmm1,XMMWORD[con3] + vpxor xmm1,xmm1,xmm3 + vpxor xmm1,xmm1,xmm2 + + vaesenc xmm4,xmm4,xmm1 + vmovdqa XMMWORD[64+rdx],xmm1 + + vpshufb xmm2,xmm1,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpsllq xmm3,xmm1,32 + vpxor xmm1,xmm1,xmm3 + vpshufb xmm3,xmm1,XMMWORD[con3] + vpxor xmm1,xmm1,xmm3 + vpxor xmm1,xmm1,xmm2 + + vaesenc xmm4,xmm4,xmm1 + vmovdqa 
XMMWORD[80+rdx],xmm1 + + vpshufb xmm2,xmm1,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpsllq xmm3,xmm1,32 + vpxor xmm1,xmm1,xmm3 + vpshufb xmm3,xmm1,XMMWORD[con3] + vpxor xmm1,xmm1,xmm3 + vpxor xmm1,xmm1,xmm2 + + vaesenc xmm4,xmm4,xmm1 + vmovdqa XMMWORD[96+rdx],xmm1 + + vpshufb xmm2,xmm1,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpsllq xmm3,xmm1,32 + vpxor xmm1,xmm1,xmm3 + vpshufb xmm3,xmm1,XMMWORD[con3] + vpxor xmm1,xmm1,xmm3 + vpxor xmm1,xmm1,xmm2 + + vaesenc xmm4,xmm4,xmm1 + vmovdqa XMMWORD[112+rdx],xmm1 + + vpshufb xmm2,xmm1,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpsllq xmm3,xmm1,32 + vpxor xmm1,xmm1,xmm3 + vpshufb xmm3,xmm1,XMMWORD[con3] + vpxor xmm1,xmm1,xmm3 + vpxor xmm1,xmm1,xmm2 + + vaesenc xmm4,xmm4,xmm1 + vmovdqa XMMWORD[128+rdx],xmm1 + + + vmovdqa xmm0,XMMWORD[con2] + + vpshufb xmm2,xmm1,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpsllq xmm3,xmm1,32 + vpxor xmm1,xmm1,xmm3 + vpshufb xmm3,xmm1,XMMWORD[con3] + vpxor xmm1,xmm1,xmm3 + vpxor xmm1,xmm1,xmm2 + + vaesenc xmm4,xmm4,xmm1 + vmovdqa XMMWORD[144+rdx],xmm1 + + vpshufb xmm2,xmm1,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpsllq xmm3,xmm1,32 + vpxor xmm1,xmm1,xmm3 + vpshufb xmm3,xmm1,XMMWORD[con3] + vpxor xmm1,xmm1,xmm3 + vpxor xmm1,xmm1,xmm2 + + vaesenclast xmm4,xmm4,xmm1 + vmovdqa XMMWORD[160+rdx],xmm1 + + + vmovdqa XMMWORD[rsi],xmm4 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_aes128gcmsiv_aes_ks_enc_x1: +global aes128gcmsiv_kdf + +ALIGN 16 +aes128gcmsiv_kdf: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aes128gcmsiv_kdf: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + + + + + vmovdqa xmm1,XMMWORD[rdx] + vmovdqa xmm9,XMMWORD[rdi] + vmovdqa xmm12,XMMWORD[and_mask] + vmovdqa xmm13,XMMWORD[one] + vpshufd xmm9,xmm9,0x90 + vpand xmm9,xmm9,xmm12 + vpaddd xmm10,xmm9,xmm13 + vpaddd xmm11,xmm10,xmm13 + vpaddd xmm12,xmm11,xmm13 + + vpxor xmm9,xmm9,xmm1 + vpxor xmm10,xmm10,xmm1 + vpxor xmm11,xmm11,xmm1 + vpxor xmm12,xmm12,xmm1 + + vmovdqa xmm1,XMMWORD[16+rdx] + vaesenc xmm9,xmm9,xmm1 + vaesenc xmm10,xmm10,xmm1 + vaesenc xmm11,xmm11,xmm1 + vaesenc xmm12,xmm12,xmm1 + + vmovdqa xmm2,XMMWORD[32+rdx] + vaesenc xmm9,xmm9,xmm2 + vaesenc xmm10,xmm10,xmm2 + vaesenc xmm11,xmm11,xmm2 + vaesenc xmm12,xmm12,xmm2 + + vmovdqa xmm1,XMMWORD[48+rdx] + vaesenc xmm9,xmm9,xmm1 + vaesenc xmm10,xmm10,xmm1 + vaesenc xmm11,xmm11,xmm1 + vaesenc xmm12,xmm12,xmm1 + + vmovdqa xmm2,XMMWORD[64+rdx] + vaesenc xmm9,xmm9,xmm2 + vaesenc xmm10,xmm10,xmm2 + vaesenc xmm11,xmm11,xmm2 + vaesenc xmm12,xmm12,xmm2 + + vmovdqa xmm1,XMMWORD[80+rdx] + vaesenc xmm9,xmm9,xmm1 + vaesenc xmm10,xmm10,xmm1 + vaesenc xmm11,xmm11,xmm1 + vaesenc xmm12,xmm12,xmm1 + + vmovdqa xmm2,XMMWORD[96+rdx] + vaesenc xmm9,xmm9,xmm2 + vaesenc xmm10,xmm10,xmm2 + vaesenc xmm11,xmm11,xmm2 + vaesenc xmm12,xmm12,xmm2 + + vmovdqa xmm1,XMMWORD[112+rdx] + vaesenc xmm9,xmm9,xmm1 + vaesenc xmm10,xmm10,xmm1 + vaesenc xmm11,xmm11,xmm1 + vaesenc xmm12,xmm12,xmm1 + + vmovdqa xmm2,XMMWORD[128+rdx] + vaesenc xmm9,xmm9,xmm2 + vaesenc xmm10,xmm10,xmm2 + vaesenc xmm11,xmm11,xmm2 + vaesenc xmm12,xmm12,xmm2 + + vmovdqa xmm1,XMMWORD[144+rdx] + vaesenc xmm9,xmm9,xmm1 + vaesenc xmm10,xmm10,xmm1 + vaesenc xmm11,xmm11,xmm1 + vaesenc xmm12,xmm12,xmm1 + + vmovdqa xmm2,XMMWORD[160+rdx] + vaesenclast xmm9,xmm9,xmm2 + vaesenclast xmm10,xmm10,xmm2 + vaesenclast xmm11,xmm11,xmm2 + vaesenclast xmm12,xmm12,xmm2 + + + vmovdqa XMMWORD[rsi],xmm9 + vmovdqa XMMWORD[16+rsi],xmm10 + 
vmovdqa XMMWORD[32+rsi],xmm11 + vmovdqa XMMWORD[48+rsi],xmm12 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_aes128gcmsiv_kdf: +global aes128gcmsiv_enc_msg_x4 + +ALIGN 16 +aes128gcmsiv_enc_msg_x4: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aes128gcmsiv_enc_msg_x4: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + + test r8,r8 + jnz NEAR $L$128_enc_msg_x4_start + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$128_enc_msg_x4_start: + push r12 + + push r13 + + + shr r8,4 + mov r10,r8 + shl r10,62 + shr r10,62 + + + vmovdqa xmm15,XMMWORD[rdx] + vpor xmm15,xmm15,XMMWORD[OR_MASK] + + vmovdqu xmm4,XMMWORD[four] + vmovdqa xmm0,xmm15 + vpaddd xmm1,xmm15,XMMWORD[one] + vpaddd xmm2,xmm15,XMMWORD[two] + vpaddd xmm3,xmm15,XMMWORD[three] + + shr r8,2 + je NEAR $L$128_enc_msg_x4_check_remainder + + sub rsi,64 + sub rdi,64 + +$L$128_enc_msg_x4_loop1: + add rsi,64 + add rdi,64 + + vmovdqa xmm5,xmm0 + vmovdqa xmm6,xmm1 + vmovdqa xmm7,xmm2 + vmovdqa xmm8,xmm3 + + vpxor xmm5,xmm5,XMMWORD[rcx] + vpxor xmm6,xmm6,XMMWORD[rcx] + vpxor xmm7,xmm7,XMMWORD[rcx] + vpxor xmm8,xmm8,XMMWORD[rcx] + + vmovdqu xmm12,XMMWORD[16+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vpaddd xmm0,xmm0,xmm4 + vmovdqu xmm12,XMMWORD[32+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vpaddd xmm1,xmm1,xmm4 + vmovdqu xmm12,XMMWORD[48+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vpaddd xmm2,xmm2,xmm4 + vmovdqu xmm12,XMMWORD[64+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vpaddd xmm3,xmm3,xmm4 + + vmovdqu xmm12,XMMWORD[80+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vmovdqu xmm12,XMMWORD[96+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vmovdqu xmm12,XMMWORD[112+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vmovdqu xmm12,XMMWORD[128+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vmovdqu xmm12,XMMWORD[144+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vmovdqu xmm12,XMMWORD[160+rcx] + vaesenclast xmm5,xmm5,xmm12 + vaesenclast xmm6,xmm6,xmm12 + vaesenclast xmm7,xmm7,xmm12 + vaesenclast xmm8,xmm8,xmm12 + + + + vpxor xmm5,xmm5,XMMWORD[rdi] + vpxor xmm6,xmm6,XMMWORD[16+rdi] + vpxor xmm7,xmm7,XMMWORD[32+rdi] + vpxor xmm8,xmm8,XMMWORD[48+rdi] + + sub r8,1 + + vmovdqu XMMWORD[rsi],xmm5 + vmovdqu XMMWORD[16+rsi],xmm6 + vmovdqu XMMWORD[32+rsi],xmm7 + vmovdqu XMMWORD[48+rsi],xmm8 + + jne NEAR $L$128_enc_msg_x4_loop1 + + add rsi,64 + add rdi,64 + +$L$128_enc_msg_x4_check_remainder: + cmp r10,0 + je NEAR $L$128_enc_msg_x4_out + +$L$128_enc_msg_x4_loop2: + + + vmovdqa xmm5,xmm0 + vpaddd xmm0,xmm0,XMMWORD[one] + + vpxor xmm5,xmm5,XMMWORD[rcx] + vaesenc xmm5,xmm5,XMMWORD[16+rcx] + vaesenc xmm5,xmm5,XMMWORD[32+rcx] + vaesenc xmm5,xmm5,XMMWORD[48+rcx] + vaesenc xmm5,xmm5,XMMWORD[64+rcx] + vaesenc xmm5,xmm5,XMMWORD[80+rcx] + vaesenc xmm5,xmm5,XMMWORD[96+rcx] + vaesenc xmm5,xmm5,XMMWORD[112+rcx] + 
vaesenc xmm5,xmm5,XMMWORD[128+rcx] + vaesenc xmm5,xmm5,XMMWORD[144+rcx] + vaesenclast xmm5,xmm5,XMMWORD[160+rcx] + + + vpxor xmm5,xmm5,XMMWORD[rdi] + vmovdqu XMMWORD[rsi],xmm5 + + add rdi,16 + add rsi,16 + + sub r10,1 + jne NEAR $L$128_enc_msg_x4_loop2 + +$L$128_enc_msg_x4_out: + pop r13 + + pop r12 + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_aes128gcmsiv_enc_msg_x4: +global aes128gcmsiv_enc_msg_x8 + +ALIGN 16 +aes128gcmsiv_enc_msg_x8: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aes128gcmsiv_enc_msg_x8: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + + test r8,r8 + jnz NEAR $L$128_enc_msg_x8_start + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$128_enc_msg_x8_start: + push r12 + + push r13 + + push rbp + + mov rbp,rsp + + + + sub rsp,128 + and rsp,-64 + + shr r8,4 + mov r10,r8 + shl r10,61 + shr r10,61 + + + vmovdqu xmm1,XMMWORD[rdx] + vpor xmm1,xmm1,XMMWORD[OR_MASK] + + + vpaddd xmm0,xmm1,XMMWORD[seven] + vmovdqu XMMWORD[rsp],xmm0 + vpaddd xmm9,xmm1,XMMWORD[one] + vpaddd xmm10,xmm1,XMMWORD[two] + vpaddd xmm11,xmm1,XMMWORD[three] + vpaddd xmm12,xmm1,XMMWORD[four] + vpaddd xmm13,xmm1,XMMWORD[five] + vpaddd xmm14,xmm1,XMMWORD[six] + vmovdqa xmm0,xmm1 + + shr r8,3 + je NEAR $L$128_enc_msg_x8_check_remainder + + sub rsi,128 + sub rdi,128 + +$L$128_enc_msg_x8_loop1: + add rsi,128 + add rdi,128 + + vmovdqa xmm1,xmm0 + vmovdqa xmm2,xmm9 + vmovdqa xmm3,xmm10 + vmovdqa xmm4,xmm11 + vmovdqa xmm5,xmm12 + vmovdqa xmm6,xmm13 + vmovdqa xmm7,xmm14 + + vmovdqu xmm8,XMMWORD[rsp] + + vpxor xmm1,xmm1,XMMWORD[rcx] + vpxor xmm2,xmm2,XMMWORD[rcx] + vpxor xmm3,xmm3,XMMWORD[rcx] + vpxor xmm4,xmm4,XMMWORD[rcx] + vpxor xmm5,xmm5,XMMWORD[rcx] + vpxor xmm6,xmm6,XMMWORD[rcx] + vpxor xmm7,xmm7,XMMWORD[rcx] + vpxor xmm8,xmm8,XMMWORD[rcx] + + vmovdqu xmm15,XMMWORD[16+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vmovdqu xmm14,XMMWORD[rsp] + vpaddd xmm14,xmm14,XMMWORD[eight] + vmovdqu XMMWORD[rsp],xmm14 + vmovdqu xmm15,XMMWORD[32+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vpsubd xmm14,xmm14,XMMWORD[one] + vmovdqu xmm15,XMMWORD[48+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vpaddd xmm0,xmm0,XMMWORD[eight] + vmovdqu xmm15,XMMWORD[64+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vpaddd xmm9,xmm9,XMMWORD[eight] + vmovdqu xmm15,XMMWORD[80+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vpaddd xmm10,xmm10,XMMWORD[eight] + vmovdqu xmm15,XMMWORD[96+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + 
vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vpaddd xmm11,xmm11,XMMWORD[eight] + vmovdqu xmm15,XMMWORD[112+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vpaddd xmm12,xmm12,XMMWORD[eight] + vmovdqu xmm15,XMMWORD[128+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vpaddd xmm13,xmm13,XMMWORD[eight] + vmovdqu xmm15,XMMWORD[144+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vmovdqu xmm15,XMMWORD[160+rcx] + vaesenclast xmm1,xmm1,xmm15 + vaesenclast xmm2,xmm2,xmm15 + vaesenclast xmm3,xmm3,xmm15 + vaesenclast xmm4,xmm4,xmm15 + vaesenclast xmm5,xmm5,xmm15 + vaesenclast xmm6,xmm6,xmm15 + vaesenclast xmm7,xmm7,xmm15 + vaesenclast xmm8,xmm8,xmm15 + + + + vpxor xmm1,xmm1,XMMWORD[rdi] + vpxor xmm2,xmm2,XMMWORD[16+rdi] + vpxor xmm3,xmm3,XMMWORD[32+rdi] + vpxor xmm4,xmm4,XMMWORD[48+rdi] + vpxor xmm5,xmm5,XMMWORD[64+rdi] + vpxor xmm6,xmm6,XMMWORD[80+rdi] + vpxor xmm7,xmm7,XMMWORD[96+rdi] + vpxor xmm8,xmm8,XMMWORD[112+rdi] + + dec r8 + + vmovdqu XMMWORD[rsi],xmm1 + vmovdqu XMMWORD[16+rsi],xmm2 + vmovdqu XMMWORD[32+rsi],xmm3 + vmovdqu XMMWORD[48+rsi],xmm4 + vmovdqu XMMWORD[64+rsi],xmm5 + vmovdqu XMMWORD[80+rsi],xmm6 + vmovdqu XMMWORD[96+rsi],xmm7 + vmovdqu XMMWORD[112+rsi],xmm8 + + jne NEAR $L$128_enc_msg_x8_loop1 + + add rsi,128 + add rdi,128 + +$L$128_enc_msg_x8_check_remainder: + cmp r10,0 + je NEAR $L$128_enc_msg_x8_out + +$L$128_enc_msg_x8_loop2: + + + vmovdqa xmm1,xmm0 + vpaddd xmm0,xmm0,XMMWORD[one] + + vpxor xmm1,xmm1,XMMWORD[rcx] + vaesenc xmm1,xmm1,XMMWORD[16+rcx] + vaesenc xmm1,xmm1,XMMWORD[32+rcx] + vaesenc xmm1,xmm1,XMMWORD[48+rcx] + vaesenc xmm1,xmm1,XMMWORD[64+rcx] + vaesenc xmm1,xmm1,XMMWORD[80+rcx] + vaesenc xmm1,xmm1,XMMWORD[96+rcx] + vaesenc xmm1,xmm1,XMMWORD[112+rcx] + vaesenc xmm1,xmm1,XMMWORD[128+rcx] + vaesenc xmm1,xmm1,XMMWORD[144+rcx] + vaesenclast xmm1,xmm1,XMMWORD[160+rcx] + + + vpxor xmm1,xmm1,XMMWORD[rdi] + + vmovdqu XMMWORD[rsi],xmm1 + + add rdi,16 + add rsi,16 + + dec r10 + jne NEAR $L$128_enc_msg_x8_loop2 + +$L$128_enc_msg_x8_out: + mov rsp,rbp + + pop rbp + + pop r13 + + pop r12 + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_aes128gcmsiv_enc_msg_x8: +global aes128gcmsiv_dec + +ALIGN 16 +aes128gcmsiv_dec: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aes128gcmsiv_dec: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + + test r9,~15 + jnz NEAR $L$128_dec_start + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$128_dec_start: + vzeroupper + vmovdqa xmm0,XMMWORD[rdx] + mov rax,rdx + + lea rax,[32+rax] + lea rcx,[32+rcx] + + + vmovdqu xmm15,XMMWORD[r9*1+rdi] + vpor xmm15,xmm15,XMMWORD[OR_MASK] + and r9,~15 + + + cmp r9,96 + jb NEAR $L$128_dec_loop2 + + + sub r9,96 + vmovdqa xmm7,xmm15 + vpaddd xmm8,xmm7,XMMWORD[one] + vpaddd xmm9,xmm7,XMMWORD[two] + vpaddd xmm10,xmm9,XMMWORD[one] + vpaddd xmm11,xmm9,XMMWORD[two] + vpaddd xmm12,xmm11,XMMWORD[one] + vpaddd xmm15,xmm11,XMMWORD[two] 
+ + vpxor xmm7,xmm7,XMMWORD[r8] + vpxor xmm8,xmm8,XMMWORD[r8] + vpxor xmm9,xmm9,XMMWORD[r8] + vpxor xmm10,xmm10,XMMWORD[r8] + vpxor xmm11,xmm11,XMMWORD[r8] + vpxor xmm12,xmm12,XMMWORD[r8] + + vmovdqu xmm4,XMMWORD[16+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[32+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[48+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[64+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[80+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[96+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[112+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[128+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[144+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[160+r8] + vaesenclast xmm7,xmm7,xmm4 + vaesenclast xmm8,xmm8,xmm4 + vaesenclast xmm9,xmm9,xmm4 + vaesenclast xmm10,xmm10,xmm4 + vaesenclast xmm11,xmm11,xmm4 + vaesenclast xmm12,xmm12,xmm4 + + + vpxor xmm7,xmm7,XMMWORD[rdi] + vpxor xmm8,xmm8,XMMWORD[16+rdi] + vpxor xmm9,xmm9,XMMWORD[32+rdi] + vpxor xmm10,xmm10,XMMWORD[48+rdi] + vpxor xmm11,xmm11,XMMWORD[64+rdi] + vpxor xmm12,xmm12,XMMWORD[80+rdi] + + vmovdqu XMMWORD[rsi],xmm7 + vmovdqu XMMWORD[16+rsi],xmm8 + vmovdqu XMMWORD[32+rsi],xmm9 + vmovdqu XMMWORD[48+rsi],xmm10 + vmovdqu XMMWORD[64+rsi],xmm11 + vmovdqu XMMWORD[80+rsi],xmm12 + + add rdi,96 + add rsi,96 + jmp NEAR $L$128_dec_loop1 + + +ALIGN 64 +$L$128_dec_loop1: + cmp r9,96 + jb NEAR $L$128_dec_finish_96 + sub r9,96 + + vmovdqa xmm6,xmm12 + vmovdqa XMMWORD[(16-32)+rax],xmm11 + vmovdqa XMMWORD[(32-32)+rax],xmm10 + vmovdqa XMMWORD[(48-32)+rax],xmm9 + vmovdqa XMMWORD[(64-32)+rax],xmm8 + vmovdqa XMMWORD[(80-32)+rax],xmm7 + + vmovdqa xmm7,xmm15 + vpaddd xmm8,xmm7,XMMWORD[one] + vpaddd xmm9,xmm7,XMMWORD[two] + vpaddd xmm10,xmm9,XMMWORD[one] + vpaddd xmm11,xmm9,XMMWORD[two] + vpaddd xmm12,xmm11,XMMWORD[one] + vpaddd xmm15,xmm11,XMMWORD[two] + + vmovdqa xmm4,XMMWORD[r8] + vpxor xmm7,xmm7,xmm4 + vpxor xmm8,xmm8,xmm4 + vpxor xmm9,xmm9,xmm4 + vpxor xmm10,xmm10,xmm4 + vpxor xmm11,xmm11,xmm4 + vpxor xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[((0-32))+rcx] + vpclmulqdq xmm2,xmm6,xmm4,0x11 + vpclmulqdq xmm3,xmm6,xmm4,0x00 + vpclmulqdq xmm1,xmm6,xmm4,0x01 + vpclmulqdq xmm4,xmm6,xmm4,0x10 + vpxor xmm1,xmm1,xmm4 + + vmovdqu xmm4,XMMWORD[16+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc 
xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm6,XMMWORD[((-16))+rax] + vmovdqu xmm13,XMMWORD[((-16))+rcx] + + vpclmulqdq xmm4,xmm6,xmm13,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x01 + vpxor xmm1,xmm1,xmm4 + + + vmovdqu xmm4,XMMWORD[32+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm6,XMMWORD[rax] + vmovdqu xmm13,XMMWORD[rcx] + + vpclmulqdq xmm4,xmm6,xmm13,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x01 + vpxor xmm1,xmm1,xmm4 + + + vmovdqu xmm4,XMMWORD[48+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm6,XMMWORD[16+rax] + vmovdqu xmm13,XMMWORD[16+rcx] + + vpclmulqdq xmm4,xmm6,xmm13,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x01 + vpxor xmm1,xmm1,xmm4 + + + vmovdqu xmm4,XMMWORD[64+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm6,XMMWORD[32+rax] + vmovdqu xmm13,XMMWORD[32+rcx] + + vpclmulqdq xmm4,xmm6,xmm13,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x01 + vpxor xmm1,xmm1,xmm4 + + + vmovdqu xmm4,XMMWORD[80+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[96+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[112+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + + vmovdqa xmm6,XMMWORD[((80-32))+rax] + vpxor xmm6,xmm6,xmm0 + vmovdqu xmm5,XMMWORD[((80-32))+rcx] + + vpclmulqdq xmm4,xmm6,xmm5,0x01 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm5,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm5,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm5,0x10 + vpxor xmm1,xmm1,xmm4 + + vmovdqu xmm4,XMMWORD[128+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + + vpsrldq xmm4,xmm1,8 + vpxor xmm5,xmm2,xmm4 + vpslldq xmm4,xmm1,8 + vpxor xmm0,xmm3,xmm4 + + vmovdqa xmm3,XMMWORD[poly] + + vmovdqu xmm4,XMMWORD[144+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm6,XMMWORD[160+r8] + vpalignr xmm2,xmm0,xmm0,8 + vpclmulqdq xmm0,xmm0,xmm3,0x10 + vpxor xmm0,xmm2,xmm0 + + vpxor xmm4,xmm6,XMMWORD[rdi] + vaesenclast xmm7,xmm7,xmm4 + vpxor xmm4,xmm6,XMMWORD[16+rdi] + vaesenclast xmm8,xmm8,xmm4 + vpxor xmm4,xmm6,XMMWORD[32+rdi] + vaesenclast xmm9,xmm9,xmm4 + vpxor 
xmm4,xmm6,XMMWORD[48+rdi] + vaesenclast xmm10,xmm10,xmm4 + vpxor xmm4,xmm6,XMMWORD[64+rdi] + vaesenclast xmm11,xmm11,xmm4 + vpxor xmm4,xmm6,XMMWORD[80+rdi] + vaesenclast xmm12,xmm12,xmm4 + + vpalignr xmm2,xmm0,xmm0,8 + vpclmulqdq xmm0,xmm0,xmm3,0x10 + vpxor xmm0,xmm2,xmm0 + + vmovdqu XMMWORD[rsi],xmm7 + vmovdqu XMMWORD[16+rsi],xmm8 + vmovdqu XMMWORD[32+rsi],xmm9 + vmovdqu XMMWORD[48+rsi],xmm10 + vmovdqu XMMWORD[64+rsi],xmm11 + vmovdqu XMMWORD[80+rsi],xmm12 + + vpxor xmm0,xmm0,xmm5 + + lea rdi,[96+rdi] + lea rsi,[96+rsi] + jmp NEAR $L$128_dec_loop1 + +$L$128_dec_finish_96: + vmovdqa xmm6,xmm12 + vmovdqa XMMWORD[(16-32)+rax],xmm11 + vmovdqa XMMWORD[(32-32)+rax],xmm10 + vmovdqa XMMWORD[(48-32)+rax],xmm9 + vmovdqa XMMWORD[(64-32)+rax],xmm8 + vmovdqa XMMWORD[(80-32)+rax],xmm7 + + vmovdqu xmm4,XMMWORD[((0-32))+rcx] + vpclmulqdq xmm1,xmm6,xmm4,0x10 + vpclmulqdq xmm2,xmm6,xmm4,0x11 + vpclmulqdq xmm3,xmm6,xmm4,0x00 + vpclmulqdq xmm4,xmm6,xmm4,0x01 + vpxor xmm1,xmm1,xmm4 + + vmovdqu xmm6,XMMWORD[((-16))+rax] + vmovdqu xmm13,XMMWORD[((-16))+rcx] + + vpclmulqdq xmm4,xmm6,xmm13,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x01 + vpxor xmm1,xmm1,xmm4 + + vmovdqu xmm6,XMMWORD[rax] + vmovdqu xmm13,XMMWORD[rcx] + + vpclmulqdq xmm4,xmm6,xmm13,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x01 + vpxor xmm1,xmm1,xmm4 + + vmovdqu xmm6,XMMWORD[16+rax] + vmovdqu xmm13,XMMWORD[16+rcx] + + vpclmulqdq xmm4,xmm6,xmm13,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x01 + vpxor xmm1,xmm1,xmm4 + + vmovdqu xmm6,XMMWORD[32+rax] + vmovdqu xmm13,XMMWORD[32+rcx] + + vpclmulqdq xmm4,xmm6,xmm13,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x01 + vpxor xmm1,xmm1,xmm4 + + + vmovdqu xmm6,XMMWORD[((80-32))+rax] + vpxor xmm6,xmm6,xmm0 + vmovdqu xmm5,XMMWORD[((80-32))+rcx] + vpclmulqdq xmm4,xmm6,xmm5,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm5,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm5,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm5,0x01 + vpxor xmm1,xmm1,xmm4 + + vpsrldq xmm4,xmm1,8 + vpxor xmm5,xmm2,xmm4 + vpslldq xmm4,xmm1,8 + vpxor xmm0,xmm3,xmm4 + + vmovdqa xmm3,XMMWORD[poly] + + vpalignr xmm2,xmm0,xmm0,8 + vpclmulqdq xmm0,xmm0,xmm3,0x10 + vpxor xmm0,xmm2,xmm0 + + vpalignr xmm2,xmm0,xmm0,8 + vpclmulqdq xmm0,xmm0,xmm3,0x10 + vpxor xmm0,xmm2,xmm0 + + vpxor xmm0,xmm0,xmm5 + +$L$128_dec_loop2: + + + + cmp r9,16 + jb NEAR $L$128_dec_out + sub r9,16 + + vmovdqa xmm2,xmm15 + vpaddd xmm15,xmm15,XMMWORD[one] + + vpxor xmm2,xmm2,XMMWORD[r8] + vaesenc xmm2,xmm2,XMMWORD[16+r8] + vaesenc xmm2,xmm2,XMMWORD[32+r8] + vaesenc xmm2,xmm2,XMMWORD[48+r8] + vaesenc xmm2,xmm2,XMMWORD[64+r8] + vaesenc xmm2,xmm2,XMMWORD[80+r8] + vaesenc xmm2,xmm2,XMMWORD[96+r8] + vaesenc xmm2,xmm2,XMMWORD[112+r8] + vaesenc xmm2,xmm2,XMMWORD[128+r8] + vaesenc xmm2,xmm2,XMMWORD[144+r8] + vaesenclast xmm2,xmm2,XMMWORD[160+r8] + vpxor xmm2,xmm2,XMMWORD[rdi] + vmovdqu XMMWORD[rsi],xmm2 + add rdi,16 + add rsi,16 + + vpxor xmm0,xmm0,xmm2 + vmovdqa xmm1,XMMWORD[((-32))+rcx] + call GFMUL + + jmp NEAR $L$128_dec_loop2 + +$L$128_dec_out: + vmovdqu 
XMMWORD[rdx],xmm0 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_aes128gcmsiv_dec: +global aes128gcmsiv_ecb_enc_block + +ALIGN 16 +aes128gcmsiv_ecb_enc_block: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aes128gcmsiv_ecb_enc_block: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + vmovdqa xmm1,XMMWORD[rdi] + + vpxor xmm1,xmm1,XMMWORD[rdx] + vaesenc xmm1,xmm1,XMMWORD[16+rdx] + vaesenc xmm1,xmm1,XMMWORD[32+rdx] + vaesenc xmm1,xmm1,XMMWORD[48+rdx] + vaesenc xmm1,xmm1,XMMWORD[64+rdx] + vaesenc xmm1,xmm1,XMMWORD[80+rdx] + vaesenc xmm1,xmm1,XMMWORD[96+rdx] + vaesenc xmm1,xmm1,XMMWORD[112+rdx] + vaesenc xmm1,xmm1,XMMWORD[128+rdx] + vaesenc xmm1,xmm1,XMMWORD[144+rdx] + vaesenclast xmm1,xmm1,XMMWORD[160+rdx] + + vmovdqa XMMWORD[rsi],xmm1 + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_aes128gcmsiv_ecb_enc_block: +global aes256gcmsiv_aes_ks_enc_x1 + +ALIGN 16 +aes256gcmsiv_aes_ks_enc_x1: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aes256gcmsiv_aes_ks_enc_x1: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + + vmovdqa xmm0,XMMWORD[con1] + vmovdqa xmm15,XMMWORD[mask] + vmovdqa xmm8,XMMWORD[rdi] + vmovdqa xmm1,XMMWORD[rcx] + vmovdqa xmm3,XMMWORD[16+rcx] + vpxor xmm8,xmm8,xmm1 + vaesenc xmm8,xmm8,xmm3 + vmovdqu XMMWORD[rdx],xmm1 + vmovdqu XMMWORD[16+rdx],xmm3 + vpxor xmm14,xmm14,xmm14 + + vpshufb xmm2,xmm3,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpslldq xmm4,xmm1,4 + vpxor xmm1,xmm1,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm1,xmm1,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm1,xmm1,xmm4 + vpxor xmm1,xmm1,xmm2 + vaesenc xmm8,xmm8,xmm1 + vmovdqu XMMWORD[32+rdx],xmm1 + + vpshufd xmm2,xmm1,0xff + vaesenclast xmm2,xmm2,xmm14 + vpslldq xmm4,xmm3,4 + vpxor xmm3,xmm3,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm3,xmm3,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm3,xmm3,xmm4 + vpxor xmm3,xmm3,xmm2 + vaesenc xmm8,xmm8,xmm3 + vmovdqu XMMWORD[48+rdx],xmm3 + + vpshufb xmm2,xmm3,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpslldq xmm4,xmm1,4 + vpxor xmm1,xmm1,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm1,xmm1,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm1,xmm1,xmm4 + vpxor xmm1,xmm1,xmm2 + vaesenc xmm8,xmm8,xmm1 + vmovdqu XMMWORD[64+rdx],xmm1 + + vpshufd xmm2,xmm1,0xff + vaesenclast xmm2,xmm2,xmm14 + vpslldq xmm4,xmm3,4 + vpxor xmm3,xmm3,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm3,xmm3,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm3,xmm3,xmm4 + vpxor xmm3,xmm3,xmm2 + vaesenc xmm8,xmm8,xmm3 + vmovdqu XMMWORD[80+rdx],xmm3 + + vpshufb xmm2,xmm3,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpslldq xmm4,xmm1,4 + vpxor xmm1,xmm1,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm1,xmm1,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm1,xmm1,xmm4 + vpxor xmm1,xmm1,xmm2 + vaesenc xmm8,xmm8,xmm1 + vmovdqu XMMWORD[96+rdx],xmm1 + + vpshufd xmm2,xmm1,0xff + vaesenclast xmm2,xmm2,xmm14 + vpslldq xmm4,xmm3,4 + vpxor xmm3,xmm3,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm3,xmm3,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm3,xmm3,xmm4 + vpxor xmm3,xmm3,xmm2 + vaesenc xmm8,xmm8,xmm3 + vmovdqu XMMWORD[112+rdx],xmm3 + + vpshufb xmm2,xmm3,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpslldq xmm4,xmm1,4 + vpxor xmm1,xmm1,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm1,xmm1,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm1,xmm1,xmm4 + vpxor xmm1,xmm1,xmm2 + vaesenc xmm8,xmm8,xmm1 + vmovdqu XMMWORD[128+rdx],xmm1 + + vpshufd xmm2,xmm1,0xff + vaesenclast xmm2,xmm2,xmm14 + vpslldq 
xmm4,xmm3,4 + vpxor xmm3,xmm3,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm3,xmm3,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm3,xmm3,xmm4 + vpxor xmm3,xmm3,xmm2 + vaesenc xmm8,xmm8,xmm3 + vmovdqu XMMWORD[144+rdx],xmm3 + + vpshufb xmm2,xmm3,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpslldq xmm4,xmm1,4 + vpxor xmm1,xmm1,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm1,xmm1,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm1,xmm1,xmm4 + vpxor xmm1,xmm1,xmm2 + vaesenc xmm8,xmm8,xmm1 + vmovdqu XMMWORD[160+rdx],xmm1 + + vpshufd xmm2,xmm1,0xff + vaesenclast xmm2,xmm2,xmm14 + vpslldq xmm4,xmm3,4 + vpxor xmm3,xmm3,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm3,xmm3,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm3,xmm3,xmm4 + vpxor xmm3,xmm3,xmm2 + vaesenc xmm8,xmm8,xmm3 + vmovdqu XMMWORD[176+rdx],xmm3 + + vpshufb xmm2,xmm3,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpslldq xmm4,xmm1,4 + vpxor xmm1,xmm1,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm1,xmm1,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm1,xmm1,xmm4 + vpxor xmm1,xmm1,xmm2 + vaesenc xmm8,xmm8,xmm1 + vmovdqu XMMWORD[192+rdx],xmm1 + + vpshufd xmm2,xmm1,0xff + vaesenclast xmm2,xmm2,xmm14 + vpslldq xmm4,xmm3,4 + vpxor xmm3,xmm3,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm3,xmm3,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm3,xmm3,xmm4 + vpxor xmm3,xmm3,xmm2 + vaesenc xmm8,xmm8,xmm3 + vmovdqu XMMWORD[208+rdx],xmm3 + + vpshufb xmm2,xmm3,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslldq xmm4,xmm1,4 + vpxor xmm1,xmm1,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm1,xmm1,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm1,xmm1,xmm4 + vpxor xmm1,xmm1,xmm2 + vaesenclast xmm8,xmm8,xmm1 + vmovdqu XMMWORD[224+rdx],xmm1 + + vmovdqa XMMWORD[rsi],xmm8 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_aes256gcmsiv_aes_ks_enc_x1: +global aes256gcmsiv_ecb_enc_block + +ALIGN 16 +aes256gcmsiv_ecb_enc_block: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aes256gcmsiv_ecb_enc_block: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + vmovdqa xmm1,XMMWORD[rdi] + vpxor xmm1,xmm1,XMMWORD[rdx] + vaesenc xmm1,xmm1,XMMWORD[16+rdx] + vaesenc xmm1,xmm1,XMMWORD[32+rdx] + vaesenc xmm1,xmm1,XMMWORD[48+rdx] + vaesenc xmm1,xmm1,XMMWORD[64+rdx] + vaesenc xmm1,xmm1,XMMWORD[80+rdx] + vaesenc xmm1,xmm1,XMMWORD[96+rdx] + vaesenc xmm1,xmm1,XMMWORD[112+rdx] + vaesenc xmm1,xmm1,XMMWORD[128+rdx] + vaesenc xmm1,xmm1,XMMWORD[144+rdx] + vaesenc xmm1,xmm1,XMMWORD[160+rdx] + vaesenc xmm1,xmm1,XMMWORD[176+rdx] + vaesenc xmm1,xmm1,XMMWORD[192+rdx] + vaesenc xmm1,xmm1,XMMWORD[208+rdx] + vaesenclast xmm1,xmm1,XMMWORD[224+rdx] + vmovdqa XMMWORD[rsi],xmm1 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_aes256gcmsiv_ecb_enc_block: +global aes256gcmsiv_enc_msg_x4 + +ALIGN 16 +aes256gcmsiv_enc_msg_x4: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aes256gcmsiv_enc_msg_x4: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + + test r8,r8 + jnz NEAR $L$256_enc_msg_x4_start + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$256_enc_msg_x4_start: + mov r10,r8 + shr r8,4 + shl r10,60 + jz NEAR $L$256_enc_msg_x4_start2 + add r8,1 + +$L$256_enc_msg_x4_start2: + mov r10,r8 + shl r10,62 + shr r10,62 + + + vmovdqa xmm15,XMMWORD[rdx] + vpor xmm15,xmm15,XMMWORD[OR_MASK] + + vmovdqa xmm4,XMMWORD[four] + vmovdqa xmm0,xmm15 + vpaddd xmm1,xmm15,XMMWORD[one] + vpaddd xmm2,xmm15,XMMWORD[two] + vpaddd xmm3,xmm15,XMMWORD[three] + + shr 
r8,2 + je NEAR $L$256_enc_msg_x4_check_remainder + + sub rsi,64 + sub rdi,64 + +$L$256_enc_msg_x4_loop1: + add rsi,64 + add rdi,64 + + vmovdqa xmm5,xmm0 + vmovdqa xmm6,xmm1 + vmovdqa xmm7,xmm2 + vmovdqa xmm8,xmm3 + + vpxor xmm5,xmm5,XMMWORD[rcx] + vpxor xmm6,xmm6,XMMWORD[rcx] + vpxor xmm7,xmm7,XMMWORD[rcx] + vpxor xmm8,xmm8,XMMWORD[rcx] + + vmovdqu xmm12,XMMWORD[16+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vpaddd xmm0,xmm0,xmm4 + vmovdqu xmm12,XMMWORD[32+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vpaddd xmm1,xmm1,xmm4 + vmovdqu xmm12,XMMWORD[48+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vpaddd xmm2,xmm2,xmm4 + vmovdqu xmm12,XMMWORD[64+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vpaddd xmm3,xmm3,xmm4 + + vmovdqu xmm12,XMMWORD[80+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vmovdqu xmm12,XMMWORD[96+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vmovdqu xmm12,XMMWORD[112+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vmovdqu xmm12,XMMWORD[128+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vmovdqu xmm12,XMMWORD[144+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vmovdqu xmm12,XMMWORD[160+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vmovdqu xmm12,XMMWORD[176+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vmovdqu xmm12,XMMWORD[192+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vmovdqu xmm12,XMMWORD[208+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vmovdqu xmm12,XMMWORD[224+rcx] + vaesenclast xmm5,xmm5,xmm12 + vaesenclast xmm6,xmm6,xmm12 + vaesenclast xmm7,xmm7,xmm12 + vaesenclast xmm8,xmm8,xmm12 + + + + vpxor xmm5,xmm5,XMMWORD[rdi] + vpxor xmm6,xmm6,XMMWORD[16+rdi] + vpxor xmm7,xmm7,XMMWORD[32+rdi] + vpxor xmm8,xmm8,XMMWORD[48+rdi] + + sub r8,1 + + vmovdqu XMMWORD[rsi],xmm5 + vmovdqu XMMWORD[16+rsi],xmm6 + vmovdqu XMMWORD[32+rsi],xmm7 + vmovdqu XMMWORD[48+rsi],xmm8 + + jne NEAR $L$256_enc_msg_x4_loop1 + + add rsi,64 + add rdi,64 + +$L$256_enc_msg_x4_check_remainder: + cmp r10,0 + je NEAR $L$256_enc_msg_x4_out + +$L$256_enc_msg_x4_loop2: + + + + vmovdqa xmm5,xmm0 + vpaddd xmm0,xmm0,XMMWORD[one] + vpxor xmm5,xmm5,XMMWORD[rcx] + vaesenc xmm5,xmm5,XMMWORD[16+rcx] + vaesenc xmm5,xmm5,XMMWORD[32+rcx] + vaesenc xmm5,xmm5,XMMWORD[48+rcx] + vaesenc xmm5,xmm5,XMMWORD[64+rcx] + vaesenc xmm5,xmm5,XMMWORD[80+rcx] + vaesenc xmm5,xmm5,XMMWORD[96+rcx] + vaesenc xmm5,xmm5,XMMWORD[112+rcx] + vaesenc xmm5,xmm5,XMMWORD[128+rcx] + vaesenc xmm5,xmm5,XMMWORD[144+rcx] + vaesenc xmm5,xmm5,XMMWORD[160+rcx] + vaesenc xmm5,xmm5,XMMWORD[176+rcx] + vaesenc xmm5,xmm5,XMMWORD[192+rcx] + vaesenc xmm5,xmm5,XMMWORD[208+rcx] + vaesenclast xmm5,xmm5,XMMWORD[224+rcx] + + + vpxor xmm5,xmm5,XMMWORD[rdi] + + vmovdqu XMMWORD[rsi],xmm5 + + add rdi,16 + add rsi,16 + + sub r10,1 
+ jne NEAR $L$256_enc_msg_x4_loop2 + +$L$256_enc_msg_x4_out: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_aes256gcmsiv_enc_msg_x4: +global aes256gcmsiv_enc_msg_x8 + +ALIGN 16 +aes256gcmsiv_enc_msg_x8: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aes256gcmsiv_enc_msg_x8: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + + test r8,r8 + jnz NEAR $L$256_enc_msg_x8_start + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$256_enc_msg_x8_start: + + mov r11,rsp + sub r11,16 + and r11,-64 + + mov r10,r8 + shr r8,4 + shl r10,60 + jz NEAR $L$256_enc_msg_x8_start2 + add r8,1 + +$L$256_enc_msg_x8_start2: + mov r10,r8 + shl r10,61 + shr r10,61 + + + vmovdqa xmm1,XMMWORD[rdx] + vpor xmm1,xmm1,XMMWORD[OR_MASK] + + + vpaddd xmm0,xmm1,XMMWORD[seven] + vmovdqa XMMWORD[r11],xmm0 + vpaddd xmm9,xmm1,XMMWORD[one] + vpaddd xmm10,xmm1,XMMWORD[two] + vpaddd xmm11,xmm1,XMMWORD[three] + vpaddd xmm12,xmm1,XMMWORD[four] + vpaddd xmm13,xmm1,XMMWORD[five] + vpaddd xmm14,xmm1,XMMWORD[six] + vmovdqa xmm0,xmm1 + + shr r8,3 + jz NEAR $L$256_enc_msg_x8_check_remainder + + sub rsi,128 + sub rdi,128 + +$L$256_enc_msg_x8_loop1: + add rsi,128 + add rdi,128 + + vmovdqa xmm1,xmm0 + vmovdqa xmm2,xmm9 + vmovdqa xmm3,xmm10 + vmovdqa xmm4,xmm11 + vmovdqa xmm5,xmm12 + vmovdqa xmm6,xmm13 + vmovdqa xmm7,xmm14 + + vmovdqa xmm8,XMMWORD[r11] + + vpxor xmm1,xmm1,XMMWORD[rcx] + vpxor xmm2,xmm2,XMMWORD[rcx] + vpxor xmm3,xmm3,XMMWORD[rcx] + vpxor xmm4,xmm4,XMMWORD[rcx] + vpxor xmm5,xmm5,XMMWORD[rcx] + vpxor xmm6,xmm6,XMMWORD[rcx] + vpxor xmm7,xmm7,XMMWORD[rcx] + vpxor xmm8,xmm8,XMMWORD[rcx] + + vmovdqu xmm15,XMMWORD[16+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vmovdqa xmm14,XMMWORD[r11] + vpaddd xmm14,xmm14,XMMWORD[eight] + vmovdqa XMMWORD[r11],xmm14 + vmovdqu xmm15,XMMWORD[32+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vpsubd xmm14,xmm14,XMMWORD[one] + vmovdqu xmm15,XMMWORD[48+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vpaddd xmm0,xmm0,XMMWORD[eight] + vmovdqu xmm15,XMMWORD[64+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vpaddd xmm9,xmm9,XMMWORD[eight] + vmovdqu xmm15,XMMWORD[80+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vpaddd xmm10,xmm10,XMMWORD[eight] + vmovdqu xmm15,XMMWORD[96+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vpaddd xmm11,xmm11,XMMWORD[eight] + vmovdqu xmm15,XMMWORD[112+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc 
xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vpaddd xmm12,xmm12,XMMWORD[eight] + vmovdqu xmm15,XMMWORD[128+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vpaddd xmm13,xmm13,XMMWORD[eight] + vmovdqu xmm15,XMMWORD[144+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vmovdqu xmm15,XMMWORD[160+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vmovdqu xmm15,XMMWORD[176+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vmovdqu xmm15,XMMWORD[192+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vmovdqu xmm15,XMMWORD[208+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vmovdqu xmm15,XMMWORD[224+rcx] + vaesenclast xmm1,xmm1,xmm15 + vaesenclast xmm2,xmm2,xmm15 + vaesenclast xmm3,xmm3,xmm15 + vaesenclast xmm4,xmm4,xmm15 + vaesenclast xmm5,xmm5,xmm15 + vaesenclast xmm6,xmm6,xmm15 + vaesenclast xmm7,xmm7,xmm15 + vaesenclast xmm8,xmm8,xmm15 + + + + vpxor xmm1,xmm1,XMMWORD[rdi] + vpxor xmm2,xmm2,XMMWORD[16+rdi] + vpxor xmm3,xmm3,XMMWORD[32+rdi] + vpxor xmm4,xmm4,XMMWORD[48+rdi] + vpxor xmm5,xmm5,XMMWORD[64+rdi] + vpxor xmm6,xmm6,XMMWORD[80+rdi] + vpxor xmm7,xmm7,XMMWORD[96+rdi] + vpxor xmm8,xmm8,XMMWORD[112+rdi] + + sub r8,1 + + vmovdqu XMMWORD[rsi],xmm1 + vmovdqu XMMWORD[16+rsi],xmm2 + vmovdqu XMMWORD[32+rsi],xmm3 + vmovdqu XMMWORD[48+rsi],xmm4 + vmovdqu XMMWORD[64+rsi],xmm5 + vmovdqu XMMWORD[80+rsi],xmm6 + vmovdqu XMMWORD[96+rsi],xmm7 + vmovdqu XMMWORD[112+rsi],xmm8 + + jne NEAR $L$256_enc_msg_x8_loop1 + + add rsi,128 + add rdi,128 + +$L$256_enc_msg_x8_check_remainder: + cmp r10,0 + je NEAR $L$256_enc_msg_x8_out + +$L$256_enc_msg_x8_loop2: + + + vmovdqa xmm1,xmm0 + vpaddd xmm0,xmm0,XMMWORD[one] + + vpxor xmm1,xmm1,XMMWORD[rcx] + vaesenc xmm1,xmm1,XMMWORD[16+rcx] + vaesenc xmm1,xmm1,XMMWORD[32+rcx] + vaesenc xmm1,xmm1,XMMWORD[48+rcx] + vaesenc xmm1,xmm1,XMMWORD[64+rcx] + vaesenc xmm1,xmm1,XMMWORD[80+rcx] + vaesenc xmm1,xmm1,XMMWORD[96+rcx] + vaesenc xmm1,xmm1,XMMWORD[112+rcx] + vaesenc xmm1,xmm1,XMMWORD[128+rcx] + vaesenc xmm1,xmm1,XMMWORD[144+rcx] + vaesenc xmm1,xmm1,XMMWORD[160+rcx] + vaesenc xmm1,xmm1,XMMWORD[176+rcx] + vaesenc xmm1,xmm1,XMMWORD[192+rcx] + vaesenc xmm1,xmm1,XMMWORD[208+rcx] + vaesenclast xmm1,xmm1,XMMWORD[224+rcx] + + + vpxor xmm1,xmm1,XMMWORD[rdi] + + vmovdqu XMMWORD[rsi],xmm1 + + add rdi,16 + add rsi,16 + sub r10,1 + jnz NEAR $L$256_enc_msg_x8_loop2 + +$L$256_enc_msg_x8_out: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + + 
+$L$SEH_end_aes256gcmsiv_enc_msg_x8: +global aes256gcmsiv_dec + +ALIGN 16 +aes256gcmsiv_dec: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aes256gcmsiv_dec: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + + test r9,~15 + jnz NEAR $L$256_dec_start + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$256_dec_start: + vzeroupper + vmovdqa xmm0,XMMWORD[rdx] + mov rax,rdx + + lea rax,[32+rax] + lea rcx,[32+rcx] + + + vmovdqu xmm15,XMMWORD[r9*1+rdi] + vpor xmm15,xmm15,XMMWORD[OR_MASK] + and r9,~15 + + + cmp r9,96 + jb NEAR $L$256_dec_loop2 + + + sub r9,96 + vmovdqa xmm7,xmm15 + vpaddd xmm8,xmm7,XMMWORD[one] + vpaddd xmm9,xmm7,XMMWORD[two] + vpaddd xmm10,xmm9,XMMWORD[one] + vpaddd xmm11,xmm9,XMMWORD[two] + vpaddd xmm12,xmm11,XMMWORD[one] + vpaddd xmm15,xmm11,XMMWORD[two] + + vpxor xmm7,xmm7,XMMWORD[r8] + vpxor xmm8,xmm8,XMMWORD[r8] + vpxor xmm9,xmm9,XMMWORD[r8] + vpxor xmm10,xmm10,XMMWORD[r8] + vpxor xmm11,xmm11,XMMWORD[r8] + vpxor xmm12,xmm12,XMMWORD[r8] + + vmovdqu xmm4,XMMWORD[16+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[32+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[48+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[64+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[80+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[96+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[112+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[128+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[144+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[160+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[176+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[192+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[208+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc 
xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[224+r8] + vaesenclast xmm7,xmm7,xmm4 + vaesenclast xmm8,xmm8,xmm4 + vaesenclast xmm9,xmm9,xmm4 + vaesenclast xmm10,xmm10,xmm4 + vaesenclast xmm11,xmm11,xmm4 + vaesenclast xmm12,xmm12,xmm4 + + + vpxor xmm7,xmm7,XMMWORD[rdi] + vpxor xmm8,xmm8,XMMWORD[16+rdi] + vpxor xmm9,xmm9,XMMWORD[32+rdi] + vpxor xmm10,xmm10,XMMWORD[48+rdi] + vpxor xmm11,xmm11,XMMWORD[64+rdi] + vpxor xmm12,xmm12,XMMWORD[80+rdi] + + vmovdqu XMMWORD[rsi],xmm7 + vmovdqu XMMWORD[16+rsi],xmm8 + vmovdqu XMMWORD[32+rsi],xmm9 + vmovdqu XMMWORD[48+rsi],xmm10 + vmovdqu XMMWORD[64+rsi],xmm11 + vmovdqu XMMWORD[80+rsi],xmm12 + + add rdi,96 + add rsi,96 + jmp NEAR $L$256_dec_loop1 + + +ALIGN 64 +$L$256_dec_loop1: + cmp r9,96 + jb NEAR $L$256_dec_finish_96 + sub r9,96 + + vmovdqa xmm6,xmm12 + vmovdqa XMMWORD[(16-32)+rax],xmm11 + vmovdqa XMMWORD[(32-32)+rax],xmm10 + vmovdqa XMMWORD[(48-32)+rax],xmm9 + vmovdqa XMMWORD[(64-32)+rax],xmm8 + vmovdqa XMMWORD[(80-32)+rax],xmm7 + + vmovdqa xmm7,xmm15 + vpaddd xmm8,xmm7,XMMWORD[one] + vpaddd xmm9,xmm7,XMMWORD[two] + vpaddd xmm10,xmm9,XMMWORD[one] + vpaddd xmm11,xmm9,XMMWORD[two] + vpaddd xmm12,xmm11,XMMWORD[one] + vpaddd xmm15,xmm11,XMMWORD[two] + + vmovdqa xmm4,XMMWORD[r8] + vpxor xmm7,xmm7,xmm4 + vpxor xmm8,xmm8,xmm4 + vpxor xmm9,xmm9,xmm4 + vpxor xmm10,xmm10,xmm4 + vpxor xmm11,xmm11,xmm4 + vpxor xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[((0-32))+rcx] + vpclmulqdq xmm2,xmm6,xmm4,0x11 + vpclmulqdq xmm3,xmm6,xmm4,0x00 + vpclmulqdq xmm1,xmm6,xmm4,0x01 + vpclmulqdq xmm4,xmm6,xmm4,0x10 + vpxor xmm1,xmm1,xmm4 + + vmovdqu xmm4,XMMWORD[16+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm6,XMMWORD[((-16))+rax] + vmovdqu xmm13,XMMWORD[((-16))+rcx] + + vpclmulqdq xmm4,xmm6,xmm13,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x01 + vpxor xmm1,xmm1,xmm4 + + + vmovdqu xmm4,XMMWORD[32+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm6,XMMWORD[rax] + vmovdqu xmm13,XMMWORD[rcx] + + vpclmulqdq xmm4,xmm6,xmm13,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x01 + vpxor xmm1,xmm1,xmm4 + + + vmovdqu xmm4,XMMWORD[48+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm6,XMMWORD[16+rax] + vmovdqu xmm13,XMMWORD[16+rcx] + + vpclmulqdq xmm4,xmm6,xmm13,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x01 + vpxor xmm1,xmm1,xmm4 + + + vmovdqu xmm4,XMMWORD[64+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm6,XMMWORD[32+rax] + vmovdqu xmm13,XMMWORD[32+rcx] + + vpclmulqdq xmm4,xmm6,xmm13,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x01 + vpxor xmm1,xmm1,xmm4 + + + vmovdqu xmm4,XMMWORD[80+r8] + vaesenc 
xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[96+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[112+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + + vmovdqa xmm6,XMMWORD[((80-32))+rax] + vpxor xmm6,xmm6,xmm0 + vmovdqu xmm5,XMMWORD[((80-32))+rcx] + + vpclmulqdq xmm4,xmm6,xmm5,0x01 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm5,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm5,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm5,0x10 + vpxor xmm1,xmm1,xmm4 + + vmovdqu xmm4,XMMWORD[128+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + + vpsrldq xmm4,xmm1,8 + vpxor xmm5,xmm2,xmm4 + vpslldq xmm4,xmm1,8 + vpxor xmm0,xmm3,xmm4 + + vmovdqa xmm3,XMMWORD[poly] + + vmovdqu xmm4,XMMWORD[144+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[160+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[176+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[192+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[208+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm6,XMMWORD[224+r8] + vpalignr xmm2,xmm0,xmm0,8 + vpclmulqdq xmm0,xmm0,xmm3,0x10 + vpxor xmm0,xmm2,xmm0 + + vpxor xmm4,xmm6,XMMWORD[rdi] + vaesenclast xmm7,xmm7,xmm4 + vpxor xmm4,xmm6,XMMWORD[16+rdi] + vaesenclast xmm8,xmm8,xmm4 + vpxor xmm4,xmm6,XMMWORD[32+rdi] + vaesenclast xmm9,xmm9,xmm4 + vpxor xmm4,xmm6,XMMWORD[48+rdi] + vaesenclast xmm10,xmm10,xmm4 + vpxor xmm4,xmm6,XMMWORD[64+rdi] + vaesenclast xmm11,xmm11,xmm4 + vpxor xmm4,xmm6,XMMWORD[80+rdi] + vaesenclast xmm12,xmm12,xmm4 + + vpalignr xmm2,xmm0,xmm0,8 + vpclmulqdq xmm0,xmm0,xmm3,0x10 + vpxor xmm0,xmm2,xmm0 + + vmovdqu XMMWORD[rsi],xmm7 + vmovdqu XMMWORD[16+rsi],xmm8 + vmovdqu XMMWORD[32+rsi],xmm9 + vmovdqu XMMWORD[48+rsi],xmm10 + vmovdqu XMMWORD[64+rsi],xmm11 + vmovdqu XMMWORD[80+rsi],xmm12 + + vpxor xmm0,xmm0,xmm5 + + lea rdi,[96+rdi] + lea rsi,[96+rsi] + jmp NEAR $L$256_dec_loop1 + +$L$256_dec_finish_96: + vmovdqa xmm6,xmm12 + vmovdqa XMMWORD[(16-32)+rax],xmm11 + vmovdqa XMMWORD[(32-32)+rax],xmm10 + vmovdqa XMMWORD[(48-32)+rax],xmm9 + vmovdqa XMMWORD[(64-32)+rax],xmm8 + vmovdqa XMMWORD[(80-32)+rax],xmm7 + + vmovdqu xmm4,XMMWORD[((0-32))+rcx] + vpclmulqdq xmm1,xmm6,xmm4,0x10 + vpclmulqdq xmm2,xmm6,xmm4,0x11 + vpclmulqdq xmm3,xmm6,xmm4,0x00 + vpclmulqdq xmm4,xmm6,xmm4,0x01 + vpxor xmm1,xmm1,xmm4 + + vmovdqu xmm6,XMMWORD[((-16))+rax] + vmovdqu xmm13,XMMWORD[((-16))+rcx] + + vpclmulqdq xmm4,xmm6,xmm13,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq 
xmm4,xmm6,xmm13,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x01 + vpxor xmm1,xmm1,xmm4 + + vmovdqu xmm6,XMMWORD[rax] + vmovdqu xmm13,XMMWORD[rcx] + + vpclmulqdq xmm4,xmm6,xmm13,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x01 + vpxor xmm1,xmm1,xmm4 + + vmovdqu xmm6,XMMWORD[16+rax] + vmovdqu xmm13,XMMWORD[16+rcx] + + vpclmulqdq xmm4,xmm6,xmm13,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x01 + vpxor xmm1,xmm1,xmm4 + + vmovdqu xmm6,XMMWORD[32+rax] + vmovdqu xmm13,XMMWORD[32+rcx] + + vpclmulqdq xmm4,xmm6,xmm13,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x01 + vpxor xmm1,xmm1,xmm4 + + + vmovdqu xmm6,XMMWORD[((80-32))+rax] + vpxor xmm6,xmm6,xmm0 + vmovdqu xmm5,XMMWORD[((80-32))+rcx] + vpclmulqdq xmm4,xmm6,xmm5,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm5,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm5,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm5,0x01 + vpxor xmm1,xmm1,xmm4 + + vpsrldq xmm4,xmm1,8 + vpxor xmm5,xmm2,xmm4 + vpslldq xmm4,xmm1,8 + vpxor xmm0,xmm3,xmm4 + + vmovdqa xmm3,XMMWORD[poly] + + vpalignr xmm2,xmm0,xmm0,8 + vpclmulqdq xmm0,xmm0,xmm3,0x10 + vpxor xmm0,xmm2,xmm0 + + vpalignr xmm2,xmm0,xmm0,8 + vpclmulqdq xmm0,xmm0,xmm3,0x10 + vpxor xmm0,xmm2,xmm0 + + vpxor xmm0,xmm0,xmm5 + +$L$256_dec_loop2: + + + + cmp r9,16 + jb NEAR $L$256_dec_out + sub r9,16 + + vmovdqa xmm2,xmm15 + vpaddd xmm15,xmm15,XMMWORD[one] + + vpxor xmm2,xmm2,XMMWORD[r8] + vaesenc xmm2,xmm2,XMMWORD[16+r8] + vaesenc xmm2,xmm2,XMMWORD[32+r8] + vaesenc xmm2,xmm2,XMMWORD[48+r8] + vaesenc xmm2,xmm2,XMMWORD[64+r8] + vaesenc xmm2,xmm2,XMMWORD[80+r8] + vaesenc xmm2,xmm2,XMMWORD[96+r8] + vaesenc xmm2,xmm2,XMMWORD[112+r8] + vaesenc xmm2,xmm2,XMMWORD[128+r8] + vaesenc xmm2,xmm2,XMMWORD[144+r8] + vaesenc xmm2,xmm2,XMMWORD[160+r8] + vaesenc xmm2,xmm2,XMMWORD[176+r8] + vaesenc xmm2,xmm2,XMMWORD[192+r8] + vaesenc xmm2,xmm2,XMMWORD[208+r8] + vaesenclast xmm2,xmm2,XMMWORD[224+r8] + vpxor xmm2,xmm2,XMMWORD[rdi] + vmovdqu XMMWORD[rsi],xmm2 + add rdi,16 + add rsi,16 + + vpxor xmm0,xmm0,xmm2 + vmovdqa xmm1,XMMWORD[((-32))+rcx] + call GFMUL + + jmp NEAR $L$256_dec_loop2 + +$L$256_dec_out: + vmovdqu XMMWORD[rdx],xmm0 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_aes256gcmsiv_dec: +global aes256gcmsiv_kdf + +ALIGN 16 +aes256gcmsiv_kdf: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aes256gcmsiv_kdf: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + + + + + + vmovdqa xmm1,XMMWORD[rdx] + vmovdqa xmm4,XMMWORD[rdi] + vmovdqa xmm11,XMMWORD[and_mask] + vmovdqa xmm8,XMMWORD[one] + vpshufd xmm4,xmm4,0x90 + vpand xmm4,xmm4,xmm11 + vpaddd xmm6,xmm4,xmm8 + vpaddd xmm7,xmm6,xmm8 + vpaddd xmm11,xmm7,xmm8 + vpaddd xmm12,xmm11,xmm8 + vpaddd xmm13,xmm12,xmm8 + + vpxor xmm4,xmm4,xmm1 + vpxor xmm6,xmm6,xmm1 + vpxor xmm7,xmm7,xmm1 + vpxor xmm11,xmm11,xmm1 + vpxor xmm12,xmm12,xmm1 + vpxor xmm13,xmm13,xmm1 + + vmovdqa xmm1,XMMWORD[16+rdx] + vaesenc xmm4,xmm4,xmm1 + vaesenc xmm6,xmm6,xmm1 + vaesenc xmm7,xmm7,xmm1 + vaesenc xmm11,xmm11,xmm1 + vaesenc xmm12,xmm12,xmm1 + vaesenc xmm13,xmm13,xmm1 + 
+ vmovdqa xmm2,XMMWORD[32+rdx] + vaesenc xmm4,xmm4,xmm2 + vaesenc xmm6,xmm6,xmm2 + vaesenc xmm7,xmm7,xmm2 + vaesenc xmm11,xmm11,xmm2 + vaesenc xmm12,xmm12,xmm2 + vaesenc xmm13,xmm13,xmm2 + + vmovdqa xmm1,XMMWORD[48+rdx] + vaesenc xmm4,xmm4,xmm1 + vaesenc xmm6,xmm6,xmm1 + vaesenc xmm7,xmm7,xmm1 + vaesenc xmm11,xmm11,xmm1 + vaesenc xmm12,xmm12,xmm1 + vaesenc xmm13,xmm13,xmm1 + + vmovdqa xmm2,XMMWORD[64+rdx] + vaesenc xmm4,xmm4,xmm2 + vaesenc xmm6,xmm6,xmm2 + vaesenc xmm7,xmm7,xmm2 + vaesenc xmm11,xmm11,xmm2 + vaesenc xmm12,xmm12,xmm2 + vaesenc xmm13,xmm13,xmm2 + + vmovdqa xmm1,XMMWORD[80+rdx] + vaesenc xmm4,xmm4,xmm1 + vaesenc xmm6,xmm6,xmm1 + vaesenc xmm7,xmm7,xmm1 + vaesenc xmm11,xmm11,xmm1 + vaesenc xmm12,xmm12,xmm1 + vaesenc xmm13,xmm13,xmm1 + + vmovdqa xmm2,XMMWORD[96+rdx] + vaesenc xmm4,xmm4,xmm2 + vaesenc xmm6,xmm6,xmm2 + vaesenc xmm7,xmm7,xmm2 + vaesenc xmm11,xmm11,xmm2 + vaesenc xmm12,xmm12,xmm2 + vaesenc xmm13,xmm13,xmm2 + + vmovdqa xmm1,XMMWORD[112+rdx] + vaesenc xmm4,xmm4,xmm1 + vaesenc xmm6,xmm6,xmm1 + vaesenc xmm7,xmm7,xmm1 + vaesenc xmm11,xmm11,xmm1 + vaesenc xmm12,xmm12,xmm1 + vaesenc xmm13,xmm13,xmm1 + + vmovdqa xmm2,XMMWORD[128+rdx] + vaesenc xmm4,xmm4,xmm2 + vaesenc xmm6,xmm6,xmm2 + vaesenc xmm7,xmm7,xmm2 + vaesenc xmm11,xmm11,xmm2 + vaesenc xmm12,xmm12,xmm2 + vaesenc xmm13,xmm13,xmm2 + + vmovdqa xmm1,XMMWORD[144+rdx] + vaesenc xmm4,xmm4,xmm1 + vaesenc xmm6,xmm6,xmm1 + vaesenc xmm7,xmm7,xmm1 + vaesenc xmm11,xmm11,xmm1 + vaesenc xmm12,xmm12,xmm1 + vaesenc xmm13,xmm13,xmm1 + + vmovdqa xmm2,XMMWORD[160+rdx] + vaesenc xmm4,xmm4,xmm2 + vaesenc xmm6,xmm6,xmm2 + vaesenc xmm7,xmm7,xmm2 + vaesenc xmm11,xmm11,xmm2 + vaesenc xmm12,xmm12,xmm2 + vaesenc xmm13,xmm13,xmm2 + + vmovdqa xmm1,XMMWORD[176+rdx] + vaesenc xmm4,xmm4,xmm1 + vaesenc xmm6,xmm6,xmm1 + vaesenc xmm7,xmm7,xmm1 + vaesenc xmm11,xmm11,xmm1 + vaesenc xmm12,xmm12,xmm1 + vaesenc xmm13,xmm13,xmm1 + + vmovdqa xmm2,XMMWORD[192+rdx] + vaesenc xmm4,xmm4,xmm2 + vaesenc xmm6,xmm6,xmm2 + vaesenc xmm7,xmm7,xmm2 + vaesenc xmm11,xmm11,xmm2 + vaesenc xmm12,xmm12,xmm2 + vaesenc xmm13,xmm13,xmm2 + + vmovdqa xmm1,XMMWORD[208+rdx] + vaesenc xmm4,xmm4,xmm1 + vaesenc xmm6,xmm6,xmm1 + vaesenc xmm7,xmm7,xmm1 + vaesenc xmm11,xmm11,xmm1 + vaesenc xmm12,xmm12,xmm1 + vaesenc xmm13,xmm13,xmm1 + + vmovdqa xmm2,XMMWORD[224+rdx] + vaesenclast xmm4,xmm4,xmm2 + vaesenclast xmm6,xmm6,xmm2 + vaesenclast xmm7,xmm7,xmm2 + vaesenclast xmm11,xmm11,xmm2 + vaesenclast xmm12,xmm12,xmm2 + vaesenclast xmm13,xmm13,xmm2 + + + vmovdqa XMMWORD[rsi],xmm4 + vmovdqa XMMWORD[16+rsi],xmm6 + vmovdqa XMMWORD[32+rsi],xmm7 + vmovdqa XMMWORD[48+rsi],xmm11 + vmovdqa XMMWORD[64+rsi],xmm12 + vmovdqa XMMWORD[80+rsi],xmm13 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_aes256gcmsiv_kdf: diff --git a/packager/third_party/boringssl/win-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.asm new file mode 100644 index 0000000000..ab8cf92b72 --- /dev/null +++ b/packager/third_party/boringssl/win-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.asm @@ -0,0 +1,8 @@ +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +global dummy_chacha20_poly1305_asm + +dummy_chacha20_poly1305_asm: + DB 0F3h,0C3h ;repret diff --git a/packager/third_party/boringssl/win-x86_64/crypto/aes/aes-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/aes-x86_64.asm similarity index 99% rename from 
packager/third_party/boringssl/win-x86_64/crypto/aes/aes-x86_64.asm rename to packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/aes-x86_64.asm index 53394f0e22..f6a4edfa0f 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/aes/aes-x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/aes-x86_64.asm @@ -344,6 +344,7 @@ $L$SEH_begin_asm_AES_encrypt: mov rdx,r8 + mov rax,rsp push rbx push rbp push r12 @@ -352,7 +353,6 @@ $L$SEH_begin_asm_AES_encrypt: push r15 - mov r10,rsp lea rcx,[((-63))+rdx] and rsp,-64 sub rcx,rsp @@ -362,7 +362,7 @@ $L$SEH_begin_asm_AES_encrypt: sub rsp,32 mov QWORD[16+rsp],rsi - mov QWORD[24+rsp],r10 + mov QWORD[24+rsp],rax $L$enc_prologue: mov r15,rdx @@ -394,13 +394,13 @@ $L$enc_prologue: mov DWORD[8+r9],ecx mov DWORD[12+r9],edx - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] $L$enc_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] @@ -800,6 +800,7 @@ $L$SEH_begin_asm_AES_decrypt: mov rdx,r8 + mov rax,rsp push rbx push rbp push r12 @@ -808,7 +809,6 @@ $L$SEH_begin_asm_AES_decrypt: push r15 - mov r10,rsp lea rcx,[((-63))+rdx] and rsp,-64 sub rcx,rsp @@ -818,7 +818,7 @@ $L$SEH_begin_asm_AES_decrypt: sub rsp,32 mov QWORD[16+rsp],rsi - mov QWORD[24+rsp],r10 + mov QWORD[24+rsp],rax $L$dec_prologue: mov r15,rdx @@ -852,13 +852,13 @@ $L$dec_prologue: mov DWORD[8+r9],ecx mov DWORD[12+r9],edx - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] $L$dec_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] @@ -1367,12 +1367,12 @@ $L$cbc_prologue: mov r9d,r9d lea r14,[$L$AES_Te] + lea r10,[$L$AES_Td] cmp r9,0 - jne NEAR $L$cbc_picked_te - lea r14,[$L$AES_Td] -$L$cbc_picked_te: + cmove r14,r10 - mov r10d,DWORD[OPENSSL_ia32cap_P] + lea r10,[OPENSSL_ia32cap_P] + mov r10d,DWORD[r10] cmp rdx,512 jb NEAR $L$cbc_slow_prologue test rdx,15 @@ -2626,7 +2626,6 @@ block_se_handler: jae NEAR $L$in_block_prologue mov rax,QWORD[24+rax] - lea rax,[48+rax] mov rbx,QWORD[((-8))+rax] mov rbp,QWORD[((-16))+rax] diff --git a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.asm new file mode 100644 index 0000000000..63bcd48dcb --- /dev/null +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.asm @@ -0,0 +1,1022 @@ +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +section .text code align=64 + + + +ALIGN 32 +_aesni_ctr32_ghash_6x: + + vmovdqu xmm2,XMMWORD[32+r11] + sub rdx,6 + vpxor xmm4,xmm4,xmm4 + vmovdqu xmm15,XMMWORD[((0-128))+rcx] + vpaddb xmm10,xmm1,xmm2 + vpaddb xmm11,xmm10,xmm2 + vpaddb xmm12,xmm11,xmm2 + vpaddb xmm13,xmm12,xmm2 + vpaddb xmm14,xmm13,xmm2 + vpxor xmm9,xmm1,xmm15 + vmovdqu XMMWORD[(16+8)+rsp],xmm4 + jmp NEAR $L$oop6x + +ALIGN 32 +$L$oop6x: + add ebx,100663296 + jc NEAR $L$handle_ctr32 + vmovdqu xmm3,XMMWORD[((0-32))+r9] + vpaddb xmm1,xmm14,xmm2 + vpxor 
xmm10,xmm10,xmm15 + vpxor xmm11,xmm11,xmm15 + +$L$resume_ctr32: + vmovdqu XMMWORD[r8],xmm1 + vpclmulqdq xmm5,xmm7,xmm3,0x10 + vpxor xmm12,xmm12,xmm15 + vmovups xmm2,XMMWORD[((16-128))+rcx] + vpclmulqdq xmm6,xmm7,xmm3,0x01 + + + + + + + + + + + + + + + + + + xor r12,r12 + cmp r15,r14 + + vaesenc xmm9,xmm9,xmm2 + vmovdqu xmm0,XMMWORD[((48+8))+rsp] + vpxor xmm13,xmm13,xmm15 + vpclmulqdq xmm1,xmm7,xmm3,0x00 + vaesenc xmm10,xmm10,xmm2 + vpxor xmm14,xmm14,xmm15 + setnc r12b + vpclmulqdq xmm7,xmm7,xmm3,0x11 + vaesenc xmm11,xmm11,xmm2 + vmovdqu xmm3,XMMWORD[((16-32))+r9] + neg r12 + vaesenc xmm12,xmm12,xmm2 + vpxor xmm6,xmm6,xmm5 + vpclmulqdq xmm5,xmm0,xmm3,0x00 + vpxor xmm8,xmm8,xmm4 + vaesenc xmm13,xmm13,xmm2 + vpxor xmm4,xmm1,xmm5 + and r12,0x60 + vmovups xmm15,XMMWORD[((32-128))+rcx] + vpclmulqdq xmm1,xmm0,xmm3,0x10 + vaesenc xmm14,xmm14,xmm2 + + vpclmulqdq xmm2,xmm0,xmm3,0x01 + lea r14,[r12*1+r14] + vaesenc xmm9,xmm9,xmm15 + vpxor xmm8,xmm8,XMMWORD[((16+8))+rsp] + vpclmulqdq xmm3,xmm0,xmm3,0x11 + vmovdqu xmm0,XMMWORD[((64+8))+rsp] + vaesenc xmm10,xmm10,xmm15 + movbe r13,QWORD[88+r14] + vaesenc xmm11,xmm11,xmm15 + movbe r12,QWORD[80+r14] + vaesenc xmm12,xmm12,xmm15 + mov QWORD[((32+8))+rsp],r13 + vaesenc xmm13,xmm13,xmm15 + mov QWORD[((40+8))+rsp],r12 + vmovdqu xmm5,XMMWORD[((48-32))+r9] + vaesenc xmm14,xmm14,xmm15 + + vmovups xmm15,XMMWORD[((48-128))+rcx] + vpxor xmm6,xmm6,xmm1 + vpclmulqdq xmm1,xmm0,xmm5,0x00 + vaesenc xmm9,xmm9,xmm15 + vpxor xmm6,xmm6,xmm2 + vpclmulqdq xmm2,xmm0,xmm5,0x10 + vaesenc xmm10,xmm10,xmm15 + vpxor xmm7,xmm7,xmm3 + vpclmulqdq xmm3,xmm0,xmm5,0x01 + vaesenc xmm11,xmm11,xmm15 + vpclmulqdq xmm5,xmm0,xmm5,0x11 + vmovdqu xmm0,XMMWORD[((80+8))+rsp] + vaesenc xmm12,xmm12,xmm15 + vaesenc xmm13,xmm13,xmm15 + vpxor xmm4,xmm4,xmm1 + vmovdqu xmm1,XMMWORD[((64-32))+r9] + vaesenc xmm14,xmm14,xmm15 + + vmovups xmm15,XMMWORD[((64-128))+rcx] + vpxor xmm6,xmm6,xmm2 + vpclmulqdq xmm2,xmm0,xmm1,0x00 + vaesenc xmm9,xmm9,xmm15 + vpxor xmm6,xmm6,xmm3 + vpclmulqdq xmm3,xmm0,xmm1,0x10 + vaesenc xmm10,xmm10,xmm15 + movbe r13,QWORD[72+r14] + vpxor xmm7,xmm7,xmm5 + vpclmulqdq xmm5,xmm0,xmm1,0x01 + vaesenc xmm11,xmm11,xmm15 + movbe r12,QWORD[64+r14] + vpclmulqdq xmm1,xmm0,xmm1,0x11 + vmovdqu xmm0,XMMWORD[((96+8))+rsp] + vaesenc xmm12,xmm12,xmm15 + mov QWORD[((48+8))+rsp],r13 + vaesenc xmm13,xmm13,xmm15 + mov QWORD[((56+8))+rsp],r12 + vpxor xmm4,xmm4,xmm2 + vmovdqu xmm2,XMMWORD[((96-32))+r9] + vaesenc xmm14,xmm14,xmm15 + + vmovups xmm15,XMMWORD[((80-128))+rcx] + vpxor xmm6,xmm6,xmm3 + vpclmulqdq xmm3,xmm0,xmm2,0x00 + vaesenc xmm9,xmm9,xmm15 + vpxor xmm6,xmm6,xmm5 + vpclmulqdq xmm5,xmm0,xmm2,0x10 + vaesenc xmm10,xmm10,xmm15 + movbe r13,QWORD[56+r14] + vpxor xmm7,xmm7,xmm1 + vpclmulqdq xmm1,xmm0,xmm2,0x01 + vpxor xmm8,xmm8,XMMWORD[((112+8))+rsp] + vaesenc xmm11,xmm11,xmm15 + movbe r12,QWORD[48+r14] + vpclmulqdq xmm2,xmm0,xmm2,0x11 + vaesenc xmm12,xmm12,xmm15 + mov QWORD[((64+8))+rsp],r13 + vaesenc xmm13,xmm13,xmm15 + mov QWORD[((72+8))+rsp],r12 + vpxor xmm4,xmm4,xmm3 + vmovdqu xmm3,XMMWORD[((112-32))+r9] + vaesenc xmm14,xmm14,xmm15 + + vmovups xmm15,XMMWORD[((96-128))+rcx] + vpxor xmm6,xmm6,xmm5 + vpclmulqdq xmm5,xmm8,xmm3,0x10 + vaesenc xmm9,xmm9,xmm15 + vpxor xmm6,xmm6,xmm1 + vpclmulqdq xmm1,xmm8,xmm3,0x01 + vaesenc xmm10,xmm10,xmm15 + movbe r13,QWORD[40+r14] + vpxor xmm7,xmm7,xmm2 + vpclmulqdq xmm2,xmm8,xmm3,0x00 + vaesenc xmm11,xmm11,xmm15 + movbe r12,QWORD[32+r14] + vpclmulqdq xmm8,xmm8,xmm3,0x11 + vaesenc xmm12,xmm12,xmm15 + mov QWORD[((80+8))+rsp],r13 + vaesenc xmm13,xmm13,xmm15 + mov 
QWORD[((88+8))+rsp],r12 + vpxor xmm6,xmm6,xmm5 + vaesenc xmm14,xmm14,xmm15 + vpxor xmm6,xmm6,xmm1 + + vmovups xmm15,XMMWORD[((112-128))+rcx] + vpslldq xmm5,xmm6,8 + vpxor xmm4,xmm4,xmm2 + vmovdqu xmm3,XMMWORD[16+r11] + + vaesenc xmm9,xmm9,xmm15 + vpxor xmm7,xmm7,xmm8 + vaesenc xmm10,xmm10,xmm15 + vpxor xmm4,xmm4,xmm5 + movbe r13,QWORD[24+r14] + vaesenc xmm11,xmm11,xmm15 + movbe r12,QWORD[16+r14] + vpalignr xmm0,xmm4,xmm4,8 + vpclmulqdq xmm4,xmm4,xmm3,0x10 + mov QWORD[((96+8))+rsp],r13 + vaesenc xmm12,xmm12,xmm15 + mov QWORD[((104+8))+rsp],r12 + vaesenc xmm13,xmm13,xmm15 + vmovups xmm1,XMMWORD[((128-128))+rcx] + vaesenc xmm14,xmm14,xmm15 + + vaesenc xmm9,xmm9,xmm1 + vmovups xmm15,XMMWORD[((144-128))+rcx] + vaesenc xmm10,xmm10,xmm1 + vpsrldq xmm6,xmm6,8 + vaesenc xmm11,xmm11,xmm1 + vpxor xmm7,xmm7,xmm6 + vaesenc xmm12,xmm12,xmm1 + vpxor xmm4,xmm4,xmm0 + movbe r13,QWORD[8+r14] + vaesenc xmm13,xmm13,xmm1 + movbe r12,QWORD[r14] + vaesenc xmm14,xmm14,xmm1 + vmovups xmm1,XMMWORD[((160-128))+rcx] + cmp ebp,11 + jb NEAR $L$enc_tail + + vaesenc xmm9,xmm9,xmm15 + vaesenc xmm10,xmm10,xmm15 + vaesenc xmm11,xmm11,xmm15 + vaesenc xmm12,xmm12,xmm15 + vaesenc xmm13,xmm13,xmm15 + vaesenc xmm14,xmm14,xmm15 + + vaesenc xmm9,xmm9,xmm1 + vaesenc xmm10,xmm10,xmm1 + vaesenc xmm11,xmm11,xmm1 + vaesenc xmm12,xmm12,xmm1 + vaesenc xmm13,xmm13,xmm1 + vmovups xmm15,XMMWORD[((176-128))+rcx] + vaesenc xmm14,xmm14,xmm1 + vmovups xmm1,XMMWORD[((192-128))+rcx] + je NEAR $L$enc_tail + + vaesenc xmm9,xmm9,xmm15 + vaesenc xmm10,xmm10,xmm15 + vaesenc xmm11,xmm11,xmm15 + vaesenc xmm12,xmm12,xmm15 + vaesenc xmm13,xmm13,xmm15 + vaesenc xmm14,xmm14,xmm15 + + vaesenc xmm9,xmm9,xmm1 + vaesenc xmm10,xmm10,xmm1 + vaesenc xmm11,xmm11,xmm1 + vaesenc xmm12,xmm12,xmm1 + vaesenc xmm13,xmm13,xmm1 + vmovups xmm15,XMMWORD[((208-128))+rcx] + vaesenc xmm14,xmm14,xmm1 + vmovups xmm1,XMMWORD[((224-128))+rcx] + jmp NEAR $L$enc_tail + +ALIGN 32 +$L$handle_ctr32: + vmovdqu xmm0,XMMWORD[r11] + vpshufb xmm6,xmm1,xmm0 + vmovdqu xmm5,XMMWORD[48+r11] + vpaddd xmm10,xmm6,XMMWORD[64+r11] + vpaddd xmm11,xmm6,xmm5 + vmovdqu xmm3,XMMWORD[((0-32))+r9] + vpaddd xmm12,xmm10,xmm5 + vpshufb xmm10,xmm10,xmm0 + vpaddd xmm13,xmm11,xmm5 + vpshufb xmm11,xmm11,xmm0 + vpxor xmm10,xmm10,xmm15 + vpaddd xmm14,xmm12,xmm5 + vpshufb xmm12,xmm12,xmm0 + vpxor xmm11,xmm11,xmm15 + vpaddd xmm1,xmm13,xmm5 + vpshufb xmm13,xmm13,xmm0 + vpshufb xmm14,xmm14,xmm0 + vpshufb xmm1,xmm1,xmm0 + jmp NEAR $L$resume_ctr32 + +ALIGN 32 +$L$enc_tail: + vaesenc xmm9,xmm9,xmm15 + vmovdqu XMMWORD[(16+8)+rsp],xmm7 + vpalignr xmm8,xmm4,xmm4,8 + vaesenc xmm10,xmm10,xmm15 + vpclmulqdq xmm4,xmm4,xmm3,0x10 + vpxor xmm2,xmm1,XMMWORD[rdi] + vaesenc xmm11,xmm11,xmm15 + vpxor xmm0,xmm1,XMMWORD[16+rdi] + vaesenc xmm12,xmm12,xmm15 + vpxor xmm5,xmm1,XMMWORD[32+rdi] + vaesenc xmm13,xmm13,xmm15 + vpxor xmm6,xmm1,XMMWORD[48+rdi] + vaesenc xmm14,xmm14,xmm15 + vpxor xmm7,xmm1,XMMWORD[64+rdi] + vpxor xmm3,xmm1,XMMWORD[80+rdi] + vmovdqu xmm1,XMMWORD[r8] + + vaesenclast xmm9,xmm9,xmm2 + vmovdqu xmm2,XMMWORD[32+r11] + vaesenclast xmm10,xmm10,xmm0 + vpaddb xmm0,xmm1,xmm2 + mov QWORD[((112+8))+rsp],r13 + lea rdi,[96+rdi] + vaesenclast xmm11,xmm11,xmm5 + vpaddb xmm5,xmm0,xmm2 + mov QWORD[((120+8))+rsp],r12 + lea rsi,[96+rsi] + vmovdqu xmm15,XMMWORD[((0-128))+rcx] + vaesenclast xmm12,xmm12,xmm6 + vpaddb xmm6,xmm5,xmm2 + vaesenclast xmm13,xmm13,xmm7 + vpaddb xmm7,xmm6,xmm2 + vaesenclast xmm14,xmm14,xmm3 + vpaddb xmm3,xmm7,xmm2 + + add r10,0x60 + sub rdx,0x6 + jc NEAR $L$6x_done + + vmovups XMMWORD[(-96)+rsi],xmm9 + vpxor 
xmm9,xmm1,xmm15 + vmovups XMMWORD[(-80)+rsi],xmm10 + vmovdqa xmm10,xmm0 + vmovups XMMWORD[(-64)+rsi],xmm11 + vmovdqa xmm11,xmm5 + vmovups XMMWORD[(-48)+rsi],xmm12 + vmovdqa xmm12,xmm6 + vmovups XMMWORD[(-32)+rsi],xmm13 + vmovdqa xmm13,xmm7 + vmovups XMMWORD[(-16)+rsi],xmm14 + vmovdqa xmm14,xmm3 + vmovdqu xmm7,XMMWORD[((32+8))+rsp] + jmp NEAR $L$oop6x + +$L$6x_done: + vpxor xmm8,xmm8,XMMWORD[((16+8))+rsp] + vpxor xmm8,xmm8,xmm4 + + DB 0F3h,0C3h ;repret + + +global aesni_gcm_decrypt + +ALIGN 32 +aesni_gcm_decrypt: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aesni_gcm_decrypt: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + + xor r10,r10 + + + + cmp rdx,0x60 + jb NEAR $L$gcm_dec_abort + + lea rax,[rsp] + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + lea rsp,[((-168))+rsp] + movaps XMMWORD[(-216)+rax],xmm6 + movaps XMMWORD[(-200)+rax],xmm7 + movaps XMMWORD[(-184)+rax],xmm8 + movaps XMMWORD[(-168)+rax],xmm9 + movaps XMMWORD[(-152)+rax],xmm10 + movaps XMMWORD[(-136)+rax],xmm11 + movaps XMMWORD[(-120)+rax],xmm12 + movaps XMMWORD[(-104)+rax],xmm13 + movaps XMMWORD[(-88)+rax],xmm14 + movaps XMMWORD[(-72)+rax],xmm15 +$L$gcm_dec_body: + vzeroupper + + vmovdqu xmm1,XMMWORD[r8] + add rsp,-128 + mov ebx,DWORD[12+r8] + lea r11,[$L$bswap_mask] + lea r14,[((-128))+rcx] + mov r15,0xf80 + vmovdqu xmm8,XMMWORD[r9] + and rsp,-128 + vmovdqu xmm0,XMMWORD[r11] + lea rcx,[128+rcx] + lea r9,[((32+32))+r9] + mov ebp,DWORD[((240-128))+rcx] + vpshufb xmm8,xmm8,xmm0 + + and r14,r15 + and r15,rsp + sub r15,r14 + jc NEAR $L$dec_no_key_aliasing + cmp r15,768 + jnc NEAR $L$dec_no_key_aliasing + sub rsp,r15 +$L$dec_no_key_aliasing: + + vmovdqu xmm7,XMMWORD[80+rdi] + lea r14,[rdi] + vmovdqu xmm4,XMMWORD[64+rdi] + + + + + + + + lea r15,[((-192))+rdx*1+rdi] + + vmovdqu xmm5,XMMWORD[48+rdi] + shr rdx,4 + xor r10,r10 + vmovdqu xmm6,XMMWORD[32+rdi] + vpshufb xmm7,xmm7,xmm0 + vmovdqu xmm2,XMMWORD[16+rdi] + vpshufb xmm4,xmm4,xmm0 + vmovdqu xmm3,XMMWORD[rdi] + vpshufb xmm5,xmm5,xmm0 + vmovdqu XMMWORD[48+rsp],xmm4 + vpshufb xmm6,xmm6,xmm0 + vmovdqu XMMWORD[64+rsp],xmm5 + vpshufb xmm2,xmm2,xmm0 + vmovdqu XMMWORD[80+rsp],xmm6 + vpshufb xmm3,xmm3,xmm0 + vmovdqu XMMWORD[96+rsp],xmm2 + vmovdqu XMMWORD[112+rsp],xmm3 + + call _aesni_ctr32_ghash_6x + + vmovups XMMWORD[(-96)+rsi],xmm9 + vmovups XMMWORD[(-80)+rsi],xmm10 + vmovups XMMWORD[(-64)+rsi],xmm11 + vmovups XMMWORD[(-48)+rsi],xmm12 + vmovups XMMWORD[(-32)+rsi],xmm13 + vmovups XMMWORD[(-16)+rsi],xmm14 + + vpshufb xmm8,xmm8,XMMWORD[r11] + vmovdqu XMMWORD[(-64)+r9],xmm8 + + vzeroupper + movaps xmm6,XMMWORD[((-216))+rax] + movaps xmm7,XMMWORD[((-200))+rax] + movaps xmm8,XMMWORD[((-184))+rax] + movaps xmm9,XMMWORD[((-168))+rax] + movaps xmm10,XMMWORD[((-152))+rax] + movaps xmm11,XMMWORD[((-136))+rax] + movaps xmm12,XMMWORD[((-120))+rax] + movaps xmm13,XMMWORD[((-104))+rax] + movaps xmm14,XMMWORD[((-88))+rax] + movaps xmm15,XMMWORD[((-72))+rax] + mov r15,QWORD[((-48))+rax] + + mov r14,QWORD[((-40))+rax] + + mov r13,QWORD[((-32))+rax] + + mov r12,QWORD[((-24))+rax] + + mov rbp,QWORD[((-16))+rax] + + mov rbx,QWORD[((-8))+rax] + + lea rsp,[rax] + +$L$gcm_dec_abort: + mov rax,r10 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_aesni_gcm_decrypt: + +ALIGN 32 +_aesni_ctr32_6x: + + vmovdqu xmm4,XMMWORD[((0-128))+rcx] + vmovdqu xmm2,XMMWORD[32+r11] + lea r13,[((-1))+rbp] + vmovups xmm15,XMMWORD[((16-128))+rcx] + 
lea r12,[((32-128))+rcx] + vpxor xmm9,xmm1,xmm4 + add ebx,100663296 + jc NEAR $L$handle_ctr32_2 + vpaddb xmm10,xmm1,xmm2 + vpaddb xmm11,xmm10,xmm2 + vpxor xmm10,xmm10,xmm4 + vpaddb xmm12,xmm11,xmm2 + vpxor xmm11,xmm11,xmm4 + vpaddb xmm13,xmm12,xmm2 + vpxor xmm12,xmm12,xmm4 + vpaddb xmm14,xmm13,xmm2 + vpxor xmm13,xmm13,xmm4 + vpaddb xmm1,xmm14,xmm2 + vpxor xmm14,xmm14,xmm4 + jmp NEAR $L$oop_ctr32 + +ALIGN 16 +$L$oop_ctr32: + vaesenc xmm9,xmm9,xmm15 + vaesenc xmm10,xmm10,xmm15 + vaesenc xmm11,xmm11,xmm15 + vaesenc xmm12,xmm12,xmm15 + vaesenc xmm13,xmm13,xmm15 + vaesenc xmm14,xmm14,xmm15 + vmovups xmm15,XMMWORD[r12] + lea r12,[16+r12] + dec r13d + jnz NEAR $L$oop_ctr32 + + vmovdqu xmm3,XMMWORD[r12] + vaesenc xmm9,xmm9,xmm15 + vpxor xmm4,xmm3,XMMWORD[rdi] + vaesenc xmm10,xmm10,xmm15 + vpxor xmm5,xmm3,XMMWORD[16+rdi] + vaesenc xmm11,xmm11,xmm15 + vpxor xmm6,xmm3,XMMWORD[32+rdi] + vaesenc xmm12,xmm12,xmm15 + vpxor xmm8,xmm3,XMMWORD[48+rdi] + vaesenc xmm13,xmm13,xmm15 + vpxor xmm2,xmm3,XMMWORD[64+rdi] + vaesenc xmm14,xmm14,xmm15 + vpxor xmm3,xmm3,XMMWORD[80+rdi] + lea rdi,[96+rdi] + + vaesenclast xmm9,xmm9,xmm4 + vaesenclast xmm10,xmm10,xmm5 + vaesenclast xmm11,xmm11,xmm6 + vaesenclast xmm12,xmm12,xmm8 + vaesenclast xmm13,xmm13,xmm2 + vaesenclast xmm14,xmm14,xmm3 + vmovups XMMWORD[rsi],xmm9 + vmovups XMMWORD[16+rsi],xmm10 + vmovups XMMWORD[32+rsi],xmm11 + vmovups XMMWORD[48+rsi],xmm12 + vmovups XMMWORD[64+rsi],xmm13 + vmovups XMMWORD[80+rsi],xmm14 + lea rsi,[96+rsi] + + DB 0F3h,0C3h ;repret +ALIGN 32 +$L$handle_ctr32_2: + vpshufb xmm6,xmm1,xmm0 + vmovdqu xmm5,XMMWORD[48+r11] + vpaddd xmm10,xmm6,XMMWORD[64+r11] + vpaddd xmm11,xmm6,xmm5 + vpaddd xmm12,xmm10,xmm5 + vpshufb xmm10,xmm10,xmm0 + vpaddd xmm13,xmm11,xmm5 + vpshufb xmm11,xmm11,xmm0 + vpxor xmm10,xmm10,xmm4 + vpaddd xmm14,xmm12,xmm5 + vpshufb xmm12,xmm12,xmm0 + vpxor xmm11,xmm11,xmm4 + vpaddd xmm1,xmm13,xmm5 + vpshufb xmm13,xmm13,xmm0 + vpxor xmm12,xmm12,xmm4 + vpshufb xmm14,xmm14,xmm0 + vpxor xmm13,xmm13,xmm4 + vpshufb xmm1,xmm1,xmm0 + vpxor xmm14,xmm14,xmm4 + jmp NEAR $L$oop_ctr32 + + + +global aesni_gcm_encrypt + +ALIGN 32 +aesni_gcm_encrypt: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aesni_gcm_encrypt: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + + xor r10,r10 + + + + + cmp rdx,0x60*3 + jb NEAR $L$gcm_enc_abort + + lea rax,[rsp] + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + lea rsp,[((-168))+rsp] + movaps XMMWORD[(-216)+rax],xmm6 + movaps XMMWORD[(-200)+rax],xmm7 + movaps XMMWORD[(-184)+rax],xmm8 + movaps XMMWORD[(-168)+rax],xmm9 + movaps XMMWORD[(-152)+rax],xmm10 + movaps XMMWORD[(-136)+rax],xmm11 + movaps XMMWORD[(-120)+rax],xmm12 + movaps XMMWORD[(-104)+rax],xmm13 + movaps XMMWORD[(-88)+rax],xmm14 + movaps XMMWORD[(-72)+rax],xmm15 +$L$gcm_enc_body: + vzeroupper + + vmovdqu xmm1,XMMWORD[r8] + add rsp,-128 + mov ebx,DWORD[12+r8] + lea r11,[$L$bswap_mask] + lea r14,[((-128))+rcx] + mov r15,0xf80 + lea rcx,[128+rcx] + vmovdqu xmm0,XMMWORD[r11] + and rsp,-128 + mov ebp,DWORD[((240-128))+rcx] + + and r14,r15 + and r15,rsp + sub r15,r14 + jc NEAR $L$enc_no_key_aliasing + cmp r15,768 + jnc NEAR $L$enc_no_key_aliasing + sub rsp,r15 +$L$enc_no_key_aliasing: + + lea r14,[rsi] + + + + + + + + + lea r15,[((-192))+rdx*1+rsi] + + shr rdx,4 + + call _aesni_ctr32_6x + vpshufb xmm8,xmm9,xmm0 + vpshufb xmm2,xmm10,xmm0 + vmovdqu XMMWORD[112+rsp],xmm8 + vpshufb xmm4,xmm11,xmm0 + vmovdqu XMMWORD[96+rsp],xmm2 + 
vpshufb xmm5,xmm12,xmm0 + vmovdqu XMMWORD[80+rsp],xmm4 + vpshufb xmm6,xmm13,xmm0 + vmovdqu XMMWORD[64+rsp],xmm5 + vpshufb xmm7,xmm14,xmm0 + vmovdqu XMMWORD[48+rsp],xmm6 + + call _aesni_ctr32_6x + + vmovdqu xmm8,XMMWORD[r9] + lea r9,[((32+32))+r9] + sub rdx,12 + mov r10,0x60*2 + vpshufb xmm8,xmm8,xmm0 + + call _aesni_ctr32_ghash_6x + vmovdqu xmm7,XMMWORD[32+rsp] + vmovdqu xmm0,XMMWORD[r11] + vmovdqu xmm3,XMMWORD[((0-32))+r9] + vpunpckhqdq xmm1,xmm7,xmm7 + vmovdqu xmm15,XMMWORD[((32-32))+r9] + vmovups XMMWORD[(-96)+rsi],xmm9 + vpshufb xmm9,xmm9,xmm0 + vpxor xmm1,xmm1,xmm7 + vmovups XMMWORD[(-80)+rsi],xmm10 + vpshufb xmm10,xmm10,xmm0 + vmovups XMMWORD[(-64)+rsi],xmm11 + vpshufb xmm11,xmm11,xmm0 + vmovups XMMWORD[(-48)+rsi],xmm12 + vpshufb xmm12,xmm12,xmm0 + vmovups XMMWORD[(-32)+rsi],xmm13 + vpshufb xmm13,xmm13,xmm0 + vmovups XMMWORD[(-16)+rsi],xmm14 + vpshufb xmm14,xmm14,xmm0 + vmovdqu XMMWORD[16+rsp],xmm9 + vmovdqu xmm6,XMMWORD[48+rsp] + vmovdqu xmm0,XMMWORD[((16-32))+r9] + vpunpckhqdq xmm2,xmm6,xmm6 + vpclmulqdq xmm5,xmm7,xmm3,0x00 + vpxor xmm2,xmm2,xmm6 + vpclmulqdq xmm7,xmm7,xmm3,0x11 + vpclmulqdq xmm1,xmm1,xmm15,0x00 + + vmovdqu xmm9,XMMWORD[64+rsp] + vpclmulqdq xmm4,xmm6,xmm0,0x00 + vmovdqu xmm3,XMMWORD[((48-32))+r9] + vpxor xmm4,xmm4,xmm5 + vpunpckhqdq xmm5,xmm9,xmm9 + vpclmulqdq xmm6,xmm6,xmm0,0x11 + vpxor xmm5,xmm5,xmm9 + vpxor xmm6,xmm6,xmm7 + vpclmulqdq xmm2,xmm2,xmm15,0x10 + vmovdqu xmm15,XMMWORD[((80-32))+r9] + vpxor xmm2,xmm2,xmm1 + + vmovdqu xmm1,XMMWORD[80+rsp] + vpclmulqdq xmm7,xmm9,xmm3,0x00 + vmovdqu xmm0,XMMWORD[((64-32))+r9] + vpxor xmm7,xmm7,xmm4 + vpunpckhqdq xmm4,xmm1,xmm1 + vpclmulqdq xmm9,xmm9,xmm3,0x11 + vpxor xmm4,xmm4,xmm1 + vpxor xmm9,xmm9,xmm6 + vpclmulqdq xmm5,xmm5,xmm15,0x00 + vpxor xmm5,xmm5,xmm2 + + vmovdqu xmm2,XMMWORD[96+rsp] + vpclmulqdq xmm6,xmm1,xmm0,0x00 + vmovdqu xmm3,XMMWORD[((96-32))+r9] + vpxor xmm6,xmm6,xmm7 + vpunpckhqdq xmm7,xmm2,xmm2 + vpclmulqdq xmm1,xmm1,xmm0,0x11 + vpxor xmm7,xmm7,xmm2 + vpxor xmm1,xmm1,xmm9 + vpclmulqdq xmm4,xmm4,xmm15,0x10 + vmovdqu xmm15,XMMWORD[((128-32))+r9] + vpxor xmm4,xmm4,xmm5 + + vpxor xmm8,xmm8,XMMWORD[112+rsp] + vpclmulqdq xmm5,xmm2,xmm3,0x00 + vmovdqu xmm0,XMMWORD[((112-32))+r9] + vpunpckhqdq xmm9,xmm8,xmm8 + vpxor xmm5,xmm5,xmm6 + vpclmulqdq xmm2,xmm2,xmm3,0x11 + vpxor xmm9,xmm9,xmm8 + vpxor xmm2,xmm2,xmm1 + vpclmulqdq xmm7,xmm7,xmm15,0x00 + vpxor xmm4,xmm7,xmm4 + + vpclmulqdq xmm6,xmm8,xmm0,0x00 + vmovdqu xmm3,XMMWORD[((0-32))+r9] + vpunpckhqdq xmm1,xmm14,xmm14 + vpclmulqdq xmm8,xmm8,xmm0,0x11 + vpxor xmm1,xmm1,xmm14 + vpxor xmm5,xmm6,xmm5 + vpclmulqdq xmm9,xmm9,xmm15,0x10 + vmovdqu xmm15,XMMWORD[((32-32))+r9] + vpxor xmm7,xmm8,xmm2 + vpxor xmm6,xmm9,xmm4 + + vmovdqu xmm0,XMMWORD[((16-32))+r9] + vpxor xmm9,xmm7,xmm5 + vpclmulqdq xmm4,xmm14,xmm3,0x00 + vpxor xmm6,xmm6,xmm9 + vpunpckhqdq xmm2,xmm13,xmm13 + vpclmulqdq xmm14,xmm14,xmm3,0x11 + vpxor xmm2,xmm2,xmm13 + vpslldq xmm9,xmm6,8 + vpclmulqdq xmm1,xmm1,xmm15,0x00 + vpxor xmm8,xmm5,xmm9 + vpsrldq xmm6,xmm6,8 + vpxor xmm7,xmm7,xmm6 + + vpclmulqdq xmm5,xmm13,xmm0,0x00 + vmovdqu xmm3,XMMWORD[((48-32))+r9] + vpxor xmm5,xmm5,xmm4 + vpunpckhqdq xmm9,xmm12,xmm12 + vpclmulqdq xmm13,xmm13,xmm0,0x11 + vpxor xmm9,xmm9,xmm12 + vpxor xmm13,xmm13,xmm14 + vpalignr xmm14,xmm8,xmm8,8 + vpclmulqdq xmm2,xmm2,xmm15,0x10 + vmovdqu xmm15,XMMWORD[((80-32))+r9] + vpxor xmm2,xmm2,xmm1 + + vpclmulqdq xmm4,xmm12,xmm3,0x00 + vmovdqu xmm0,XMMWORD[((64-32))+r9] + vpxor xmm4,xmm4,xmm5 + vpunpckhqdq xmm1,xmm11,xmm11 + vpclmulqdq xmm12,xmm12,xmm3,0x11 + vpxor xmm1,xmm1,xmm11 + vpxor 
xmm12,xmm12,xmm13 + vxorps xmm7,xmm7,XMMWORD[16+rsp] + vpclmulqdq xmm9,xmm9,xmm15,0x00 + vpxor xmm9,xmm9,xmm2 + + vpclmulqdq xmm8,xmm8,XMMWORD[16+r11],0x10 + vxorps xmm8,xmm8,xmm14 + + vpclmulqdq xmm5,xmm11,xmm0,0x00 + vmovdqu xmm3,XMMWORD[((96-32))+r9] + vpxor xmm5,xmm5,xmm4 + vpunpckhqdq xmm2,xmm10,xmm10 + vpclmulqdq xmm11,xmm11,xmm0,0x11 + vpxor xmm2,xmm2,xmm10 + vpalignr xmm14,xmm8,xmm8,8 + vpxor xmm11,xmm11,xmm12 + vpclmulqdq xmm1,xmm1,xmm15,0x10 + vmovdqu xmm15,XMMWORD[((128-32))+r9] + vpxor xmm1,xmm1,xmm9 + + vxorps xmm14,xmm14,xmm7 + vpclmulqdq xmm8,xmm8,XMMWORD[16+r11],0x10 + vxorps xmm8,xmm8,xmm14 + + vpclmulqdq xmm4,xmm10,xmm3,0x00 + vmovdqu xmm0,XMMWORD[((112-32))+r9] + vpxor xmm4,xmm4,xmm5 + vpunpckhqdq xmm9,xmm8,xmm8 + vpclmulqdq xmm10,xmm10,xmm3,0x11 + vpxor xmm9,xmm9,xmm8 + vpxor xmm10,xmm10,xmm11 + vpclmulqdq xmm2,xmm2,xmm15,0x00 + vpxor xmm2,xmm2,xmm1 + + vpclmulqdq xmm5,xmm8,xmm0,0x00 + vpclmulqdq xmm7,xmm8,xmm0,0x11 + vpxor xmm5,xmm5,xmm4 + vpclmulqdq xmm6,xmm9,xmm15,0x10 + vpxor xmm7,xmm7,xmm10 + vpxor xmm6,xmm6,xmm2 + + vpxor xmm4,xmm7,xmm5 + vpxor xmm6,xmm6,xmm4 + vpslldq xmm1,xmm6,8 + vmovdqu xmm3,XMMWORD[16+r11] + vpsrldq xmm6,xmm6,8 + vpxor xmm8,xmm5,xmm1 + vpxor xmm7,xmm7,xmm6 + + vpalignr xmm2,xmm8,xmm8,8 + vpclmulqdq xmm8,xmm8,xmm3,0x10 + vpxor xmm8,xmm8,xmm2 + + vpalignr xmm2,xmm8,xmm8,8 + vpclmulqdq xmm8,xmm8,xmm3,0x10 + vpxor xmm2,xmm2,xmm7 + vpxor xmm8,xmm8,xmm2 + vpshufb xmm8,xmm8,XMMWORD[r11] + vmovdqu XMMWORD[(-64)+r9],xmm8 + + vzeroupper + movaps xmm6,XMMWORD[((-216))+rax] + movaps xmm7,XMMWORD[((-200))+rax] + movaps xmm8,XMMWORD[((-184))+rax] + movaps xmm9,XMMWORD[((-168))+rax] + movaps xmm10,XMMWORD[((-152))+rax] + movaps xmm11,XMMWORD[((-136))+rax] + movaps xmm12,XMMWORD[((-120))+rax] + movaps xmm13,XMMWORD[((-104))+rax] + movaps xmm14,XMMWORD[((-88))+rax] + movaps xmm15,XMMWORD[((-72))+rax] + mov r15,QWORD[((-48))+rax] + + mov r14,QWORD[((-40))+rax] + + mov r13,QWORD[((-32))+rax] + + mov r12,QWORD[((-24))+rax] + + mov rbp,QWORD[((-16))+rax] + + mov rbx,QWORD[((-8))+rax] + + lea rsp,[rax] + +$L$gcm_enc_abort: + mov rax,r10 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_aesni_gcm_encrypt: +ALIGN 64 +$L$bswap_mask: +DB 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +$L$poly: +DB 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +$L$one_msb: +DB 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +$L$two_lsb: +DB 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +$L$one_lsb: +DB 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +DB 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108 +DB 101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82 +DB 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112 +DB 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +ALIGN 64 +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +gcm_se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + mov rax,QWORD[120+r8] + + mov r15,QWORD[((-48))+rax] + mov r14,QWORD[((-40))+rax] + mov r13,QWORD[((-32))+rax] + mov r12,QWORD[((-24))+rax] + mov rbp,QWORD[((-16))+rax] + mov rbx,QWORD[((-8))+rax] + mov QWORD[240+r8],r15 + mov QWORD[232+r8],r14 + mov QWORD[224+r8],r13 + mov QWORD[216+r8],r12 + mov QWORD[160+r8],rbp + mov QWORD[144+r8],rbx + + lea 
rsi,[((-216))+rax] + lea rdi,[512+r8] + mov ecx,20 + DD 0xa548f3fc + +$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + DB 0F3h,0C3h ;repret + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_aesni_gcm_decrypt wrt ..imagebase + DD $L$SEH_end_aesni_gcm_decrypt wrt ..imagebase + DD $L$SEH_gcm_dec_info wrt ..imagebase + + DD $L$SEH_begin_aesni_gcm_encrypt wrt ..imagebase + DD $L$SEH_end_aesni_gcm_encrypt wrt ..imagebase + DD $L$SEH_gcm_enc_info wrt ..imagebase +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_gcm_dec_info: +DB 9,0,0,0 + DD gcm_se_handler wrt ..imagebase + DD $L$gcm_dec_body wrt ..imagebase,$L$gcm_dec_abort wrt ..imagebase +$L$SEH_gcm_enc_info: +DB 9,0,0,0 + DD gcm_se_handler wrt ..imagebase + DD $L$gcm_enc_body wrt ..imagebase,$L$gcm_enc_abort wrt ..imagebase diff --git a/packager/third_party/boringssl/win-x86_64/crypto/aes/aesni-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/aesni-x86_64.asm similarity index 77% rename from packager/third_party/boringssl/win-x86_64/crypto/aes/aesni-x86_64.asm rename to packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/aesni-x86_64.asm index cf313d1ae9..13e9c5e5b6 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/aes/aesni-x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/aesni-x86_64.asm @@ -1129,22 +1129,21 @@ DB 102,15,56,221,209 ALIGN 16 $L$ctr32_bulk: - lea rax,[rsp] + lea r11,[rsp] push rbp sub rsp,288 and rsp,-16 - movaps XMMWORD[(-168)+rax],xmm6 - movaps XMMWORD[(-152)+rax],xmm7 - movaps XMMWORD[(-136)+rax],xmm8 - movaps XMMWORD[(-120)+rax],xmm9 - movaps XMMWORD[(-104)+rax],xmm10 - movaps XMMWORD[(-88)+rax],xmm11 - movaps XMMWORD[(-72)+rax],xmm12 - movaps XMMWORD[(-56)+rax],xmm13 - movaps XMMWORD[(-40)+rax],xmm14 - movaps XMMWORD[(-24)+rax],xmm15 + movaps XMMWORD[(-168)+r11],xmm6 + movaps XMMWORD[(-152)+r11],xmm7 + movaps XMMWORD[(-136)+r11],xmm8 + movaps XMMWORD[(-120)+r11],xmm9 + movaps XMMWORD[(-104)+r11],xmm10 + movaps XMMWORD[(-88)+r11],xmm11 + movaps XMMWORD[(-72)+r11],xmm12 + movaps XMMWORD[(-56)+r11],xmm13 + movaps XMMWORD[(-40)+r11],xmm14 + movaps XMMWORD[(-24)+r11],xmm15 $L$ctr32_body: - lea rbp,[((-8))+rax] @@ -1153,7 +1152,7 @@ $L$ctr32_body: movdqu xmm0,XMMWORD[rcx] mov r8d,DWORD[12+r8] pxor xmm2,xmm0 - mov r11d,DWORD[12+rcx] + mov ebp,DWORD[12+rcx] movdqa XMMWORD[rsp],xmm2 bswap r8d movdqa xmm3,xmm2 @@ -1169,8 +1168,8 @@ $L$ctr32_body: lea rdx,[2+r8] bswap eax bswap edx - xor eax,r11d - xor edx,r11d + xor eax,ebp + xor edx,ebp DB 102,15,58,34,216,3 lea rax,[3+r8] movdqa XMMWORD[16+rsp],xmm3 @@ -1179,25 +1178,26 @@ DB 102,15,58,34,226,3 mov rdx,r10 lea r10,[4+r8] movdqa XMMWORD[32+rsp],xmm4 - xor eax,r11d + xor eax,ebp bswap r10d DB 102,15,58,34,232,3 - xor r10d,r11d + xor r10d,ebp movdqa XMMWORD[48+rsp],xmm5 lea r9,[5+r8] mov DWORD[((64+12))+rsp],r10d bswap r9d lea r10,[6+r8] mov eax,DWORD[240+rcx] - xor r9d,r11d + xor r9d,ebp bswap r10d mov DWORD[((80+12))+rsp],r9d - xor r10d,r11d + xor 
r10d,ebp lea r9,[7+r8] mov DWORD[((96+12))+rsp],r10d bswap r9d - mov r10d,DWORD[((OPENSSL_ia32cap_P+4))] - xor r9d,r11d + lea r10,[OPENSSL_ia32cap_P] + mov r10d,DWORD[4+r10] + xor r9d,ebp and r10d,71303168 mov DWORD[((112+12))+rsp],r9d @@ -1221,7 +1221,7 @@ ALIGN 16 $L$ctr32_6x: shl eax,4 mov r10d,48 - bswap r11d + bswap ebp lea rcx,[32+rax*1+rcx] sub r10,rax jmp NEAR $L$ctr32_loop6 @@ -1232,32 +1232,32 @@ $L$ctr32_loop6: movups xmm0,XMMWORD[((-48))+r10*1+rcx] DB 102,15,56,220,209 mov eax,r8d - xor eax,r11d + xor eax,ebp DB 102,15,56,220,217 DB 0x0f,0x38,0xf1,0x44,0x24,12 lea eax,[1+r8] DB 102,15,56,220,225 - xor eax,r11d + xor eax,ebp DB 0x0f,0x38,0xf1,0x44,0x24,28 DB 102,15,56,220,233 lea eax,[2+r8] - xor eax,r11d + xor eax,ebp DB 102,15,56,220,241 DB 0x0f,0x38,0xf1,0x44,0x24,44 lea eax,[3+r8] DB 102,15,56,220,249 movups xmm1,XMMWORD[((-32))+r10*1+rcx] - xor eax,r11d + xor eax,ebp DB 102,15,56,220,208 DB 0x0f,0x38,0xf1,0x44,0x24,60 lea eax,[4+r8] DB 102,15,56,220,216 - xor eax,r11d + xor eax,ebp DB 0x0f,0x38,0xf1,0x44,0x24,76 DB 102,15,56,220,224 lea eax,[5+r8] - xor eax,r11d + xor eax,ebp DB 102,15,56,220,232 DB 0x0f,0x38,0xf1,0x44,0x24,92 mov rax,r10 @@ -1318,7 +1318,7 @@ DB 102,15,56,220,217 bswap r9d movups xmm0,XMMWORD[((32-128))+rcx] DB 102,15,56,220,225 - xor r9d,r11d + xor r9d,ebp nop DB 102,15,56,220,233 mov DWORD[((0+12))+rsp],r9d @@ -1331,7 +1331,7 @@ DB 102,68,15,56,220,201 bswap r9d DB 102,15,56,220,208 DB 102,15,56,220,216 - xor r9d,r11d + xor r9d,ebp DB 0x66,0x90 DB 102,15,56,220,224 DB 102,15,56,220,232 @@ -1345,7 +1345,7 @@ DB 102,68,15,56,220,200 bswap r9d DB 102,15,56,220,209 DB 102,15,56,220,217 - xor r9d,r11d + xor r9d,ebp DB 0x66,0x90 DB 102,15,56,220,225 DB 102,15,56,220,233 @@ -1359,7 +1359,7 @@ DB 102,68,15,56,220,201 bswap r9d DB 102,15,56,220,208 DB 102,15,56,220,216 - xor r9d,r11d + xor r9d,ebp DB 0x66,0x90 DB 102,15,56,220,224 DB 102,15,56,220,232 @@ -1373,7 +1373,7 @@ DB 102,68,15,56,220,200 bswap r9d DB 102,15,56,220,209 DB 102,15,56,220,217 - xor r9d,r11d + xor r9d,ebp DB 0x66,0x90 DB 102,15,56,220,225 DB 102,15,56,220,233 @@ -1387,7 +1387,7 @@ DB 102,68,15,56,220,201 bswap r9d DB 102,15,56,220,208 DB 102,15,56,220,216 - xor r9d,r11d + xor r9d,ebp DB 0x66,0x90 DB 102,15,56,220,224 DB 102,15,56,220,232 @@ -1401,7 +1401,7 @@ DB 102,68,15,56,220,200 bswap r9d DB 102,15,56,220,209 DB 102,15,56,220,217 - xor r9d,r11d + xor r9d,ebp DB 0x66,0x90 DB 102,15,56,220,225 DB 102,15,56,220,233 @@ -1416,7 +1416,7 @@ DB 102,68,15,56,220,201 DB 102,15,56,220,208 DB 102,15,56,220,216 DB 102,15,56,220,224 - xor r9d,r11d + xor r9d,ebp movdqu xmm10,XMMWORD[rdi] DB 102,15,56,220,232 mov DWORD[((112+12))+rsp],r9d @@ -1651,32 +1651,32 @@ DB 102,15,56,221,225 $L$ctr32_done: xorps xmm0,xmm0 - xor r11d,r11d + xor ebp,ebp pxor xmm1,xmm1 pxor xmm2,xmm2 pxor xmm3,xmm3 pxor xmm4,xmm4 pxor xmm5,xmm5 - movaps xmm6,XMMWORD[((-160))+rbp] - movaps XMMWORD[(-160)+rbp],xmm0 - movaps xmm7,XMMWORD[((-144))+rbp] - movaps XMMWORD[(-144)+rbp],xmm0 - movaps xmm8,XMMWORD[((-128))+rbp] - movaps XMMWORD[(-128)+rbp],xmm0 - movaps xmm9,XMMWORD[((-112))+rbp] - movaps XMMWORD[(-112)+rbp],xmm0 - movaps xmm10,XMMWORD[((-96))+rbp] - movaps XMMWORD[(-96)+rbp],xmm0 - movaps xmm11,XMMWORD[((-80))+rbp] - movaps XMMWORD[(-80)+rbp],xmm0 - movaps xmm12,XMMWORD[((-64))+rbp] - movaps XMMWORD[(-64)+rbp],xmm0 - movaps xmm13,XMMWORD[((-48))+rbp] - movaps XMMWORD[(-48)+rbp],xmm0 - movaps xmm14,XMMWORD[((-32))+rbp] - movaps XMMWORD[(-32)+rbp],xmm0 - movaps xmm15,XMMWORD[((-16))+rbp] - movaps XMMWORD[(-16)+rbp],xmm0 + 
movaps xmm6,XMMWORD[((-168))+r11] + movaps XMMWORD[(-168)+r11],xmm0 + movaps xmm7,XMMWORD[((-152))+r11] + movaps XMMWORD[(-152)+r11],xmm0 + movaps xmm8,XMMWORD[((-136))+r11] + movaps XMMWORD[(-136)+r11],xmm0 + movaps xmm9,XMMWORD[((-120))+r11] + movaps XMMWORD[(-120)+r11],xmm0 + movaps xmm10,XMMWORD[((-104))+r11] + movaps XMMWORD[(-104)+r11],xmm0 + movaps xmm11,XMMWORD[((-88))+r11] + movaps XMMWORD[(-88)+r11],xmm0 + movaps xmm12,XMMWORD[((-72))+r11] + movaps XMMWORD[(-72)+r11],xmm0 + movaps xmm13,XMMWORD[((-56))+r11] + movaps XMMWORD[(-56)+r11],xmm0 + movaps xmm14,XMMWORD[((-40))+r11] + movaps XMMWORD[(-40)+r11],xmm0 + movaps xmm15,XMMWORD[((-24))+r11] + movaps XMMWORD[(-24)+r11],xmm0 movaps XMMWORD[rsp],xmm0 movaps XMMWORD[16+rsp],xmm0 movaps XMMWORD[32+rsp],xmm0 @@ -1685,8 +1685,8 @@ $L$ctr32_done: movaps XMMWORD[80+rsp],xmm0 movaps XMMWORD[96+rsp],xmm0 movaps XMMWORD[112+rsp],xmm0 - lea rsp,[rbp] - pop rbp + mov rbp,QWORD[((-8))+r11] + lea rsp,[r11] $L$ctr32_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] @@ -1708,22 +1708,21 @@ $L$SEH_begin_aesni_xts_encrypt: mov r9,QWORD[48+rsp] - lea rax,[rsp] + lea r11,[rsp] push rbp sub rsp,272 and rsp,-16 - movaps XMMWORD[(-168)+rax],xmm6 - movaps XMMWORD[(-152)+rax],xmm7 - movaps XMMWORD[(-136)+rax],xmm8 - movaps XMMWORD[(-120)+rax],xmm9 - movaps XMMWORD[(-104)+rax],xmm10 - movaps XMMWORD[(-88)+rax],xmm11 - movaps XMMWORD[(-72)+rax],xmm12 - movaps XMMWORD[(-56)+rax],xmm13 - movaps XMMWORD[(-40)+rax],xmm14 - movaps XMMWORD[(-24)+rax],xmm15 + movaps XMMWORD[(-168)+r11],xmm6 + movaps XMMWORD[(-152)+r11],xmm7 + movaps XMMWORD[(-136)+r11],xmm8 + movaps XMMWORD[(-120)+r11],xmm9 + movaps XMMWORD[(-104)+r11],xmm10 + movaps XMMWORD[(-88)+r11],xmm11 + movaps XMMWORD[(-72)+r11],xmm12 + movaps XMMWORD[(-56)+r11],xmm13 + movaps XMMWORD[(-40)+r11],xmm14 + movaps XMMWORD[(-24)+r11],xmm15 $L$xts_enc_body: - lea rbp,[((-8))+rax] movups xmm2,XMMWORD[r9] mov eax,DWORD[240+r8] mov r10d,DWORD[240+rcx] @@ -1739,7 +1738,7 @@ DB 102,15,56,220,209 jnz NEAR $L$oop_enc1_8 DB 102,15,56,221,209 movups xmm0,XMMWORD[rcx] - mov r11,rcx + mov rbp,rcx mov eax,r10d shl r10d,4 mov r9,rdx @@ -1795,9 +1794,9 @@ DB 102,15,56,221,209 jc NEAR $L$xts_enc_short mov eax,16+96 - lea rcx,[32+r10*1+r11] + lea rcx,[32+r10*1+rbp] sub rax,r10 - movups xmm1,XMMWORD[16+r11] + movups xmm1,XMMWORD[16+rbp] mov r10,rax lea r8,[$L$xts_magic] jmp NEAR $L$xts_enc_grandloop @@ -1822,7 +1821,7 @@ DB 102,15,56,220,225 movdqa xmm9,XMMWORD[96+rsp] pxor xmm6,xmm14 DB 102,15,56,220,233 - movups xmm0,XMMWORD[32+r11] + movups xmm0,XMMWORD[32+rbp] lea rdi,[96+rdi] pxor xmm7,xmm8 @@ -1831,7 +1830,7 @@ DB 102,15,56,220,241 pxor xmm11,xmm9 movdqa XMMWORD[rsp],xmm10 DB 102,15,56,220,249 - movups xmm1,XMMWORD[48+r11] + movups xmm1,XMMWORD[48+rbp] pxor xmm12,xmm9 DB 102,15,56,220,208 @@ -1846,7 +1845,7 @@ DB 102,15,56,220,232 movdqa XMMWORD[64+rsp],xmm14 DB 102,15,56,220,240 DB 102,15,56,220,248 - movups xmm0,XMMWORD[64+r11] + movups xmm0,XMMWORD[64+rbp] movdqa XMMWORD[80+rsp],xmm8 pshufd xmm9,xmm15,0x5f jmp NEAR $L$xts_enc_loop6 @@ -1878,7 +1877,7 @@ DB 102,15,56,220,209 psrad xmm14,31 DB 102,15,56,220,217 pand xmm14,xmm8 - movups xmm10,XMMWORD[r11] + movups xmm10,XMMWORD[rbp] DB 102,15,56,220,225 DB 102,15,56,220,233 DB 102,15,56,220,241 @@ -1946,10 +1945,10 @@ DB 102,15,56,220,217 DB 102,15,56,220,225 DB 102,15,56,220,233 pxor xmm15,xmm0 - movups xmm0,XMMWORD[r11] + movups xmm0,XMMWORD[rbp] DB 102,15,56,220,241 DB 102,15,56,220,249 - movups xmm1,XMMWORD[16+r11] + movups xmm1,XMMWORD[16+rbp] pxor 
xmm14,xmm15 DB 102,15,56,221,84,36,0 @@ -1976,7 +1975,7 @@ DB 102,15,56,221,124,36,80 mov eax,16+96 sub eax,r10d - mov rcx,r11 + mov rcx,rbp shr eax,4 $L$xts_enc_short: @@ -2132,7 +2131,7 @@ $L$xts_enc_steal: jnz NEAR $L$xts_enc_steal sub rsi,r9 - mov rcx,r11 + mov rcx,rbp mov eax,r10d movups xmm2,XMMWORD[((-16))+rsi] @@ -2158,26 +2157,26 @@ $L$xts_enc_ret: pxor xmm3,xmm3 pxor xmm4,xmm4 pxor xmm5,xmm5 - movaps xmm6,XMMWORD[((-160))+rbp] - movaps XMMWORD[(-160)+rbp],xmm0 - movaps xmm7,XMMWORD[((-144))+rbp] - movaps XMMWORD[(-144)+rbp],xmm0 - movaps xmm8,XMMWORD[((-128))+rbp] - movaps XMMWORD[(-128)+rbp],xmm0 - movaps xmm9,XMMWORD[((-112))+rbp] - movaps XMMWORD[(-112)+rbp],xmm0 - movaps xmm10,XMMWORD[((-96))+rbp] - movaps XMMWORD[(-96)+rbp],xmm0 - movaps xmm11,XMMWORD[((-80))+rbp] - movaps XMMWORD[(-80)+rbp],xmm0 - movaps xmm12,XMMWORD[((-64))+rbp] - movaps XMMWORD[(-64)+rbp],xmm0 - movaps xmm13,XMMWORD[((-48))+rbp] - movaps XMMWORD[(-48)+rbp],xmm0 - movaps xmm14,XMMWORD[((-32))+rbp] - movaps XMMWORD[(-32)+rbp],xmm0 - movaps xmm15,XMMWORD[((-16))+rbp] - movaps XMMWORD[(-16)+rbp],xmm0 + movaps xmm6,XMMWORD[((-168))+r11] + movaps XMMWORD[(-168)+r11],xmm0 + movaps xmm7,XMMWORD[((-152))+r11] + movaps XMMWORD[(-152)+r11],xmm0 + movaps xmm8,XMMWORD[((-136))+r11] + movaps XMMWORD[(-136)+r11],xmm0 + movaps xmm9,XMMWORD[((-120))+r11] + movaps XMMWORD[(-120)+r11],xmm0 + movaps xmm10,XMMWORD[((-104))+r11] + movaps XMMWORD[(-104)+r11],xmm0 + movaps xmm11,XMMWORD[((-88))+r11] + movaps XMMWORD[(-88)+r11],xmm0 + movaps xmm12,XMMWORD[((-72))+r11] + movaps XMMWORD[(-72)+r11],xmm0 + movaps xmm13,XMMWORD[((-56))+r11] + movaps XMMWORD[(-56)+r11],xmm0 + movaps xmm14,XMMWORD[((-40))+r11] + movaps XMMWORD[(-40)+r11],xmm0 + movaps xmm15,XMMWORD[((-24))+r11] + movaps XMMWORD[(-24)+r11],xmm0 movaps XMMWORD[rsp],xmm0 movaps XMMWORD[16+rsp],xmm0 movaps XMMWORD[32+rsp],xmm0 @@ -2185,8 +2184,8 @@ $L$xts_enc_ret: movaps XMMWORD[64+rsp],xmm0 movaps XMMWORD[80+rsp],xmm0 movaps XMMWORD[96+rsp],xmm0 - lea rsp,[rbp] - pop rbp + mov rbp,QWORD[((-8))+r11] + lea rsp,[r11] $L$xts_enc_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] @@ -2208,22 +2207,21 @@ $L$SEH_begin_aesni_xts_decrypt: mov r9,QWORD[48+rsp] - lea rax,[rsp] + lea r11,[rsp] push rbp sub rsp,272 and rsp,-16 - movaps XMMWORD[(-168)+rax],xmm6 - movaps XMMWORD[(-152)+rax],xmm7 - movaps XMMWORD[(-136)+rax],xmm8 - movaps XMMWORD[(-120)+rax],xmm9 - movaps XMMWORD[(-104)+rax],xmm10 - movaps XMMWORD[(-88)+rax],xmm11 - movaps XMMWORD[(-72)+rax],xmm12 - movaps XMMWORD[(-56)+rax],xmm13 - movaps XMMWORD[(-40)+rax],xmm14 - movaps XMMWORD[(-24)+rax],xmm15 + movaps XMMWORD[(-168)+r11],xmm6 + movaps XMMWORD[(-152)+r11],xmm7 + movaps XMMWORD[(-136)+r11],xmm8 + movaps XMMWORD[(-120)+r11],xmm9 + movaps XMMWORD[(-104)+r11],xmm10 + movaps XMMWORD[(-88)+r11],xmm11 + movaps XMMWORD[(-72)+r11],xmm12 + movaps XMMWORD[(-56)+r11],xmm13 + movaps XMMWORD[(-40)+r11],xmm14 + movaps XMMWORD[(-24)+r11],xmm15 $L$xts_dec_body: - lea rbp,[((-8))+rax] movups xmm2,XMMWORD[r9] mov eax,DWORD[240+r8] mov r10d,DWORD[240+rcx] @@ -2245,7 +2243,7 @@ DB 102,15,56,221,209 sub rdx,rax movups xmm0,XMMWORD[rcx] - mov r11,rcx + mov rbp,rcx mov eax,r10d shl r10d,4 mov r9,rdx @@ -2301,9 +2299,9 @@ DB 102,15,56,221,209 jc NEAR $L$xts_dec_short mov eax,16+96 - lea rcx,[32+r10*1+r11] + lea rcx,[32+r10*1+rbp] sub rax,r10 - movups xmm1,XMMWORD[16+r11] + movups xmm1,XMMWORD[16+rbp] mov r10,rax lea r8,[$L$xts_magic] jmp NEAR $L$xts_dec_grandloop @@ -2328,7 +2326,7 @@ DB 102,15,56,222,225 movdqa 
xmm9,XMMWORD[96+rsp] pxor xmm6,xmm14 DB 102,15,56,222,233 - movups xmm0,XMMWORD[32+r11] + movups xmm0,XMMWORD[32+rbp] lea rdi,[96+rdi] pxor xmm7,xmm8 @@ -2337,7 +2335,7 @@ DB 102,15,56,222,241 pxor xmm11,xmm9 movdqa XMMWORD[rsp],xmm10 DB 102,15,56,222,249 - movups xmm1,XMMWORD[48+r11] + movups xmm1,XMMWORD[48+rbp] pxor xmm12,xmm9 DB 102,15,56,222,208 @@ -2352,7 +2350,7 @@ DB 102,15,56,222,232 movdqa XMMWORD[64+rsp],xmm14 DB 102,15,56,222,240 DB 102,15,56,222,248 - movups xmm0,XMMWORD[64+r11] + movups xmm0,XMMWORD[64+rbp] movdqa XMMWORD[80+rsp],xmm8 pshufd xmm9,xmm15,0x5f jmp NEAR $L$xts_dec_loop6 @@ -2384,7 +2382,7 @@ DB 102,15,56,222,209 psrad xmm14,31 DB 102,15,56,222,217 pand xmm14,xmm8 - movups xmm10,XMMWORD[r11] + movups xmm10,XMMWORD[rbp] DB 102,15,56,222,225 DB 102,15,56,222,233 DB 102,15,56,222,241 @@ -2452,10 +2450,10 @@ DB 102,15,56,222,217 DB 102,15,56,222,225 DB 102,15,56,222,233 pxor xmm15,xmm0 - movups xmm0,XMMWORD[r11] + movups xmm0,XMMWORD[rbp] DB 102,15,56,222,241 DB 102,15,56,222,249 - movups xmm1,XMMWORD[16+r11] + movups xmm1,XMMWORD[16+rbp] pxor xmm14,xmm15 DB 102,15,56,223,84,36,0 @@ -2482,7 +2480,7 @@ DB 102,15,56,223,124,36,80 mov eax,16+96 sub eax,r10d - mov rcx,r11 + mov rcx,rbp shr eax,4 $L$xts_dec_short: @@ -2639,7 +2637,7 @@ $L$xts_dec_done: jz NEAR $L$xts_dec_ret $L$xts_dec_done2: mov rdx,r9 - mov rcx,r11 + mov rcx,rbp mov eax,r10d movups xmm2,XMMWORD[rdi] @@ -2669,7 +2667,7 @@ $L$xts_dec_steal: jnz NEAR $L$xts_dec_steal sub rsi,r9 - mov rcx,r11 + mov rcx,rbp mov eax,r10d movups xmm2,XMMWORD[rsi] @@ -2695,26 +2693,26 @@ $L$xts_dec_ret: pxor xmm3,xmm3 pxor xmm4,xmm4 pxor xmm5,xmm5 - movaps xmm6,XMMWORD[((-160))+rbp] - movaps XMMWORD[(-160)+rbp],xmm0 - movaps xmm7,XMMWORD[((-144))+rbp] - movaps XMMWORD[(-144)+rbp],xmm0 - movaps xmm8,XMMWORD[((-128))+rbp] - movaps XMMWORD[(-128)+rbp],xmm0 - movaps xmm9,XMMWORD[((-112))+rbp] - movaps XMMWORD[(-112)+rbp],xmm0 - movaps xmm10,XMMWORD[((-96))+rbp] - movaps XMMWORD[(-96)+rbp],xmm0 - movaps xmm11,XMMWORD[((-80))+rbp] - movaps XMMWORD[(-80)+rbp],xmm0 - movaps xmm12,XMMWORD[((-64))+rbp] - movaps XMMWORD[(-64)+rbp],xmm0 - movaps xmm13,XMMWORD[((-48))+rbp] - movaps XMMWORD[(-48)+rbp],xmm0 - movaps xmm14,XMMWORD[((-32))+rbp] - movaps XMMWORD[(-32)+rbp],xmm0 - movaps xmm15,XMMWORD[((-16))+rbp] - movaps XMMWORD[(-16)+rbp],xmm0 + movaps xmm6,XMMWORD[((-168))+r11] + movaps XMMWORD[(-168)+r11],xmm0 + movaps xmm7,XMMWORD[((-152))+r11] + movaps XMMWORD[(-152)+r11],xmm0 + movaps xmm8,XMMWORD[((-136))+r11] + movaps XMMWORD[(-136)+r11],xmm0 + movaps xmm9,XMMWORD[((-120))+r11] + movaps XMMWORD[(-120)+r11],xmm0 + movaps xmm10,XMMWORD[((-104))+r11] + movaps XMMWORD[(-104)+r11],xmm0 + movaps xmm11,XMMWORD[((-88))+r11] + movaps XMMWORD[(-88)+r11],xmm0 + movaps xmm12,XMMWORD[((-72))+r11] + movaps XMMWORD[(-72)+r11],xmm0 + movaps xmm13,XMMWORD[((-56))+r11] + movaps XMMWORD[(-56)+r11],xmm0 + movaps xmm14,XMMWORD[((-40))+r11] + movaps XMMWORD[(-40)+r11],xmm0 + movaps xmm15,XMMWORD[((-24))+r11] + movaps XMMWORD[(-24)+r11],xmm0 movaps XMMWORD[rsp],xmm0 movaps XMMWORD[16+rsp],xmm0 movaps XMMWORD[32+rsp],xmm0 @@ -2722,13 +2720,901 @@ $L$xts_dec_ret: movaps XMMWORD[64+rsp],xmm0 movaps XMMWORD[80+rsp],xmm0 movaps XMMWORD[96+rsp],xmm0 - lea rsp,[rbp] - pop rbp + mov rbp,QWORD[((-8))+r11] + lea rsp,[r11] $L$xts_dec_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret $L$SEH_end_aesni_xts_decrypt: +global aesni_ocb_encrypt + +ALIGN 32 +aesni_ocb_encrypt: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi 
+ mov rax,rsp +$L$SEH_begin_aesni_ocb_encrypt: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + lea rax,[rsp] + push rbx + push rbp + push r12 + push r13 + push r14 + lea rsp,[((-160))+rsp] + movaps XMMWORD[rsp],xmm6 + movaps XMMWORD[16+rsp],xmm7 + movaps XMMWORD[32+rsp],xmm8 + movaps XMMWORD[48+rsp],xmm9 + movaps XMMWORD[64+rsp],xmm10 + movaps XMMWORD[80+rsp],xmm11 + movaps XMMWORD[96+rsp],xmm12 + movaps XMMWORD[112+rsp],xmm13 + movaps XMMWORD[128+rsp],xmm14 + movaps XMMWORD[144+rsp],xmm15 +$L$ocb_enc_body: + mov rbx,QWORD[56+rax] + mov rbp,QWORD[((56+8))+rax] + + mov r10d,DWORD[240+rcx] + mov r11,rcx + shl r10d,4 + movups xmm9,XMMWORD[rcx] + movups xmm1,XMMWORD[16+r10*1+rcx] + + movdqu xmm15,XMMWORD[r9] + pxor xmm9,xmm1 + pxor xmm15,xmm1 + + mov eax,16+32 + lea rcx,[32+r10*1+r11] + movups xmm1,XMMWORD[16+r11] + sub rax,r10 + mov r10,rax + + movdqu xmm10,XMMWORD[rbx] + movdqu xmm8,XMMWORD[rbp] + + test r8,1 + jnz NEAR $L$ocb_enc_odd + + bsf r12,r8 + add r8,1 + shl r12,4 + movdqu xmm7,XMMWORD[r12*1+rbx] + movdqu xmm2,XMMWORD[rdi] + lea rdi,[16+rdi] + + call __ocb_encrypt1 + + movdqa xmm15,xmm7 + movups XMMWORD[rsi],xmm2 + lea rsi,[16+rsi] + sub rdx,1 + jz NEAR $L$ocb_enc_done + +$L$ocb_enc_odd: + lea r12,[1+r8] + lea r13,[3+r8] + lea r14,[5+r8] + lea r8,[6+r8] + bsf r12,r12 + bsf r13,r13 + bsf r14,r14 + shl r12,4 + shl r13,4 + shl r14,4 + + sub rdx,6 + jc NEAR $L$ocb_enc_short + jmp NEAR $L$ocb_enc_grandloop + +ALIGN 32 +$L$ocb_enc_grandloop: + movdqu xmm2,XMMWORD[rdi] + movdqu xmm3,XMMWORD[16+rdi] + movdqu xmm4,XMMWORD[32+rdi] + movdqu xmm5,XMMWORD[48+rdi] + movdqu xmm6,XMMWORD[64+rdi] + movdqu xmm7,XMMWORD[80+rdi] + lea rdi,[96+rdi] + + call __ocb_encrypt6 + + movups XMMWORD[rsi],xmm2 + movups XMMWORD[16+rsi],xmm3 + movups XMMWORD[32+rsi],xmm4 + movups XMMWORD[48+rsi],xmm5 + movups XMMWORD[64+rsi],xmm6 + movups XMMWORD[80+rsi],xmm7 + lea rsi,[96+rsi] + sub rdx,6 + jnc NEAR $L$ocb_enc_grandloop + +$L$ocb_enc_short: + add rdx,6 + jz NEAR $L$ocb_enc_done + + movdqu xmm2,XMMWORD[rdi] + cmp rdx,2 + jb NEAR $L$ocb_enc_one + movdqu xmm3,XMMWORD[16+rdi] + je NEAR $L$ocb_enc_two + + movdqu xmm4,XMMWORD[32+rdi] + cmp rdx,4 + jb NEAR $L$ocb_enc_three + movdqu xmm5,XMMWORD[48+rdi] + je NEAR $L$ocb_enc_four + + movdqu xmm6,XMMWORD[64+rdi] + pxor xmm7,xmm7 + + call __ocb_encrypt6 + + movdqa xmm15,xmm14 + movups XMMWORD[rsi],xmm2 + movups XMMWORD[16+rsi],xmm3 + movups XMMWORD[32+rsi],xmm4 + movups XMMWORD[48+rsi],xmm5 + movups XMMWORD[64+rsi],xmm6 + + jmp NEAR $L$ocb_enc_done + +ALIGN 16 +$L$ocb_enc_one: + movdqa xmm7,xmm10 + + call __ocb_encrypt1 + + movdqa xmm15,xmm7 + movups XMMWORD[rsi],xmm2 + jmp NEAR $L$ocb_enc_done + +ALIGN 16 +$L$ocb_enc_two: + pxor xmm4,xmm4 + pxor xmm5,xmm5 + + call __ocb_encrypt4 + + movdqa xmm15,xmm11 + movups XMMWORD[rsi],xmm2 + movups XMMWORD[16+rsi],xmm3 + + jmp NEAR $L$ocb_enc_done + +ALIGN 16 +$L$ocb_enc_three: + pxor xmm5,xmm5 + + call __ocb_encrypt4 + + movdqa xmm15,xmm12 + movups XMMWORD[rsi],xmm2 + movups XMMWORD[16+rsi],xmm3 + movups XMMWORD[32+rsi],xmm4 + + jmp NEAR $L$ocb_enc_done + +ALIGN 16 +$L$ocb_enc_four: + call __ocb_encrypt4 + + movdqa xmm15,xmm13 + movups XMMWORD[rsi],xmm2 + movups XMMWORD[16+rsi],xmm3 + movups XMMWORD[32+rsi],xmm4 + movups XMMWORD[48+rsi],xmm5 + +$L$ocb_enc_done: + pxor xmm15,xmm0 + movdqu XMMWORD[rbp],xmm8 + movdqu XMMWORD[r9],xmm15 + + xorps xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + movaps xmm6,XMMWORD[rsp] + movaps 
XMMWORD[rsp],xmm0 + movaps xmm7,XMMWORD[16+rsp] + movaps XMMWORD[16+rsp],xmm0 + movaps xmm8,XMMWORD[32+rsp] + movaps XMMWORD[32+rsp],xmm0 + movaps xmm9,XMMWORD[48+rsp] + movaps XMMWORD[48+rsp],xmm0 + movaps xmm10,XMMWORD[64+rsp] + movaps XMMWORD[64+rsp],xmm0 + movaps xmm11,XMMWORD[80+rsp] + movaps XMMWORD[80+rsp],xmm0 + movaps xmm12,XMMWORD[96+rsp] + movaps XMMWORD[96+rsp],xmm0 + movaps xmm13,XMMWORD[112+rsp] + movaps XMMWORD[112+rsp],xmm0 + movaps xmm14,XMMWORD[128+rsp] + movaps XMMWORD[128+rsp],xmm0 + movaps xmm15,XMMWORD[144+rsp] + movaps XMMWORD[144+rsp],xmm0 + lea rax,[((160+40))+rsp] +$L$ocb_enc_pop: + mov r14,QWORD[((-40))+rax] + mov r13,QWORD[((-32))+rax] + mov r12,QWORD[((-24))+rax] + mov rbp,QWORD[((-16))+rax] + mov rbx,QWORD[((-8))+rax] + lea rsp,[rax] +$L$ocb_enc_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_aesni_ocb_encrypt: + + +ALIGN 32 +__ocb_encrypt6: + pxor xmm15,xmm9 + movdqu xmm11,XMMWORD[r12*1+rbx] + movdqa xmm12,xmm10 + movdqu xmm13,XMMWORD[r13*1+rbx] + movdqa xmm14,xmm10 + pxor xmm10,xmm15 + movdqu xmm15,XMMWORD[r14*1+rbx] + pxor xmm11,xmm10 + pxor xmm8,xmm2 + pxor xmm2,xmm10 + pxor xmm12,xmm11 + pxor xmm8,xmm3 + pxor xmm3,xmm11 + pxor xmm13,xmm12 + pxor xmm8,xmm4 + pxor xmm4,xmm12 + pxor xmm14,xmm13 + pxor xmm8,xmm5 + pxor xmm5,xmm13 + pxor xmm15,xmm14 + pxor xmm8,xmm6 + pxor xmm6,xmm14 + pxor xmm8,xmm7 + pxor xmm7,xmm15 + movups xmm0,XMMWORD[32+r11] + + lea r12,[1+r8] + lea r13,[3+r8] + lea r14,[5+r8] + add r8,6 + pxor xmm10,xmm9 + bsf r12,r12 + bsf r13,r13 + bsf r14,r14 + +DB 102,15,56,220,209 +DB 102,15,56,220,217 +DB 102,15,56,220,225 +DB 102,15,56,220,233 + pxor xmm11,xmm9 + pxor xmm12,xmm9 +DB 102,15,56,220,241 + pxor xmm13,xmm9 + pxor xmm14,xmm9 +DB 102,15,56,220,249 + movups xmm1,XMMWORD[48+r11] + pxor xmm15,xmm9 + +DB 102,15,56,220,208 +DB 102,15,56,220,216 +DB 102,15,56,220,224 +DB 102,15,56,220,232 +DB 102,15,56,220,240 +DB 102,15,56,220,248 + movups xmm0,XMMWORD[64+r11] + shl r12,4 + shl r13,4 + jmp NEAR $L$ocb_enc_loop6 + +ALIGN 32 +$L$ocb_enc_loop6: +DB 102,15,56,220,209 +DB 102,15,56,220,217 +DB 102,15,56,220,225 +DB 102,15,56,220,233 +DB 102,15,56,220,241 +DB 102,15,56,220,249 + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 + +DB 102,15,56,220,208 +DB 102,15,56,220,216 +DB 102,15,56,220,224 +DB 102,15,56,220,232 +DB 102,15,56,220,240 +DB 102,15,56,220,248 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$ocb_enc_loop6 + +DB 102,15,56,220,209 +DB 102,15,56,220,217 +DB 102,15,56,220,225 +DB 102,15,56,220,233 +DB 102,15,56,220,241 +DB 102,15,56,220,249 + movups xmm1,XMMWORD[16+r11] + shl r14,4 + +DB 102,65,15,56,221,210 + movdqu xmm10,XMMWORD[rbx] + mov rax,r10 +DB 102,65,15,56,221,219 +DB 102,65,15,56,221,228 +DB 102,65,15,56,221,237 +DB 102,65,15,56,221,246 +DB 102,65,15,56,221,255 + DB 0F3h,0C3h ;repret + + + +ALIGN 32 +__ocb_encrypt4: + pxor xmm15,xmm9 + movdqu xmm11,XMMWORD[r12*1+rbx] + movdqa xmm12,xmm10 + movdqu xmm13,XMMWORD[r13*1+rbx] + pxor xmm10,xmm15 + pxor xmm11,xmm10 + pxor xmm8,xmm2 + pxor xmm2,xmm10 + pxor xmm12,xmm11 + pxor xmm8,xmm3 + pxor xmm3,xmm11 + pxor xmm13,xmm12 + pxor xmm8,xmm4 + pxor xmm4,xmm12 + pxor xmm8,xmm5 + pxor xmm5,xmm13 + movups xmm0,XMMWORD[32+r11] + + pxor xmm10,xmm9 + pxor xmm11,xmm9 + pxor xmm12,xmm9 + pxor xmm13,xmm9 + +DB 102,15,56,220,209 +DB 102,15,56,220,217 +DB 102,15,56,220,225 +DB 102,15,56,220,233 + movups xmm1,XMMWORD[48+r11] + +DB 102,15,56,220,208 +DB 102,15,56,220,216 +DB 102,15,56,220,224 +DB 102,15,56,220,232 + movups 
xmm0,XMMWORD[64+r11] + jmp NEAR $L$ocb_enc_loop4 + +ALIGN 32 +$L$ocb_enc_loop4: +DB 102,15,56,220,209 +DB 102,15,56,220,217 +DB 102,15,56,220,225 +DB 102,15,56,220,233 + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 + +DB 102,15,56,220,208 +DB 102,15,56,220,216 +DB 102,15,56,220,224 +DB 102,15,56,220,232 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$ocb_enc_loop4 + +DB 102,15,56,220,209 +DB 102,15,56,220,217 +DB 102,15,56,220,225 +DB 102,15,56,220,233 + movups xmm1,XMMWORD[16+r11] + mov rax,r10 + +DB 102,65,15,56,221,210 +DB 102,65,15,56,221,219 +DB 102,65,15,56,221,228 +DB 102,65,15,56,221,237 + DB 0F3h,0C3h ;repret + + + +ALIGN 32 +__ocb_encrypt1: + pxor xmm7,xmm15 + pxor xmm7,xmm9 + pxor xmm8,xmm2 + pxor xmm2,xmm7 + movups xmm0,XMMWORD[32+r11] + +DB 102,15,56,220,209 + movups xmm1,XMMWORD[48+r11] + pxor xmm7,xmm9 + +DB 102,15,56,220,208 + movups xmm0,XMMWORD[64+r11] + jmp NEAR $L$ocb_enc_loop1 + +ALIGN 32 +$L$ocb_enc_loop1: +DB 102,15,56,220,209 + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 + +DB 102,15,56,220,208 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$ocb_enc_loop1 + +DB 102,15,56,220,209 + movups xmm1,XMMWORD[16+r11] + mov rax,r10 + +DB 102,15,56,221,215 + DB 0F3h,0C3h ;repret + + +global aesni_ocb_decrypt + +ALIGN 32 +aesni_ocb_decrypt: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aesni_ocb_decrypt: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + lea rax,[rsp] + push rbx + push rbp + push r12 + push r13 + push r14 + lea rsp,[((-160))+rsp] + movaps XMMWORD[rsp],xmm6 + movaps XMMWORD[16+rsp],xmm7 + movaps XMMWORD[32+rsp],xmm8 + movaps XMMWORD[48+rsp],xmm9 + movaps XMMWORD[64+rsp],xmm10 + movaps XMMWORD[80+rsp],xmm11 + movaps XMMWORD[96+rsp],xmm12 + movaps XMMWORD[112+rsp],xmm13 + movaps XMMWORD[128+rsp],xmm14 + movaps XMMWORD[144+rsp],xmm15 +$L$ocb_dec_body: + mov rbx,QWORD[56+rax] + mov rbp,QWORD[((56+8))+rax] + + mov r10d,DWORD[240+rcx] + mov r11,rcx + shl r10d,4 + movups xmm9,XMMWORD[rcx] + movups xmm1,XMMWORD[16+r10*1+rcx] + + movdqu xmm15,XMMWORD[r9] + pxor xmm9,xmm1 + pxor xmm15,xmm1 + + mov eax,16+32 + lea rcx,[32+r10*1+r11] + movups xmm1,XMMWORD[16+r11] + sub rax,r10 + mov r10,rax + + movdqu xmm10,XMMWORD[rbx] + movdqu xmm8,XMMWORD[rbp] + + test r8,1 + jnz NEAR $L$ocb_dec_odd + + bsf r12,r8 + add r8,1 + shl r12,4 + movdqu xmm7,XMMWORD[r12*1+rbx] + movdqu xmm2,XMMWORD[rdi] + lea rdi,[16+rdi] + + call __ocb_decrypt1 + + movdqa xmm15,xmm7 + movups XMMWORD[rsi],xmm2 + xorps xmm8,xmm2 + lea rsi,[16+rsi] + sub rdx,1 + jz NEAR $L$ocb_dec_done + +$L$ocb_dec_odd: + lea r12,[1+r8] + lea r13,[3+r8] + lea r14,[5+r8] + lea r8,[6+r8] + bsf r12,r12 + bsf r13,r13 + bsf r14,r14 + shl r12,4 + shl r13,4 + shl r14,4 + + sub rdx,6 + jc NEAR $L$ocb_dec_short + jmp NEAR $L$ocb_dec_grandloop + +ALIGN 32 +$L$ocb_dec_grandloop: + movdqu xmm2,XMMWORD[rdi] + movdqu xmm3,XMMWORD[16+rdi] + movdqu xmm4,XMMWORD[32+rdi] + movdqu xmm5,XMMWORD[48+rdi] + movdqu xmm6,XMMWORD[64+rdi] + movdqu xmm7,XMMWORD[80+rdi] + lea rdi,[96+rdi] + + call __ocb_decrypt6 + + movups XMMWORD[rsi],xmm2 + pxor xmm8,xmm2 + movups XMMWORD[16+rsi],xmm3 + pxor xmm8,xmm3 + movups XMMWORD[32+rsi],xmm4 + pxor xmm8,xmm4 + movups XMMWORD[48+rsi],xmm5 + pxor xmm8,xmm5 + movups XMMWORD[64+rsi],xmm6 + pxor xmm8,xmm6 + movups XMMWORD[80+rsi],xmm7 + pxor xmm8,xmm7 + lea rsi,[96+rsi] + sub rdx,6 + jnc NEAR $L$ocb_dec_grandloop + +$L$ocb_dec_short: + add rdx,6 + jz NEAR $L$ocb_dec_done + + movdqu xmm2,XMMWORD[rdi] + cmp rdx,2 
+ jb NEAR $L$ocb_dec_one + movdqu xmm3,XMMWORD[16+rdi] + je NEAR $L$ocb_dec_two + + movdqu xmm4,XMMWORD[32+rdi] + cmp rdx,4 + jb NEAR $L$ocb_dec_three + movdqu xmm5,XMMWORD[48+rdi] + je NEAR $L$ocb_dec_four + + movdqu xmm6,XMMWORD[64+rdi] + pxor xmm7,xmm7 + + call __ocb_decrypt6 + + movdqa xmm15,xmm14 + movups XMMWORD[rsi],xmm2 + pxor xmm8,xmm2 + movups XMMWORD[16+rsi],xmm3 + pxor xmm8,xmm3 + movups XMMWORD[32+rsi],xmm4 + pxor xmm8,xmm4 + movups XMMWORD[48+rsi],xmm5 + pxor xmm8,xmm5 + movups XMMWORD[64+rsi],xmm6 + pxor xmm8,xmm6 + + jmp NEAR $L$ocb_dec_done + +ALIGN 16 +$L$ocb_dec_one: + movdqa xmm7,xmm10 + + call __ocb_decrypt1 + + movdqa xmm15,xmm7 + movups XMMWORD[rsi],xmm2 + xorps xmm8,xmm2 + jmp NEAR $L$ocb_dec_done + +ALIGN 16 +$L$ocb_dec_two: + pxor xmm4,xmm4 + pxor xmm5,xmm5 + + call __ocb_decrypt4 + + movdqa xmm15,xmm11 + movups XMMWORD[rsi],xmm2 + xorps xmm8,xmm2 + movups XMMWORD[16+rsi],xmm3 + xorps xmm8,xmm3 + + jmp NEAR $L$ocb_dec_done + +ALIGN 16 +$L$ocb_dec_three: + pxor xmm5,xmm5 + + call __ocb_decrypt4 + + movdqa xmm15,xmm12 + movups XMMWORD[rsi],xmm2 + xorps xmm8,xmm2 + movups XMMWORD[16+rsi],xmm3 + xorps xmm8,xmm3 + movups XMMWORD[32+rsi],xmm4 + xorps xmm8,xmm4 + + jmp NEAR $L$ocb_dec_done + +ALIGN 16 +$L$ocb_dec_four: + call __ocb_decrypt4 + + movdqa xmm15,xmm13 + movups XMMWORD[rsi],xmm2 + pxor xmm8,xmm2 + movups XMMWORD[16+rsi],xmm3 + pxor xmm8,xmm3 + movups XMMWORD[32+rsi],xmm4 + pxor xmm8,xmm4 + movups XMMWORD[48+rsi],xmm5 + pxor xmm8,xmm5 + +$L$ocb_dec_done: + pxor xmm15,xmm0 + movdqu XMMWORD[rbp],xmm8 + movdqu XMMWORD[r9],xmm15 + + xorps xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + movaps xmm6,XMMWORD[rsp] + movaps XMMWORD[rsp],xmm0 + movaps xmm7,XMMWORD[16+rsp] + movaps XMMWORD[16+rsp],xmm0 + movaps xmm8,XMMWORD[32+rsp] + movaps XMMWORD[32+rsp],xmm0 + movaps xmm9,XMMWORD[48+rsp] + movaps XMMWORD[48+rsp],xmm0 + movaps xmm10,XMMWORD[64+rsp] + movaps XMMWORD[64+rsp],xmm0 + movaps xmm11,XMMWORD[80+rsp] + movaps XMMWORD[80+rsp],xmm0 + movaps xmm12,XMMWORD[96+rsp] + movaps XMMWORD[96+rsp],xmm0 + movaps xmm13,XMMWORD[112+rsp] + movaps XMMWORD[112+rsp],xmm0 + movaps xmm14,XMMWORD[128+rsp] + movaps XMMWORD[128+rsp],xmm0 + movaps xmm15,XMMWORD[144+rsp] + movaps XMMWORD[144+rsp],xmm0 + lea rax,[((160+40))+rsp] +$L$ocb_dec_pop: + mov r14,QWORD[((-40))+rax] + mov r13,QWORD[((-32))+rax] + mov r12,QWORD[((-24))+rax] + mov rbp,QWORD[((-16))+rax] + mov rbx,QWORD[((-8))+rax] + lea rsp,[rax] +$L$ocb_dec_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret +$L$SEH_end_aesni_ocb_decrypt: + + +ALIGN 32 +__ocb_decrypt6: + pxor xmm15,xmm9 + movdqu xmm11,XMMWORD[r12*1+rbx] + movdqa xmm12,xmm10 + movdqu xmm13,XMMWORD[r13*1+rbx] + movdqa xmm14,xmm10 + pxor xmm10,xmm15 + movdqu xmm15,XMMWORD[r14*1+rbx] + pxor xmm11,xmm10 + pxor xmm2,xmm10 + pxor xmm12,xmm11 + pxor xmm3,xmm11 + pxor xmm13,xmm12 + pxor xmm4,xmm12 + pxor xmm14,xmm13 + pxor xmm5,xmm13 + pxor xmm15,xmm14 + pxor xmm6,xmm14 + pxor xmm7,xmm15 + movups xmm0,XMMWORD[32+r11] + + lea r12,[1+r8] + lea r13,[3+r8] + lea r14,[5+r8] + add r8,6 + pxor xmm10,xmm9 + bsf r12,r12 + bsf r13,r13 + bsf r14,r14 + +DB 102,15,56,222,209 +DB 102,15,56,222,217 +DB 102,15,56,222,225 +DB 102,15,56,222,233 + pxor xmm11,xmm9 + pxor xmm12,xmm9 +DB 102,15,56,222,241 + pxor xmm13,xmm9 + pxor xmm14,xmm9 +DB 102,15,56,222,249 + movups xmm1,XMMWORD[48+r11] + pxor xmm15,xmm9 + +DB 102,15,56,222,208 +DB 102,15,56,222,216 +DB 102,15,56,222,224 +DB 102,15,56,222,232 +DB 
102,15,56,222,240 +DB 102,15,56,222,248 + movups xmm0,XMMWORD[64+r11] + shl r12,4 + shl r13,4 + jmp NEAR $L$ocb_dec_loop6 + +ALIGN 32 +$L$ocb_dec_loop6: +DB 102,15,56,222,209 +DB 102,15,56,222,217 +DB 102,15,56,222,225 +DB 102,15,56,222,233 +DB 102,15,56,222,241 +DB 102,15,56,222,249 + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 + +DB 102,15,56,222,208 +DB 102,15,56,222,216 +DB 102,15,56,222,224 +DB 102,15,56,222,232 +DB 102,15,56,222,240 +DB 102,15,56,222,248 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$ocb_dec_loop6 + +DB 102,15,56,222,209 +DB 102,15,56,222,217 +DB 102,15,56,222,225 +DB 102,15,56,222,233 +DB 102,15,56,222,241 +DB 102,15,56,222,249 + movups xmm1,XMMWORD[16+r11] + shl r14,4 + +DB 102,65,15,56,223,210 + movdqu xmm10,XMMWORD[rbx] + mov rax,r10 +DB 102,65,15,56,223,219 +DB 102,65,15,56,223,228 +DB 102,65,15,56,223,237 +DB 102,65,15,56,223,246 +DB 102,65,15,56,223,255 + DB 0F3h,0C3h ;repret + + + +ALIGN 32 +__ocb_decrypt4: + pxor xmm15,xmm9 + movdqu xmm11,XMMWORD[r12*1+rbx] + movdqa xmm12,xmm10 + movdqu xmm13,XMMWORD[r13*1+rbx] + pxor xmm10,xmm15 + pxor xmm11,xmm10 + pxor xmm2,xmm10 + pxor xmm12,xmm11 + pxor xmm3,xmm11 + pxor xmm13,xmm12 + pxor xmm4,xmm12 + pxor xmm5,xmm13 + movups xmm0,XMMWORD[32+r11] + + pxor xmm10,xmm9 + pxor xmm11,xmm9 + pxor xmm12,xmm9 + pxor xmm13,xmm9 + +DB 102,15,56,222,209 +DB 102,15,56,222,217 +DB 102,15,56,222,225 +DB 102,15,56,222,233 + movups xmm1,XMMWORD[48+r11] + +DB 102,15,56,222,208 +DB 102,15,56,222,216 +DB 102,15,56,222,224 +DB 102,15,56,222,232 + movups xmm0,XMMWORD[64+r11] + jmp NEAR $L$ocb_dec_loop4 + +ALIGN 32 +$L$ocb_dec_loop4: +DB 102,15,56,222,209 +DB 102,15,56,222,217 +DB 102,15,56,222,225 +DB 102,15,56,222,233 + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 + +DB 102,15,56,222,208 +DB 102,15,56,222,216 +DB 102,15,56,222,224 +DB 102,15,56,222,232 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$ocb_dec_loop4 + +DB 102,15,56,222,209 +DB 102,15,56,222,217 +DB 102,15,56,222,225 +DB 102,15,56,222,233 + movups xmm1,XMMWORD[16+r11] + mov rax,r10 + +DB 102,65,15,56,223,210 +DB 102,65,15,56,223,219 +DB 102,65,15,56,223,228 +DB 102,65,15,56,223,237 + DB 0F3h,0C3h ;repret + + + +ALIGN 32 +__ocb_decrypt1: + pxor xmm7,xmm15 + pxor xmm7,xmm9 + pxor xmm2,xmm7 + movups xmm0,XMMWORD[32+r11] + +DB 102,15,56,222,209 + movups xmm1,XMMWORD[48+r11] + pxor xmm7,xmm9 + +DB 102,15,56,222,208 + movups xmm0,XMMWORD[64+r11] + jmp NEAR $L$ocb_dec_loop1 + +ALIGN 32 +$L$ocb_dec_loop1: +DB 102,15,56,222,209 + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 + +DB 102,15,56,222,208 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$ocb_dec_loop1 + +DB 102,15,56,222,209 + movups xmm1,XMMWORD[16+r11] + mov rax,r10 + +DB 102,15,56,223,215 + DB 0F3h,0C3h ;repret + global aesni_cbc_encrypt ALIGN 16 @@ -2837,7 +3723,7 @@ DB 102,15,56,223,209 jmp NEAR $L$cbc_ret ALIGN 16 $L$cbc_decrypt_bulk: - lea rax,[rsp] + lea r11,[rsp] push rbp sub rsp,176 and rsp,-16 @@ -2852,7 +3738,7 @@ $L$cbc_decrypt_bulk: movaps XMMWORD[144+rsp],xmm14 movaps XMMWORD[160+rsp],xmm15 $L$cbc_decrypt_body: - lea rbp,[((-8))+rax] + mov rbp,rcx movups xmm10,XMMWORD[r8] mov eax,r10d cmp rdx,0x50 @@ -2870,7 +3756,8 @@ $L$cbc_decrypt_body: movdqa xmm14,xmm5 movdqu xmm7,XMMWORD[80+rdi] movdqa xmm15,xmm6 - mov r9d,DWORD[((OPENSSL_ia32cap_P+4))] + lea r9,[OPENSSL_ia32cap_P] + mov r9d,DWORD[4+r9] cmp rdx,0x70 jbe NEAR $L$cbc_dec_six_or_seven @@ -2892,7 +3779,7 @@ $L$cbc_dec_loop8_enter: pxor xmm3,xmm0 movups xmm1,XMMWORD[((16-112))+rcx] pxor xmm4,xmm0 - xor r11,r11 + mov rbp,-1 cmp rdx,0x70 pxor 
xmm5,xmm0 pxor xmm6,xmm0 @@ -2908,10 +3795,10 @@ DB 102,15,56,222,233 DB 102,15,56,222,241 DB 102,15,56,222,249 DB 102,68,15,56,222,193 - setnc r11b - shl r11,7 + adc rbp,0 + and rbp,128 DB 102,68,15,56,222,201 - add r11,rdi + add rbp,rdi movups xmm1,XMMWORD[((48-112))+rcx] DB 102,15,56,222,208 DB 102,15,56,222,216 @@ -3049,18 +3936,18 @@ DB 102,65,15,56,223,219 movdqu xmm0,XMMWORD[112+rdi] DB 102,65,15,56,223,228 lea rdi,[128+rdi] - movdqu xmm11,XMMWORD[r11] + movdqu xmm11,XMMWORD[rbp] DB 102,65,15,56,223,237 DB 102,65,15,56,223,246 - movdqu xmm12,XMMWORD[16+r11] - movdqu xmm13,XMMWORD[32+r11] + movdqu xmm12,XMMWORD[16+rbp] + movdqu xmm13,XMMWORD[32+rbp] DB 102,65,15,56,223,255 DB 102,68,15,56,223,193 - movdqu xmm14,XMMWORD[48+r11] - movdqu xmm15,XMMWORD[64+r11] + movdqu xmm14,XMMWORD[48+rbp] + movdqu xmm15,XMMWORD[64+rbp] DB 102,69,15,56,223,202 movdqa xmm10,xmm0 - movdqu xmm1,XMMWORD[80+r11] + movdqu xmm1,XMMWORD[80+rbp] movups xmm0,XMMWORD[((-112))+rcx] movups XMMWORD[rsi],xmm2 @@ -3179,7 +4066,7 @@ $L$cbc_dec_loop6_enter: pxor xmm5,xmm13 movdqu XMMWORD[32+rsi],xmm4 pxor xmm6,xmm14 - mov rcx,r11 + mov rcx,rbp movdqu XMMWORD[48+rsi],xmm5 pxor xmm7,xmm15 mov eax,r10d @@ -3348,8 +4235,8 @@ $L$cbc_dec_ret: movaps XMMWORD[144+rsp],xmm0 movaps xmm15,XMMWORD[160+rsp] movaps XMMWORD[160+rsp],xmm0 - lea rsp,[rbp] - pop rbp + mov rbp,QWORD[((-8))+r11] + lea rsp,[r11] $L$cbc_ret: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] @@ -3407,10 +4294,11 @@ DB 0x48,0x83,0xEC,0x08 test r8,r8 jz NEAR $L$enc_key_ret - mov r10d,268437504 movups xmm0,XMMWORD[rcx] xorps xmm4,xmm4 - and r10d,DWORD[((OPENSSL_ia32cap_P+4))] + lea r10,[OPENSSL_ia32cap_P] + mov r10d,DWORD[4+r10] + and r10d,268437504 lea rax,[16+r8] cmp edx,256 je NEAR $L$14rounds @@ -3865,13 +4753,75 @@ ctr_xts_se_handler: cmp rbx,r10 jae NEAR $L$common_seh_tail - mov rax,QWORD[160+r8] - lea rsi,[((-160))+rax] + mov rax,QWORD[208+r8] + + lea rsi,[((-168))+rax] lea rdi,[512+r8] mov ecx,20 DD 0xa548f3fc - jmp NEAR $L$common_rbp_tail + mov rbp,QWORD[((-8))+rax] + mov QWORD[160+r8],rbp + jmp NEAR $L$common_seh_tail + + + +ALIGN 16 +ocb_se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + mov r10d,DWORD[8+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$ocb_no_xmm + + mov rax,QWORD[152+r8] + + lea rsi,[rax] + lea rdi,[512+r8] + mov ecx,20 + DD 0xa548f3fc + lea rax,[((160+40))+rax] + +$L$ocb_no_xmm: + mov rbx,QWORD[((-8))+rax] + mov rbp,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + + jmp NEAR $L$common_seh_tail ALIGN 16 @@ -3894,9 +4844,13 @@ cbc_se_handler: cmp rbx,r10 jb NEAR $L$common_seh_tail + mov rax,QWORD[120+r8] + lea r10,[$L$cbc_decrypt_body] cmp rbx,r10 - jb NEAR $L$restore_cbc_rax + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] lea r10,[$L$cbc_ret] cmp rbx,r10 @@ -3907,15 +4861,10 @@ cbc_se_handler: mov ecx,20 DD 0xa548f3fc -$L$common_rbp_tail: - mov rax,QWORD[160+r8] - mov rbp,QWORD[rax] - lea rax,[8+rax] - mov QWORD[160+r8],rbp - jmp NEAR $L$common_seh_tail + mov rax,QWORD[208+r8] 
-$L$restore_cbc_rax: - mov rax,QWORD[120+r8] + mov rbp,QWORD[((-8))+rax] + mov QWORD[160+r8],rbp $L$common_seh_tail: mov rdi,QWORD[8+rax] @@ -3982,6 +4931,14 @@ ALIGN 4 DD $L$SEH_begin_aesni_xts_decrypt wrt ..imagebase DD $L$SEH_end_aesni_xts_decrypt wrt ..imagebase DD $L$SEH_info_xts_dec wrt ..imagebase + + DD $L$SEH_begin_aesni_ocb_encrypt wrt ..imagebase + DD $L$SEH_end_aesni_ocb_encrypt wrt ..imagebase + DD $L$SEH_info_ocb_enc wrt ..imagebase + + DD $L$SEH_begin_aesni_ocb_decrypt wrt ..imagebase + DD $L$SEH_end_aesni_ocb_decrypt wrt ..imagebase + DD $L$SEH_info_ocb_dec wrt ..imagebase DD $L$SEH_begin_aesni_cbc_encrypt wrt ..imagebase DD $L$SEH_end_aesni_cbc_encrypt wrt ..imagebase DD $L$SEH_info_cbc wrt ..imagebase @@ -4019,6 +4976,18 @@ $L$SEH_info_xts_dec: DB 9,0,0,0 DD ctr_xts_se_handler wrt ..imagebase DD $L$xts_dec_body wrt ..imagebase,$L$xts_dec_epilogue wrt ..imagebase +$L$SEH_info_ocb_enc: +DB 9,0,0,0 + DD ocb_se_handler wrt ..imagebase + DD $L$ocb_enc_body wrt ..imagebase,$L$ocb_enc_epilogue wrt ..imagebase + DD $L$ocb_enc_pop wrt ..imagebase + DD 0 +$L$SEH_info_ocb_dec: +DB 9,0,0,0 + DD ocb_se_handler wrt ..imagebase + DD $L$ocb_dec_body wrt ..imagebase,$L$ocb_dec_epilogue wrt ..imagebase + DD $L$ocb_dec_pop wrt ..imagebase + DD 0 $L$SEH_info_cbc: DB 9,0,0,0 DD cbc_se_handler wrt ..imagebase diff --git a/packager/third_party/boringssl/win-x86_64/crypto/aes/bsaes-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm similarity index 97% rename from packager/third_party/boringssl/win-x86_64/crypto/aes/bsaes-x86_64.asm rename to packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm index 6d75248d1f..9c6d129369 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/aes/bsaes-x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/bsaes-x86_64.asm @@ -1319,7 +1319,7 @@ $L$cbc_dec_bzero: cmp rbp,rax ja NEAR $L$cbc_dec_bzero - lea rsp,[rbp] + lea rax,[120+rbp] movaps xmm6,XMMWORD[64+rbp] movaps xmm7,XMMWORD[80+rbp] movaps xmm8,XMMWORD[96+rbp] @@ -1330,15 +1330,15 @@ $L$cbc_dec_bzero: movaps xmm13,XMMWORD[176+rbp] movaps xmm14,XMMWORD[192+rbp] movaps xmm15,XMMWORD[208+rbp] - lea rsp,[160+rbp] - mov r15,QWORD[72+rsp] - mov r14,QWORD[80+rsp] - mov r13,QWORD[88+rsp] - mov r12,QWORD[96+rsp] - mov rbx,QWORD[104+rsp] - mov rax,QWORD[112+rsp] - lea rsp,[120+rsp] - mov rbp,rax + lea rax,[160+rax] +$L$cbc_dec_tail: + mov r15,QWORD[((-48))+rax] + mov r14,QWORD[((-40))+rax] + mov r13,QWORD[((-32))+rax] + mov r12,QWORD[((-24))+rax] + mov rbx,QWORD[((-16))+rax] + mov rbp,QWORD[((-8))+rax] + lea rsp,[rax] $L$cbc_dec_epilogue: DB 0F3h,0C3h ;repret @@ -1543,7 +1543,7 @@ $L$ctr_enc_bzero: cmp rbp,rax ja NEAR $L$ctr_enc_bzero - lea rsp,[rbp] + lea rax,[120+rbp] movaps xmm6,XMMWORD[64+rbp] movaps xmm7,XMMWORD[80+rbp] movaps xmm8,XMMWORD[96+rbp] @@ -1554,15 +1554,15 @@ $L$ctr_enc_bzero: movaps xmm13,XMMWORD[176+rbp] movaps xmm14,XMMWORD[192+rbp] movaps xmm15,XMMWORD[208+rbp] - lea rsp,[160+rbp] - mov r15,QWORD[72+rsp] - mov r14,QWORD[80+rsp] - mov r13,QWORD[88+rsp] - mov r12,QWORD[96+rsp] - mov rbx,QWORD[104+rsp] - mov rax,QWORD[112+rsp] - lea rsp,[120+rsp] - mov rbp,rax + lea rax,[160+rax] +$L$ctr_enc_tail: + mov r15,QWORD[((-48))+rax] + mov r14,QWORD[((-40))+rax] + mov r13,QWORD[((-32))+rax] + mov r12,QWORD[((-24))+rax] + mov rbx,QWORD[((-16))+rax] + mov rbp,QWORD[((-8))+rax] + lea rsp,[rax] $L$ctr_enc_epilogue: DB 0F3h,0C3h ;repret @@ -2019,7 +2019,7 @@ $L$xts_enc_bzero: cmp rbp,rax ja NEAR $L$xts_enc_bzero - lea 
rsp,[rbp] + lea rax,[120+rbp] movaps xmm6,XMMWORD[64+rbp] movaps xmm7,XMMWORD[80+rbp] movaps xmm8,XMMWORD[96+rbp] @@ -2030,15 +2030,15 @@ $L$xts_enc_bzero: movaps xmm13,XMMWORD[176+rbp] movaps xmm14,XMMWORD[192+rbp] movaps xmm15,XMMWORD[208+rbp] - lea rsp,[160+rbp] - mov r15,QWORD[72+rsp] - mov r14,QWORD[80+rsp] - mov r13,QWORD[88+rsp] - mov r12,QWORD[96+rsp] - mov rbx,QWORD[104+rsp] - mov rax,QWORD[112+rsp] - lea rsp,[120+rsp] - mov rbp,rax + lea rax,[160+rax] +$L$xts_enc_tail: + mov r15,QWORD[((-48))+rax] + mov r14,QWORD[((-40))+rax] + mov r13,QWORD[((-32))+rax] + mov r12,QWORD[((-24))+rax] + mov rbx,QWORD[((-16))+rax] + mov rbp,QWORD[((-8))+rax] + lea rsp,[rax] $L$xts_enc_epilogue: DB 0F3h,0C3h ;repret @@ -2522,7 +2522,7 @@ $L$xts_dec_bzero: cmp rbp,rax ja NEAR $L$xts_dec_bzero - lea rsp,[rbp] + lea rax,[120+rbp] movaps xmm6,XMMWORD[64+rbp] movaps xmm7,XMMWORD[80+rbp] movaps xmm8,XMMWORD[96+rbp] @@ -2533,15 +2533,15 @@ $L$xts_dec_bzero: movaps xmm13,XMMWORD[176+rbp] movaps xmm14,XMMWORD[192+rbp] movaps xmm15,XMMWORD[208+rbp] - lea rsp,[160+rbp] - mov r15,QWORD[72+rsp] - mov r14,QWORD[80+rsp] - mov r13,QWORD[88+rsp] - mov r12,QWORD[96+rsp] - mov rbx,QWORD[104+rsp] - mov rax,QWORD[112+rsp] - lea rsp,[120+rsp] - mov rbp,rax + lea rax,[160+rax] +$L$xts_dec_tail: + mov r15,QWORD[((-48))+rax] + mov r14,QWORD[((-40))+rax] + mov r13,QWORD[((-32))+rax] + mov r12,QWORD[((-24))+rax] + mov rbx,QWORD[((-16))+rax] + mov rbp,QWORD[((-8))+rax] + lea rsp,[rax] $L$xts_dec_epilogue: DB 0F3h,0C3h ;repret @@ -2628,30 +2628,33 @@ se_handler: mov r10d,DWORD[r11] lea r10,[r10*1+rsi] cmp rbx,r10 - jb NEAR $L$in_prologue - - mov rax,QWORD[152+r8] + jbe NEAR $L$in_prologue mov r10d,DWORD[4+r11] lea r10,[r10*1+rsi] cmp rbx,r10 jae NEAR $L$in_prologue + mov r10d,DWORD[8+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$in_tail + mov rax,QWORD[160+r8] lea rsi,[64+rax] lea rdi,[512+r8] mov ecx,20 DD 0xa548f3fc - lea rax,[160+rax] + lea rax,[((160+120))+rax] - mov rbp,QWORD[112+rax] - mov rbx,QWORD[104+rax] - mov r12,QWORD[96+rax] - mov r13,QWORD[88+rax] - mov r14,QWORD[80+rax] - mov r15,QWORD[72+rax] - lea rax,[120+rax] +$L$in_tail: + mov rbp,QWORD[((-48))+rax] + mov rbx,QWORD[((-40))+rax] + mov r12,QWORD[((-32))+rax] + mov r13,QWORD[((-24))+rax] + mov r14,QWORD[((-16))+rax] + mov r15,QWORD[((-8))+rax] mov QWORD[144+r8],rbx mov QWORD[160+r8],rbp mov QWORD[216+r8],r12 @@ -2719,15 +2722,23 @@ $L$cbc_dec_info: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$cbc_dec_body wrt ..imagebase,$L$cbc_dec_epilogue wrt ..imagebase + DD $L$cbc_dec_tail wrt ..imagebase + DD 0 $L$ctr_enc_info: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$ctr_enc_body wrt ..imagebase,$L$ctr_enc_epilogue wrt ..imagebase + DD $L$ctr_enc_tail wrt ..imagebase + DD 0 $L$xts_enc_info: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$xts_enc_body wrt ..imagebase,$L$xts_enc_epilogue wrt ..imagebase + DD $L$xts_enc_tail wrt ..imagebase + DD 0 $L$xts_dec_info: DB 9,0,0,0 DD se_handler wrt ..imagebase DD $L$xts_dec_body wrt ..imagebase,$L$xts_dec_epilogue wrt ..imagebase + DD $L$xts_dec_tail wrt ..imagebase + DD 0 diff --git a/packager/third_party/boringssl/win-x86_64/crypto/modes/ghash-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/ghash-x86_64.asm similarity index 67% rename from packager/third_party/boringssl/win-x86_64/crypto/modes/ghash-x86_64.asm rename to packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/ghash-x86_64.asm index 5d8fadc033..8ef16f513d 100644 --- 
a/packager/third_party/boringssl/win-x86_64/crypto/modes/ghash-x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/ghash-x86_64.asm @@ -21,6 +21,10 @@ $L$SEH_begin_gcm_gmult_4bit: push rbx push rbp push r12 + push r13 + push r14 + push r15 + sub rsp,280 $L$gmult_prologue: movzx r8,BYTE[15+rdi] @@ -97,8 +101,9 @@ $L$break1: mov QWORD[8+rdi],r8 mov QWORD[rdi],r9 - mov rbx,QWORD[16+rsp] - lea rsp,[24+rsp] + lea rsi,[((280+48))+rsp] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] $L$gmult_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] @@ -669,14 +674,14 @@ $L$outer_loop: mov QWORD[8+rdi],r8 mov QWORD[rdi],r9 - lea rsi,[280+rsp] - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + lea rsi,[((280+48))+rsp] + mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] $L$ghash_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] @@ -926,7 +931,8 @@ DB 102,65,15,56,0,194 jz NEAR $L$odd_tail movdqu xmm6,XMMWORD[16+rdx] - mov eax,DWORD[((OPENSSL_ia32cap_P+4))] + lea rax,[OPENSSL_ia32cap_P] + mov eax,DWORD[4+rax] cmp r9,0x30 jb NEAR $L$skip4x @@ -1309,7 +1315,115 @@ global gcm_init_avx ALIGN 32 gcm_init_avx: - jmp NEAR $L$_init_clmul +$L$SEH_begin_gcm_init_avx: + +DB 0x48,0x83,0xec,0x18 +DB 0x0f,0x29,0x34,0x24 + vzeroupper + + vmovdqu xmm2,XMMWORD[rdx] + vpshufd xmm2,xmm2,78 + + + vpshufd xmm4,xmm2,255 + vpsrlq xmm3,xmm2,63 + vpsllq xmm2,xmm2,1 + vpxor xmm5,xmm5,xmm5 + vpcmpgtd xmm5,xmm5,xmm4 + vpslldq xmm3,xmm3,8 + vpor xmm2,xmm2,xmm3 + + + vpand xmm5,xmm5,XMMWORD[$L$0x1c2_polynomial] + vpxor xmm2,xmm2,xmm5 + + vpunpckhqdq xmm6,xmm2,xmm2 + vmovdqa xmm0,xmm2 + vpxor xmm6,xmm6,xmm2 + mov r10,4 + jmp NEAR $L$init_start_avx +ALIGN 32 +$L$init_loop_avx: + vpalignr xmm5,xmm4,xmm3,8 + vmovdqu XMMWORD[(-16)+rcx],xmm5 + vpunpckhqdq xmm3,xmm0,xmm0 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm1,xmm0,xmm2,0x11 + vpclmulqdq xmm0,xmm0,xmm2,0x00 + vpclmulqdq xmm3,xmm3,xmm6,0x00 + vpxor xmm4,xmm1,xmm0 + vpxor xmm3,xmm3,xmm4 + + vpslldq xmm4,xmm3,8 + vpsrldq xmm3,xmm3,8 + vpxor xmm0,xmm0,xmm4 + vpxor xmm1,xmm1,xmm3 + vpsllq xmm3,xmm0,57 + vpsllq xmm4,xmm0,62 + vpxor xmm4,xmm4,xmm3 + vpsllq xmm3,xmm0,63 + vpxor xmm4,xmm4,xmm3 + vpslldq xmm3,xmm4,8 + vpsrldq xmm4,xmm4,8 + vpxor xmm0,xmm0,xmm3 + vpxor xmm1,xmm1,xmm4 + + vpsrlq xmm4,xmm0,1 + vpxor xmm1,xmm1,xmm0 + vpxor xmm0,xmm0,xmm4 + vpsrlq xmm4,xmm4,5 + vpxor xmm0,xmm0,xmm4 + vpsrlq xmm0,xmm0,1 + vpxor xmm0,xmm0,xmm1 +$L$init_start_avx: + vmovdqa xmm5,xmm0 + vpunpckhqdq xmm3,xmm0,xmm0 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm1,xmm0,xmm2,0x11 + vpclmulqdq xmm0,xmm0,xmm2,0x00 + vpclmulqdq xmm3,xmm3,xmm6,0x00 + vpxor xmm4,xmm1,xmm0 + vpxor xmm3,xmm3,xmm4 + + vpslldq xmm4,xmm3,8 + vpsrldq xmm3,xmm3,8 + vpxor xmm0,xmm0,xmm4 + vpxor xmm1,xmm1,xmm3 + vpsllq xmm3,xmm0,57 + vpsllq xmm4,xmm0,62 + vpxor xmm4,xmm4,xmm3 + vpsllq xmm3,xmm0,63 + vpxor xmm4,xmm4,xmm3 + vpslldq xmm3,xmm4,8 + vpsrldq xmm4,xmm4,8 + vpxor xmm0,xmm0,xmm3 + vpxor xmm1,xmm1,xmm4 + + vpsrlq xmm4,xmm0,1 + vpxor xmm1,xmm1,xmm0 + vpxor xmm0,xmm0,xmm4 + vpsrlq xmm4,xmm4,5 + vpxor xmm0,xmm0,xmm4 + vpsrlq xmm0,xmm0,1 + vpxor xmm0,xmm0,xmm1 + vpshufd xmm3,xmm5,78 + vpshufd xmm4,xmm0,78 + vpxor xmm3,xmm3,xmm5 + vmovdqu XMMWORD[rcx],xmm5 + vpxor xmm4,xmm4,xmm0 + vmovdqu XMMWORD[16+rcx],xmm0 + lea rcx,[48+rcx] + sub r10,1 + jnz NEAR 
$L$init_loop_avx + + vpalignr xmm5,xmm3,xmm4,8 + vmovdqu XMMWORD[(-16)+rcx],xmm5 + + vzeroupper + movaps xmm6,XMMWORD[rsp] + lea rsp,[24+rsp] +$L$SEH_end_gcm_init_avx: + DB 0F3h,0C3h ;repret global gcm_gmult_avx @@ -1321,7 +1435,403 @@ global gcm_ghash_avx ALIGN 32 gcm_ghash_avx: - jmp NEAR $L$_ghash_clmul + lea rax,[((-136))+rsp] +$L$SEH_begin_gcm_ghash_avx: + +DB 0x48,0x8d,0x60,0xe0 +DB 0x0f,0x29,0x70,0xe0 +DB 0x0f,0x29,0x78,0xf0 +DB 0x44,0x0f,0x29,0x00 +DB 0x44,0x0f,0x29,0x48,0x10 +DB 0x44,0x0f,0x29,0x50,0x20 +DB 0x44,0x0f,0x29,0x58,0x30 +DB 0x44,0x0f,0x29,0x60,0x40 +DB 0x44,0x0f,0x29,0x68,0x50 +DB 0x44,0x0f,0x29,0x70,0x60 +DB 0x44,0x0f,0x29,0x78,0x70 + vzeroupper + + vmovdqu xmm10,XMMWORD[rcx] + lea r10,[$L$0x1c2_polynomial] + lea rdx,[64+rdx] + vmovdqu xmm13,XMMWORD[$L$bswap_mask] + vpshufb xmm10,xmm10,xmm13 + cmp r9,0x80 + jb NEAR $L$short_avx + sub r9,0x80 + + vmovdqu xmm14,XMMWORD[112+r8] + vmovdqu xmm6,XMMWORD[((0-64))+rdx] + vpshufb xmm14,xmm14,xmm13 + vmovdqu xmm7,XMMWORD[((32-64))+rdx] + + vpunpckhqdq xmm9,xmm14,xmm14 + vmovdqu xmm15,XMMWORD[96+r8] + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpxor xmm9,xmm9,xmm14 + vpshufb xmm15,xmm15,xmm13 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((16-64))+rdx] + vpunpckhqdq xmm8,xmm15,xmm15 + vmovdqu xmm14,XMMWORD[80+r8] + vpclmulqdq xmm2,xmm9,xmm7,0x00 + vpxor xmm8,xmm8,xmm15 + + vpshufb xmm14,xmm14,xmm13 + vpclmulqdq xmm3,xmm15,xmm6,0x00 + vpunpckhqdq xmm9,xmm14,xmm14 + vpclmulqdq xmm4,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((48-64))+rdx] + vpxor xmm9,xmm9,xmm14 + vmovdqu xmm15,XMMWORD[64+r8] + vpclmulqdq xmm5,xmm8,xmm7,0x10 + vmovdqu xmm7,XMMWORD[((80-64))+rdx] + + vpshufb xmm15,xmm15,xmm13 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpxor xmm4,xmm4,xmm1 + vpunpckhqdq xmm8,xmm15,xmm15 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((64-64))+rdx] + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm9,xmm7,0x00 + vpxor xmm8,xmm8,xmm15 + + vmovdqu xmm14,XMMWORD[48+r8] + vpxor xmm0,xmm0,xmm3 + vpclmulqdq xmm3,xmm15,xmm6,0x00 + vpxor xmm1,xmm1,xmm4 + vpshufb xmm14,xmm14,xmm13 + vpclmulqdq xmm4,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((96-64))+rdx] + vpxor xmm2,xmm2,xmm5 + vpunpckhqdq xmm9,xmm14,xmm14 + vpclmulqdq xmm5,xmm8,xmm7,0x10 + vmovdqu xmm7,XMMWORD[((128-64))+rdx] + vpxor xmm9,xmm9,xmm14 + + vmovdqu xmm15,XMMWORD[32+r8] + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpxor xmm4,xmm4,xmm1 + vpshufb xmm15,xmm15,xmm13 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((112-64))+rdx] + vpxor xmm5,xmm5,xmm2 + vpunpckhqdq xmm8,xmm15,xmm15 + vpclmulqdq xmm2,xmm9,xmm7,0x00 + vpxor xmm8,xmm8,xmm15 + + vmovdqu xmm14,XMMWORD[16+r8] + vpxor xmm0,xmm0,xmm3 + vpclmulqdq xmm3,xmm15,xmm6,0x00 + vpxor xmm1,xmm1,xmm4 + vpshufb xmm14,xmm14,xmm13 + vpclmulqdq xmm4,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((144-64))+rdx] + vpxor xmm2,xmm2,xmm5 + vpunpckhqdq xmm9,xmm14,xmm14 + vpclmulqdq xmm5,xmm8,xmm7,0x10 + vmovdqu xmm7,XMMWORD[((176-64))+rdx] + vpxor xmm9,xmm9,xmm14 + + vmovdqu xmm15,XMMWORD[r8] + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpxor xmm4,xmm4,xmm1 + vpshufb xmm15,xmm15,xmm13 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((160-64))+rdx] + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm9,xmm7,0x10 + + lea r8,[128+r8] + cmp r9,0x80 + jb NEAR $L$tail_avx + + vpxor xmm15,xmm15,xmm10 + sub r9,0x80 + jmp NEAR $L$oop8x_avx + +ALIGN 32 +$L$oop8x_avx: + vpunpckhqdq xmm8,xmm15,xmm15 + vmovdqu xmm14,XMMWORD[112+r8] + vpxor xmm3,xmm3,xmm0 + vpxor xmm8,xmm8,xmm15 + vpclmulqdq xmm10,xmm15,xmm6,0x00 + 
vpshufb xmm14,xmm14,xmm13 + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm11,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((0-64))+rdx] + vpunpckhqdq xmm9,xmm14,xmm14 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm12,xmm8,xmm7,0x00 + vmovdqu xmm7,XMMWORD[((32-64))+rdx] + vpxor xmm9,xmm9,xmm14 + + vmovdqu xmm15,XMMWORD[96+r8] + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpxor xmm10,xmm10,xmm3 + vpshufb xmm15,xmm15,xmm13 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vxorps xmm11,xmm11,xmm4 + vmovdqu xmm6,XMMWORD[((16-64))+rdx] + vpunpckhqdq xmm8,xmm15,xmm15 + vpclmulqdq xmm2,xmm9,xmm7,0x00 + vpxor xmm12,xmm12,xmm5 + vxorps xmm8,xmm8,xmm15 + + vmovdqu xmm14,XMMWORD[80+r8] + vpxor xmm12,xmm12,xmm10 + vpclmulqdq xmm3,xmm15,xmm6,0x00 + vpxor xmm12,xmm12,xmm11 + vpslldq xmm9,xmm12,8 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm4,xmm15,xmm6,0x11 + vpsrldq xmm12,xmm12,8 + vpxor xmm10,xmm10,xmm9 + vmovdqu xmm6,XMMWORD[((48-64))+rdx] + vpshufb xmm14,xmm14,xmm13 + vxorps xmm11,xmm11,xmm12 + vpxor xmm4,xmm4,xmm1 + vpunpckhqdq xmm9,xmm14,xmm14 + vpclmulqdq xmm5,xmm8,xmm7,0x10 + vmovdqu xmm7,XMMWORD[((80-64))+rdx] + vpxor xmm9,xmm9,xmm14 + vpxor xmm5,xmm5,xmm2 + + vmovdqu xmm15,XMMWORD[64+r8] + vpalignr xmm12,xmm10,xmm10,8 + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpshufb xmm15,xmm15,xmm13 + vpxor xmm0,xmm0,xmm3 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((64-64))+rdx] + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm2,xmm9,xmm7,0x00 + vxorps xmm8,xmm8,xmm15 + vpxor xmm2,xmm2,xmm5 + + vmovdqu xmm14,XMMWORD[48+r8] + vpclmulqdq xmm10,xmm10,XMMWORD[r10],0x10 + vpclmulqdq xmm3,xmm15,xmm6,0x00 + vpshufb xmm14,xmm14,xmm13 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm4,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((96-64))+rdx] + vpunpckhqdq xmm9,xmm14,xmm14 + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm5,xmm8,xmm7,0x10 + vmovdqu xmm7,XMMWORD[((128-64))+rdx] + vpxor xmm9,xmm9,xmm14 + vpxor xmm5,xmm5,xmm2 + + vmovdqu xmm15,XMMWORD[32+r8] + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpshufb xmm15,xmm15,xmm13 + vpxor xmm0,xmm0,xmm3 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((112-64))+rdx] + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm2,xmm9,xmm7,0x00 + vpxor xmm8,xmm8,xmm15 + vpxor xmm2,xmm2,xmm5 + vxorps xmm10,xmm10,xmm12 + + vmovdqu xmm14,XMMWORD[16+r8] + vpalignr xmm12,xmm10,xmm10,8 + vpclmulqdq xmm3,xmm15,xmm6,0x00 + vpshufb xmm14,xmm14,xmm13 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm4,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((144-64))+rdx] + vpclmulqdq xmm10,xmm10,XMMWORD[r10],0x10 + vxorps xmm12,xmm12,xmm11 + vpunpckhqdq xmm9,xmm14,xmm14 + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm5,xmm8,xmm7,0x10 + vmovdqu xmm7,XMMWORD[((176-64))+rdx] + vpxor xmm9,xmm9,xmm14 + vpxor xmm5,xmm5,xmm2 + + vmovdqu xmm15,XMMWORD[r8] + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpshufb xmm15,xmm15,xmm13 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((160-64))+rdx] + vpxor xmm15,xmm15,xmm12 + vpclmulqdq xmm2,xmm9,xmm7,0x10 + vpxor xmm15,xmm15,xmm10 + + lea r8,[128+r8] + sub r9,0x80 + jnc NEAR $L$oop8x_avx + + add r9,0x80 + jmp NEAR $L$tail_no_xor_avx + +ALIGN 32 +$L$short_avx: + vmovdqu xmm14,XMMWORD[((-16))+r9*1+r8] + lea r8,[r9*1+r8] + vmovdqu xmm6,XMMWORD[((0-64))+rdx] + vmovdqu xmm7,XMMWORD[((32-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + + vmovdqa xmm3,xmm0 + vmovdqa xmm4,xmm1 + vmovdqa xmm5,xmm2 + sub r9,0x10 + jz NEAR $L$tail_avx + + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vmovdqu xmm14,XMMWORD[((-32))+r8] + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm1,xmm15,xmm6,0x11 + vmovdqu 
xmm6,XMMWORD[((16-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + vpsrldq xmm7,xmm7,8 + sub r9,0x10 + jz NEAR $L$tail_avx + + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vmovdqu xmm14,XMMWORD[((-48))+r8] + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm1,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((48-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + vmovdqu xmm7,XMMWORD[((80-64))+rdx] + sub r9,0x10 + jz NEAR $L$tail_avx + + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vmovdqu xmm14,XMMWORD[((-64))+r8] + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm1,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((64-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + vpsrldq xmm7,xmm7,8 + sub r9,0x10 + jz NEAR $L$tail_avx + + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vmovdqu xmm14,XMMWORD[((-80))+r8] + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm1,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((96-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + vmovdqu xmm7,XMMWORD[((128-64))+rdx] + sub r9,0x10 + jz NEAR $L$tail_avx + + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vmovdqu xmm14,XMMWORD[((-96))+r8] + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm1,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((112-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + vpsrldq xmm7,xmm7,8 + sub r9,0x10 + jz NEAR $L$tail_avx + + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vmovdqu xmm14,XMMWORD[((-112))+r8] + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm1,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((144-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + vmovq xmm7,QWORD[((184-64))+rdx] + sub r9,0x10 + jmp NEAR $L$tail_avx + +ALIGN 32 +$L$tail_avx: + vpxor xmm15,xmm15,xmm10 +$L$tail_no_xor_avx: + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm1,xmm15,xmm6,0x11 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + + vmovdqu xmm12,XMMWORD[r10] + + vpxor xmm10,xmm3,xmm0 + vpxor xmm11,xmm4,xmm1 + vpxor xmm5,xmm5,xmm2 + + vpxor xmm5,xmm5,xmm10 + vpxor xmm5,xmm5,xmm11 + vpslldq xmm9,xmm5,8 + vpsrldq xmm5,xmm5,8 + vpxor xmm10,xmm10,xmm9 + vpxor xmm11,xmm11,xmm5 + + vpclmulqdq xmm9,xmm10,xmm12,0x10 + vpalignr xmm10,xmm10,xmm10,8 + vpxor xmm10,xmm10,xmm9 + + vpclmulqdq xmm9,xmm10,xmm12,0x10 + vpalignr xmm10,xmm10,xmm10,8 + vpxor xmm10,xmm10,xmm11 + vpxor xmm10,xmm10,xmm9 + + cmp r9,0 + jne NEAR $L$short_avx + + vpshufb xmm10,xmm10,xmm13 + vmovdqu XMMWORD[rcx],xmm10 + vzeroupper + movaps xmm6,XMMWORD[rsp] + movaps xmm7,XMMWORD[16+rsp] + movaps xmm8,XMMWORD[32+rsp] + movaps xmm9,XMMWORD[48+rsp] + movaps xmm10,XMMWORD[64+rsp] + movaps xmm11,XMMWORD[80+rsp] + movaps xmm12,XMMWORD[96+rsp] + movaps xmm13,XMMWORD[112+rsp] + movaps xmm14,XMMWORD[128+rsp] + movaps xmm15,XMMWORD[144+rsp] + lea rsp,[168+rsp] +$L$SEH_end_gcm_ghash_avx: + DB 0F3h,0C3h ;repret ALIGN 64 $L$bswap_mask: @@ -1412,14 +1922,20 @@ se_handler: cmp rbx,r10 jae NEAR $L$in_prologue - lea rax,[24+rax] + lea rax,[((48+280))+rax] mov 
rbx,QWORD[((-8))+rax] mov rbp,QWORD[((-16))+rax] mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] mov QWORD[144+r8],rbx mov QWORD[160+r8],rbp mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 $L$in_prologue: mov rdi,QWORD[8+rax] @@ -1478,6 +1994,13 @@ ALIGN 4 DD $L$SEH_begin_gcm_ghash_clmul wrt ..imagebase DD $L$SEH_end_gcm_ghash_clmul wrt ..imagebase DD $L$SEH_info_gcm_ghash_clmul wrt ..imagebase + DD $L$SEH_begin_gcm_init_avx wrt ..imagebase + DD $L$SEH_end_gcm_init_avx wrt ..imagebase + DD $L$SEH_info_gcm_init_clmul wrt ..imagebase + + DD $L$SEH_begin_gcm_ghash_avx wrt ..imagebase + DD $L$SEH_end_gcm_ghash_avx wrt ..imagebase + DD $L$SEH_info_gcm_ghash_clmul wrt ..imagebase section .xdata rdata align=8 ALIGN 8 $L$SEH_info_gcm_gmult_4bit: diff --git a/packager/third_party/boringssl/win-x86_64/crypto/md5/md5-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/md5-x86_64.asm similarity index 100% rename from packager/third_party/boringssl/win-x86_64/crypto/md5/md5-x86_64.asm rename to packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/md5-x86_64.asm diff --git a/packager/third_party/boringssl/win-x86_64/crypto/ec/p256-x86_64-asm.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/p256-x86_64-asm.asm similarity index 87% rename from packager/third_party/boringssl/win-x86_64/crypto/ec/p256-x86_64-asm.asm rename to packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/p256-x86_64-asm.asm index a2e4075819..64db9d9518 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/ec/p256-x86_64-asm.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/p256-x86_64-asm.asm @@ -21,57 +21,6 @@ $L$ONE_mont: DQ 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe -ALIGN 64 -ecp_nistz256_mul_by_2: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_ecp_nistz256_mul_by_2: - mov rdi,rcx - mov rsi,rdx - - - push r12 - push r13 - - mov r8,QWORD[rsi] - mov r9,QWORD[8+rsi] - add r8,r8 - mov r10,QWORD[16+rsi] - adc r9,r9 - mov r11,QWORD[24+rsi] - lea rsi,[$L$poly] - mov rax,r8 - adc r10,r10 - adc r11,r11 - mov rdx,r9 - sbb r13,r13 - - sub r8,QWORD[rsi] - mov rcx,r10 - sbb r9,QWORD[8+rsi] - sbb r10,QWORD[16+rsi] - mov r12,r11 - sbb r11,QWORD[24+rsi] - test r13,r13 - - cmovz r8,rax - cmovz r9,rdx - mov QWORD[rdi],r8 - cmovz r10,rcx - mov QWORD[8+rdi],r9 - cmovz r11,r12 - mov QWORD[16+rdi],r10 - mov QWORD[24+rdi],r11 - - pop r13 - pop r12 - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret -$L$SEH_end_ecp_nistz256_mul_by_2: - - global ecp_nistz256_neg @@ -594,114 +543,14 @@ __ecp_nistz256_sqr_montq: - - - - -global ecp_nistz256_from_mont - -ALIGN 32 -ecp_nistz256_from_mont: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_ecp_nistz256_from_mont: - mov rdi,rcx - mov rsi,rdx - - - push r12 - push r13 - - mov rax,QWORD[rsi] - mov r13,QWORD[(($L$poly+24))] - mov r9,QWORD[8+rsi] - mov r10,QWORD[16+rsi] - mov r11,QWORD[24+rsi] - mov r8,rax - mov r12,QWORD[(($L$poly+8))] - - - - mov rcx,rax - shl r8,32 - mul r13 - shr rcx,32 - add r9,r8 - adc r10,rcx - adc r11,rax - mov rax,r9 - adc rdx,0 - - - - mov rcx,r9 - shl r9,32 - mov r8,rdx - mul r13 - shr rcx,32 - add r10,r9 - adc r11,rcx - adc r8,rax - mov rax,r10 - adc rdx,0 - - - - mov rcx,r10 - shl r10,32 - mov r9,rdx - mul r13 - shr rcx,32 - add r11,r10 - adc 
r8,rcx - adc r9,rax - mov rax,r11 - adc rdx,0 - - - - mov rcx,r11 - shl r11,32 - mov r10,rdx - mul r13 - shr rcx,32 - add r8,r11 - adc r9,rcx - mov rcx,r8 - adc r10,rax - mov rsi,r9 - adc rdx,0 - - sub r8,-1 - mov rax,r10 - sbb r9,r12 - sbb r10,0 - mov r11,rdx - sbb rdx,r13 - sbb r13,r13 - - cmovnz r8,rcx - cmovnz r9,rsi - mov QWORD[rdi],r8 - cmovnz r10,rax - mov QWORD[8+rdi],r9 - cmovz r11,rdx - mov QWORD[16+rdi],r10 - mov QWORD[24+rdi],r11 - - pop r13 - pop r12 - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret -$L$SEH_end_ecp_nistz256_from_mont: - - global ecp_nistz256_select_w5 ALIGN 32 ecp_nistz256_select_w5: + lea rax,[OPENSSL_ia32cap_P] + mov rax,QWORD[8+rax] + test eax,32 + jnz NEAR $L$avx2_select_w5 lea rax,[((-136))+rsp] $L$SEH_begin_ecp_nistz256_select_w5: DB 0x48,0x8d,0x60,0xe0 @@ -786,6 +635,10 @@ global ecp_nistz256_select_w7 ALIGN 32 ecp_nistz256_select_w7: + lea rax,[OPENSSL_ia32cap_P] + mov rax,QWORD[8+rax] + test eax,32 + jnz NEAR $L$avx2_select_w7 lea rax,[((-136))+rsp] $L$SEH_begin_ecp_nistz256_select_w7: DB 0x48,0x8d,0x60,0xe0 @@ -852,34 +705,204 @@ $L$select_loop_sse_w7: $L$SEH_end_ecp_nistz256_select_w7: DB 0F3h,0C3h ;repret + + + +ALIGN 32 +ecp_nistz256_avx2_select_w5: +$L$avx2_select_w5: + vzeroupper + lea rax,[((-136))+rsp] +$L$SEH_begin_ecp_nistz256_avx2_select_w5: +DB 0x48,0x8d,0x60,0xe0 +DB 0xc5,0xf8,0x29,0x70,0xe0 +DB 0xc5,0xf8,0x29,0x78,0xf0 +DB 0xc5,0x78,0x29,0x40,0x00 +DB 0xc5,0x78,0x29,0x48,0x10 +DB 0xc5,0x78,0x29,0x50,0x20 +DB 0xc5,0x78,0x29,0x58,0x30 +DB 0xc5,0x78,0x29,0x60,0x40 +DB 0xc5,0x78,0x29,0x68,0x50 +DB 0xc5,0x78,0x29,0x70,0x60 +DB 0xc5,0x78,0x29,0x78,0x70 + vmovdqa ymm0,YMMWORD[$L$Two] + + vpxor ymm2,ymm2,ymm2 + vpxor ymm3,ymm3,ymm3 + vpxor ymm4,ymm4,ymm4 + + vmovdqa ymm5,YMMWORD[$L$One] + vmovdqa ymm10,YMMWORD[$L$Two] + + vmovd xmm1,r8d + vpermd ymm1,ymm2,ymm1 + + mov rax,8 +$L$select_loop_avx2_w5: + + vmovdqa ymm6,YMMWORD[rdx] + vmovdqa ymm7,YMMWORD[32+rdx] + vmovdqa ymm8,YMMWORD[64+rdx] + + vmovdqa ymm11,YMMWORD[96+rdx] + vmovdqa ymm12,YMMWORD[128+rdx] + vmovdqa ymm13,YMMWORD[160+rdx] + + vpcmpeqd ymm9,ymm5,ymm1 + vpcmpeqd ymm14,ymm10,ymm1 + + vpaddd ymm5,ymm5,ymm0 + vpaddd ymm10,ymm10,ymm0 + lea rdx,[192+rdx] + + vpand ymm6,ymm6,ymm9 + vpand ymm7,ymm7,ymm9 + vpand ymm8,ymm8,ymm9 + vpand ymm11,ymm11,ymm14 + vpand ymm12,ymm12,ymm14 + vpand ymm13,ymm13,ymm14 + + vpxor ymm2,ymm2,ymm6 + vpxor ymm3,ymm3,ymm7 + vpxor ymm4,ymm4,ymm8 + vpxor ymm2,ymm2,ymm11 + vpxor ymm3,ymm3,ymm12 + vpxor ymm4,ymm4,ymm13 + + dec rax + jnz NEAR $L$select_loop_avx2_w5 + + vmovdqu YMMWORD[rcx],ymm2 + vmovdqu YMMWORD[32+rcx],ymm3 + vmovdqu YMMWORD[64+rcx],ymm4 + vzeroupper + movaps xmm6,XMMWORD[rsp] + movaps xmm7,XMMWORD[16+rsp] + movaps xmm8,XMMWORD[32+rsp] + movaps xmm9,XMMWORD[48+rsp] + movaps xmm10,XMMWORD[64+rsp] + movaps xmm11,XMMWORD[80+rsp] + movaps xmm12,XMMWORD[96+rsp] + movaps xmm13,XMMWORD[112+rsp] + movaps xmm14,XMMWORD[128+rsp] + movaps xmm15,XMMWORD[144+rsp] + lea rsp,[168+rsp] +$L$SEH_end_ecp_nistz256_avx2_select_w5: + DB 0F3h,0C3h ;repret + + + + global ecp_nistz256_avx2_select_w7 ALIGN 32 ecp_nistz256_avx2_select_w7: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp +$L$avx2_select_w7: + vzeroupper + lea rax,[((-136))+rsp] $L$SEH_begin_ecp_nistz256_avx2_select_w7: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 +DB 0x48,0x8d,0x60,0xe0 +DB 0xc5,0xf8,0x29,0x70,0xe0 +DB 0xc5,0xf8,0x29,0x78,0xf0 +DB 0xc5,0x78,0x29,0x40,0x00 +DB 0xc5,0x78,0x29,0x48,0x10 +DB 0xc5,0x78,0x29,0x50,0x20 +DB 
0xc5,0x78,0x29,0x58,0x30 +DB 0xc5,0x78,0x29,0x60,0x40 +DB 0xc5,0x78,0x29,0x68,0x50 +DB 0xc5,0x78,0x29,0x70,0x60 +DB 0xc5,0x78,0x29,0x78,0x70 + vmovdqa ymm0,YMMWORD[$L$Three] + + vpxor ymm2,ymm2,ymm2 + vpxor ymm3,ymm3,ymm3 + + vmovdqa ymm4,YMMWORD[$L$One] + vmovdqa ymm8,YMMWORD[$L$Two] + vmovdqa ymm12,YMMWORD[$L$Three] + + vmovd xmm1,r8d + vpermd ymm1,ymm2,ymm1 -DB 0x0f,0x0b - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret + mov rax,21 +$L$select_loop_avx2_w7: + + vmovdqa ymm5,YMMWORD[rdx] + vmovdqa ymm6,YMMWORD[32+rdx] + + vmovdqa ymm9,YMMWORD[64+rdx] + vmovdqa ymm10,YMMWORD[96+rdx] + + vmovdqa ymm13,YMMWORD[128+rdx] + vmovdqa ymm14,YMMWORD[160+rdx] + + vpcmpeqd ymm7,ymm4,ymm1 + vpcmpeqd ymm11,ymm8,ymm1 + vpcmpeqd ymm15,ymm12,ymm1 + + vpaddd ymm4,ymm4,ymm0 + vpaddd ymm8,ymm8,ymm0 + vpaddd ymm12,ymm12,ymm0 + lea rdx,[192+rdx] + + vpand ymm5,ymm5,ymm7 + vpand ymm6,ymm6,ymm7 + vpand ymm9,ymm9,ymm11 + vpand ymm10,ymm10,ymm11 + vpand ymm13,ymm13,ymm15 + vpand ymm14,ymm14,ymm15 + + vpxor ymm2,ymm2,ymm5 + vpxor ymm3,ymm3,ymm6 + vpxor ymm2,ymm2,ymm9 + vpxor ymm3,ymm3,ymm10 + vpxor ymm2,ymm2,ymm13 + vpxor ymm3,ymm3,ymm14 + + dec rax + jnz NEAR $L$select_loop_avx2_w7 + + + vmovdqa ymm5,YMMWORD[rdx] + vmovdqa ymm6,YMMWORD[32+rdx] + + vpcmpeqd ymm7,ymm4,ymm1 + + vpand ymm5,ymm5,ymm7 + vpand ymm6,ymm6,ymm7 + + vpxor ymm2,ymm2,ymm5 + vpxor ymm3,ymm3,ymm6 + + vmovdqu YMMWORD[rcx],ymm2 + vmovdqu YMMWORD[32+rcx],ymm3 + vzeroupper + movaps xmm6,XMMWORD[rsp] + movaps xmm7,XMMWORD[16+rsp] + movaps xmm8,XMMWORD[32+rsp] + movaps xmm9,XMMWORD[48+rsp] + movaps xmm10,XMMWORD[64+rsp] + movaps xmm11,XMMWORD[80+rsp] + movaps xmm12,XMMWORD[96+rsp] + movaps xmm13,XMMWORD[112+rsp] + movaps xmm14,XMMWORD[128+rsp] + movaps xmm15,XMMWORD[144+rsp] + lea rsp,[168+rsp] $L$SEH_end_ecp_nistz256_avx2_select_w7: + DB 0F3h,0C3h ;repret + ALIGN 32 __ecp_nistz256_add_toq: + xor r11,r11 add r12,QWORD[rbx] adc r13,QWORD[8+rbx] mov rax,r12 adc r8,QWORD[16+rbx] adc r9,QWORD[24+rbx] mov rbp,r13 - sbb r11,r11 + adc r11,0 sub r12,-1 mov rcx,r8 @@ -887,14 +910,14 @@ __ecp_nistz256_add_toq: sbb r8,0 mov r10,r9 sbb r9,r15 - test r11,r11 + sbb r11,0 - cmovz r12,rax - cmovz r13,rbp + cmovc r12,rax + cmovc r13,rbp mov QWORD[rdi],r12 - cmovz r8,rcx + cmovc r8,rcx mov QWORD[8+rdi],r13 - cmovz r9,r10 + cmovc r9,r10 mov QWORD[16+rdi],r8 mov QWORD[24+rdi],r9 @@ -962,13 +985,14 @@ __ecp_nistz256_subq: ALIGN 32 __ecp_nistz256_mul_by_2q: + xor r11,r11 add r12,r12 adc r13,r13 mov rax,r12 adc r8,r8 adc r9,r9 mov rbp,r13 - sbb r11,r11 + adc r11,0 sub r12,-1 mov rcx,r8 @@ -976,14 +1000,14 @@ __ecp_nistz256_mul_by_2q: sbb r8,0 mov r10,r9 sbb r9,r15 - test r11,r11 + sbb r11,0 - cmovz r12,rax - cmovz r13,rbp + cmovc r12,rax + cmovc r13,rbp mov QWORD[rdi],r12 - cmovz r8,rcx + cmovc r8,rcx mov QWORD[8+rdi],r13 - cmovz r9,r10 + cmovc r9,r10 mov QWORD[16+rdi],r8 mov QWORD[24+rdi],r9 @@ -1232,16 +1256,14 @@ $L$SEH_begin_ecp_nistz256_point_add: mov rsi,rdx movdqa XMMWORD[384+rsp],xmm0 movdqa XMMWORD[(384+16)+rsp],xmm1 - por xmm1,xmm0 movdqa XMMWORD[416+rsp],xmm2 movdqa XMMWORD[(416+16)+rsp],xmm3 - por xmm3,xmm2 movdqa XMMWORD[448+rsp],xmm4 movdqa XMMWORD[(448+16)+rsp],xmm5 - por xmm3,xmm1 + por xmm5,xmm4 movdqu xmm0,XMMWORD[rsi] - pshufd xmm5,xmm3,0xb1 + pshufd xmm3,xmm5,0xb1 movdqu xmm1,XMMWORD[16+rsi] movdqu xmm2,XMMWORD[32+rsi] por xmm5,xmm3 @@ -1253,14 +1275,14 @@ $L$SEH_begin_ecp_nistz256_point_add: movdqa XMMWORD[480+rsp],xmm0 pshufd xmm4,xmm5,0x1e movdqa XMMWORD[(480+16)+rsp],xmm1 - por xmm1,xmm0 -DB 102,72,15,110,199 + 
movdqu xmm0,XMMWORD[64+rsi] + movdqu xmm1,XMMWORD[80+rsi] movdqa XMMWORD[512+rsp],xmm2 movdqa XMMWORD[(512+16)+rsp],xmm3 - por xmm3,xmm2 por xmm5,xmm4 pxor xmm4,xmm4 - por xmm3,xmm1 + por xmm1,xmm0 +DB 102,72,15,110,199 lea rsi,[((64-0))+rsi] mov QWORD[((544+0))+rsp],rax @@ -1271,8 +1293,8 @@ DB 102,72,15,110,199 call __ecp_nistz256_sqr_montq pcmpeqd xmm5,xmm4 - pshufd xmm4,xmm3,0xb1 - por xmm4,xmm3 + pshufd xmm4,xmm1,0xb1 + por xmm4,xmm1 pshufd xmm5,xmm5,0 pshufd xmm3,xmm4,0x1e por xmm4,xmm3 @@ -1455,6 +1477,7 @@ $L$add_proceedq: + xor r11,r11 add r12,r12 lea rsi,[96+rsp] adc r13,r13 @@ -1462,7 +1485,7 @@ $L$add_proceedq: adc r8,r8 adc r9,r9 mov rbp,r13 - sbb r11,r11 + adc r11,0 sub r12,-1 mov rcx,r8 @@ -1470,15 +1493,15 @@ $L$add_proceedq: sbb r8,0 mov r10,r9 sbb r9,r15 - test r11,r11 + sbb r11,0 - cmovz r12,rax + cmovc r12,rax mov rax,QWORD[rsi] - cmovz r13,rbp + cmovc r13,rbp mov rbp,QWORD[8+rsi] - cmovz r8,rcx + cmovc r8,rcx mov rcx,QWORD[16+rsi] - cmovz r9,r10 + cmovc r9,r10 mov r10,QWORD[24+rsi] call __ecp_nistz256_subq @@ -1643,16 +1666,14 @@ $L$SEH_begin_ecp_nistz256_point_add_affine: mov r8,QWORD[((64+24))+rsi] movdqa XMMWORD[320+rsp],xmm0 movdqa XMMWORD[(320+16)+rsp],xmm1 - por xmm1,xmm0 movdqa XMMWORD[352+rsp],xmm2 movdqa XMMWORD[(352+16)+rsp],xmm3 - por xmm3,xmm2 movdqa XMMWORD[384+rsp],xmm4 movdqa XMMWORD[(384+16)+rsp],xmm5 - por xmm3,xmm1 + por xmm5,xmm4 movdqu xmm0,XMMWORD[rbx] - pshufd xmm5,xmm3,0xb1 + pshufd xmm3,xmm5,0xb1 movdqu xmm1,XMMWORD[16+rbx] movdqu xmm2,XMMWORD[32+rbx] por xmm5,xmm3 @@ -1770,6 +1791,7 @@ DB 102,72,15,110,199 + xor r11,r11 add r12,r12 lea rsi,[192+rsp] adc r13,r13 @@ -1777,7 +1799,7 @@ DB 102,72,15,110,199 adc r8,r8 adc r9,r9 mov rbp,r13 - sbb r11,r11 + adc r11,0 sub r12,-1 mov rcx,r8 @@ -1785,15 +1807,15 @@ DB 102,72,15,110,199 sbb r8,0 mov r10,r9 sbb r9,r15 - test r11,r11 + sbb r11,0 - cmovz r12,rax + cmovc r12,rax mov rax,QWORD[rsi] - cmovz r13,rbp + cmovc r13,rbp mov rbp,QWORD[8+rsi] - cmovz r8,rcx + cmovc r8,rcx mov rcx,QWORD[16+rsi] - cmovz r9,r10 + cmovc r9,r10 mov r10,QWORD[24+rsi] call __ecp_nistz256_subq diff --git a/packager/third_party/boringssl/win-x86_64/crypto/rand/rdrand-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/rdrand-x86_64.asm similarity index 100% rename from packager/third_party/boringssl/win-x86_64/crypto/rand/rdrand-x86_64.asm rename to packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/rdrand-x86_64.asm diff --git a/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/rsaz-avx2.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/rsaz-avx2.asm new file mode 100644 index 0000000000..a06e6f6cd6 --- /dev/null +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/rsaz-avx2.asm @@ -0,0 +1,1970 @@ +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +section .text code align=64 + + +global rsaz_1024_sqr_avx2 + +ALIGN 64 +rsaz_1024_sqr_avx2: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_rsaz_1024_sqr_avx2: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + + lea rax,[rsp] + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + vzeroupper + lea rsp,[((-168))+rsp] + vmovaps XMMWORD[(-216)+rax],xmm6 + vmovaps XMMWORD[(-200)+rax],xmm7 + vmovaps XMMWORD[(-184)+rax],xmm8 + vmovaps XMMWORD[(-168)+rax],xmm9 + vmovaps XMMWORD[(-152)+rax],xmm10 + vmovaps XMMWORD[(-136)+rax],xmm11 + vmovaps XMMWORD[(-120)+rax],xmm12 + vmovaps XMMWORD[(-104)+rax],xmm13 + 
vmovaps XMMWORD[(-88)+rax],xmm14 + vmovaps XMMWORD[(-72)+rax],xmm15 +$L$sqr_1024_body: + mov rbp,rax + + mov r13,rdx + sub rsp,832 + mov r15,r13 + sub rdi,-128 + sub rsi,-128 + sub r13,-128 + + and r15,4095 + add r15,32*10 + shr r15,12 + vpxor ymm9,ymm9,ymm9 + jz NEAR $L$sqr_1024_no_n_copy + + + + + + sub rsp,32*10 + vmovdqu ymm0,YMMWORD[((0-128))+r13] + and rsp,-2048 + vmovdqu ymm1,YMMWORD[((32-128))+r13] + vmovdqu ymm2,YMMWORD[((64-128))+r13] + vmovdqu ymm3,YMMWORD[((96-128))+r13] + vmovdqu ymm4,YMMWORD[((128-128))+r13] + vmovdqu ymm5,YMMWORD[((160-128))+r13] + vmovdqu ymm6,YMMWORD[((192-128))+r13] + vmovdqu ymm7,YMMWORD[((224-128))+r13] + vmovdqu ymm8,YMMWORD[((256-128))+r13] + lea r13,[((832+128))+rsp] + vmovdqu YMMWORD[(0-128)+r13],ymm0 + vmovdqu YMMWORD[(32-128)+r13],ymm1 + vmovdqu YMMWORD[(64-128)+r13],ymm2 + vmovdqu YMMWORD[(96-128)+r13],ymm3 + vmovdqu YMMWORD[(128-128)+r13],ymm4 + vmovdqu YMMWORD[(160-128)+r13],ymm5 + vmovdqu YMMWORD[(192-128)+r13],ymm6 + vmovdqu YMMWORD[(224-128)+r13],ymm7 + vmovdqu YMMWORD[(256-128)+r13],ymm8 + vmovdqu YMMWORD[(288-128)+r13],ymm9 + +$L$sqr_1024_no_n_copy: + and rsp,-1024 + + vmovdqu ymm1,YMMWORD[((32-128))+rsi] + vmovdqu ymm2,YMMWORD[((64-128))+rsi] + vmovdqu ymm3,YMMWORD[((96-128))+rsi] + vmovdqu ymm4,YMMWORD[((128-128))+rsi] + vmovdqu ymm5,YMMWORD[((160-128))+rsi] + vmovdqu ymm6,YMMWORD[((192-128))+rsi] + vmovdqu ymm7,YMMWORD[((224-128))+rsi] + vmovdqu ymm8,YMMWORD[((256-128))+rsi] + + lea rbx,[192+rsp] + vpbroadcastq ymm15,QWORD[$L$and_mask] + jmp NEAR $L$OOP_GRANDE_SQR_1024 + +ALIGN 32 +$L$OOP_GRANDE_SQR_1024: + lea r9,[((576+128))+rsp] + lea r12,[448+rsp] + + + + + vpaddq ymm1,ymm1,ymm1 + vpbroadcastq ymm10,QWORD[((0-128))+rsi] + vpaddq ymm2,ymm2,ymm2 + vmovdqa YMMWORD[(0-128)+r9],ymm1 + vpaddq ymm3,ymm3,ymm3 + vmovdqa YMMWORD[(32-128)+r9],ymm2 + vpaddq ymm4,ymm4,ymm4 + vmovdqa YMMWORD[(64-128)+r9],ymm3 + vpaddq ymm5,ymm5,ymm5 + vmovdqa YMMWORD[(96-128)+r9],ymm4 + vpaddq ymm6,ymm6,ymm6 + vmovdqa YMMWORD[(128-128)+r9],ymm5 + vpaddq ymm7,ymm7,ymm7 + vmovdqa YMMWORD[(160-128)+r9],ymm6 + vpaddq ymm8,ymm8,ymm8 + vmovdqa YMMWORD[(192-128)+r9],ymm7 + vpxor ymm9,ymm9,ymm9 + vmovdqa YMMWORD[(224-128)+r9],ymm8 + + vpmuludq ymm0,ymm10,YMMWORD[((0-128))+rsi] + vpbroadcastq ymm11,QWORD[((32-128))+rsi] + vmovdqu YMMWORD[(288-192)+rbx],ymm9 + vpmuludq ymm1,ymm1,ymm10 + vmovdqu YMMWORD[(320-448)+r12],ymm9 + vpmuludq ymm2,ymm2,ymm10 + vmovdqu YMMWORD[(352-448)+r12],ymm9 + vpmuludq ymm3,ymm3,ymm10 + vmovdqu YMMWORD[(384-448)+r12],ymm9 + vpmuludq ymm4,ymm4,ymm10 + vmovdqu YMMWORD[(416-448)+r12],ymm9 + vpmuludq ymm5,ymm5,ymm10 + vmovdqu YMMWORD[(448-448)+r12],ymm9 + vpmuludq ymm6,ymm6,ymm10 + vmovdqu YMMWORD[(480-448)+r12],ymm9 + vpmuludq ymm7,ymm7,ymm10 + vmovdqu YMMWORD[(512-448)+r12],ymm9 + vpmuludq ymm8,ymm8,ymm10 + vpbroadcastq ymm10,QWORD[((64-128))+rsi] + vmovdqu YMMWORD[(544-448)+r12],ymm9 + + mov r15,rsi + mov r14d,4 + jmp NEAR $L$sqr_entry_1024 +ALIGN 32 +$L$OOP_SQR_1024: + vpbroadcastq ymm11,QWORD[((32-128))+r15] + vpmuludq ymm0,ymm10,YMMWORD[((0-128))+rsi] + vpaddq ymm0,ymm0,YMMWORD[((0-192))+rbx] + vpmuludq ymm1,ymm10,YMMWORD[((0-128))+r9] + vpaddq ymm1,ymm1,YMMWORD[((32-192))+rbx] + vpmuludq ymm2,ymm10,YMMWORD[((32-128))+r9] + vpaddq ymm2,ymm2,YMMWORD[((64-192))+rbx] + vpmuludq ymm3,ymm10,YMMWORD[((64-128))+r9] + vpaddq ymm3,ymm3,YMMWORD[((96-192))+rbx] + vpmuludq ymm4,ymm10,YMMWORD[((96-128))+r9] + vpaddq ymm4,ymm4,YMMWORD[((128-192))+rbx] + vpmuludq ymm5,ymm10,YMMWORD[((128-128))+r9] + vpaddq ymm5,ymm5,YMMWORD[((160-192))+rbx] + vpmuludq 
ymm6,ymm10,YMMWORD[((160-128))+r9] + vpaddq ymm6,ymm6,YMMWORD[((192-192))+rbx] + vpmuludq ymm7,ymm10,YMMWORD[((192-128))+r9] + vpaddq ymm7,ymm7,YMMWORD[((224-192))+rbx] + vpmuludq ymm8,ymm10,YMMWORD[((224-128))+r9] + vpbroadcastq ymm10,QWORD[((64-128))+r15] + vpaddq ymm8,ymm8,YMMWORD[((256-192))+rbx] +$L$sqr_entry_1024: + vmovdqu YMMWORD[(0-192)+rbx],ymm0 + vmovdqu YMMWORD[(32-192)+rbx],ymm1 + + vpmuludq ymm12,ymm11,YMMWORD[((32-128))+rsi] + vpaddq ymm2,ymm2,ymm12 + vpmuludq ymm14,ymm11,YMMWORD[((32-128))+r9] + vpaddq ymm3,ymm3,ymm14 + vpmuludq ymm13,ymm11,YMMWORD[((64-128))+r9] + vpaddq ymm4,ymm4,ymm13 + vpmuludq ymm12,ymm11,YMMWORD[((96-128))+r9] + vpaddq ymm5,ymm5,ymm12 + vpmuludq ymm14,ymm11,YMMWORD[((128-128))+r9] + vpaddq ymm6,ymm6,ymm14 + vpmuludq ymm13,ymm11,YMMWORD[((160-128))+r9] + vpaddq ymm7,ymm7,ymm13 + vpmuludq ymm12,ymm11,YMMWORD[((192-128))+r9] + vpaddq ymm8,ymm8,ymm12 + vpmuludq ymm0,ymm11,YMMWORD[((224-128))+r9] + vpbroadcastq ymm11,QWORD[((96-128))+r15] + vpaddq ymm0,ymm0,YMMWORD[((288-192))+rbx] + + vmovdqu YMMWORD[(64-192)+rbx],ymm2 + vmovdqu YMMWORD[(96-192)+rbx],ymm3 + + vpmuludq ymm13,ymm10,YMMWORD[((64-128))+rsi] + vpaddq ymm4,ymm4,ymm13 + vpmuludq ymm12,ymm10,YMMWORD[((64-128))+r9] + vpaddq ymm5,ymm5,ymm12 + vpmuludq ymm14,ymm10,YMMWORD[((96-128))+r9] + vpaddq ymm6,ymm6,ymm14 + vpmuludq ymm13,ymm10,YMMWORD[((128-128))+r9] + vpaddq ymm7,ymm7,ymm13 + vpmuludq ymm12,ymm10,YMMWORD[((160-128))+r9] + vpaddq ymm8,ymm8,ymm12 + vpmuludq ymm14,ymm10,YMMWORD[((192-128))+r9] + vpaddq ymm0,ymm0,ymm14 + vpmuludq ymm1,ymm10,YMMWORD[((224-128))+r9] + vpbroadcastq ymm10,QWORD[((128-128))+r15] + vpaddq ymm1,ymm1,YMMWORD[((320-448))+r12] + + vmovdqu YMMWORD[(128-192)+rbx],ymm4 + vmovdqu YMMWORD[(160-192)+rbx],ymm5 + + vpmuludq ymm12,ymm11,YMMWORD[((96-128))+rsi] + vpaddq ymm6,ymm6,ymm12 + vpmuludq ymm14,ymm11,YMMWORD[((96-128))+r9] + vpaddq ymm7,ymm7,ymm14 + vpmuludq ymm13,ymm11,YMMWORD[((128-128))+r9] + vpaddq ymm8,ymm8,ymm13 + vpmuludq ymm12,ymm11,YMMWORD[((160-128))+r9] + vpaddq ymm0,ymm0,ymm12 + vpmuludq ymm14,ymm11,YMMWORD[((192-128))+r9] + vpaddq ymm1,ymm1,ymm14 + vpmuludq ymm2,ymm11,YMMWORD[((224-128))+r9] + vpbroadcastq ymm11,QWORD[((160-128))+r15] + vpaddq ymm2,ymm2,YMMWORD[((352-448))+r12] + + vmovdqu YMMWORD[(192-192)+rbx],ymm6 + vmovdqu YMMWORD[(224-192)+rbx],ymm7 + + vpmuludq ymm12,ymm10,YMMWORD[((128-128))+rsi] + vpaddq ymm8,ymm8,ymm12 + vpmuludq ymm14,ymm10,YMMWORD[((128-128))+r9] + vpaddq ymm0,ymm0,ymm14 + vpmuludq ymm13,ymm10,YMMWORD[((160-128))+r9] + vpaddq ymm1,ymm1,ymm13 + vpmuludq ymm12,ymm10,YMMWORD[((192-128))+r9] + vpaddq ymm2,ymm2,ymm12 + vpmuludq ymm3,ymm10,YMMWORD[((224-128))+r9] + vpbroadcastq ymm10,QWORD[((192-128))+r15] + vpaddq ymm3,ymm3,YMMWORD[((384-448))+r12] + + vmovdqu YMMWORD[(256-192)+rbx],ymm8 + vmovdqu YMMWORD[(288-192)+rbx],ymm0 + lea rbx,[8+rbx] + + vpmuludq ymm13,ymm11,YMMWORD[((160-128))+rsi] + vpaddq ymm1,ymm1,ymm13 + vpmuludq ymm12,ymm11,YMMWORD[((160-128))+r9] + vpaddq ymm2,ymm2,ymm12 + vpmuludq ymm14,ymm11,YMMWORD[((192-128))+r9] + vpaddq ymm3,ymm3,ymm14 + vpmuludq ymm4,ymm11,YMMWORD[((224-128))+r9] + vpbroadcastq ymm11,QWORD[((224-128))+r15] + vpaddq ymm4,ymm4,YMMWORD[((416-448))+r12] + + vmovdqu YMMWORD[(320-448)+r12],ymm1 + vmovdqu YMMWORD[(352-448)+r12],ymm2 + + vpmuludq ymm12,ymm10,YMMWORD[((192-128))+rsi] + vpaddq ymm3,ymm3,ymm12 + vpmuludq ymm14,ymm10,YMMWORD[((192-128))+r9] + vpbroadcastq ymm0,QWORD[((256-128))+r15] + vpaddq ymm4,ymm4,ymm14 + vpmuludq ymm5,ymm10,YMMWORD[((224-128))+r9] + vpbroadcastq 
ymm10,QWORD[((0+8-128))+r15] + vpaddq ymm5,ymm5,YMMWORD[((448-448))+r12] + + vmovdqu YMMWORD[(384-448)+r12],ymm3 + vmovdqu YMMWORD[(416-448)+r12],ymm4 + lea r15,[8+r15] + + vpmuludq ymm12,ymm11,YMMWORD[((224-128))+rsi] + vpaddq ymm5,ymm5,ymm12 + vpmuludq ymm6,ymm11,YMMWORD[((224-128))+r9] + vpaddq ymm6,ymm6,YMMWORD[((480-448))+r12] + + vpmuludq ymm7,ymm0,YMMWORD[((256-128))+rsi] + vmovdqu YMMWORD[(448-448)+r12],ymm5 + vpaddq ymm7,ymm7,YMMWORD[((512-448))+r12] + vmovdqu YMMWORD[(480-448)+r12],ymm6 + vmovdqu YMMWORD[(512-448)+r12],ymm7 + lea r12,[8+r12] + + dec r14d + jnz NEAR $L$OOP_SQR_1024 + + vmovdqu ymm8,YMMWORD[256+rsp] + vmovdqu ymm1,YMMWORD[288+rsp] + vmovdqu ymm2,YMMWORD[320+rsp] + lea rbx,[192+rsp] + + vpsrlq ymm14,ymm8,29 + vpand ymm8,ymm8,ymm15 + vpsrlq ymm11,ymm1,29 + vpand ymm1,ymm1,ymm15 + + vpermq ymm14,ymm14,0x93 + vpxor ymm9,ymm9,ymm9 + vpermq ymm11,ymm11,0x93 + + vpblendd ymm10,ymm14,ymm9,3 + vpblendd ymm14,ymm11,ymm14,3 + vpaddq ymm8,ymm8,ymm10 + vpblendd ymm11,ymm9,ymm11,3 + vpaddq ymm1,ymm1,ymm14 + vpaddq ymm2,ymm2,ymm11 + vmovdqu YMMWORD[(288-192)+rbx],ymm1 + vmovdqu YMMWORD[(320-192)+rbx],ymm2 + + mov rax,QWORD[rsp] + mov r10,QWORD[8+rsp] + mov r11,QWORD[16+rsp] + mov r12,QWORD[24+rsp] + vmovdqu ymm1,YMMWORD[32+rsp] + vmovdqu ymm2,YMMWORD[((64-192))+rbx] + vmovdqu ymm3,YMMWORD[((96-192))+rbx] + vmovdqu ymm4,YMMWORD[((128-192))+rbx] + vmovdqu ymm5,YMMWORD[((160-192))+rbx] + vmovdqu ymm6,YMMWORD[((192-192))+rbx] + vmovdqu ymm7,YMMWORD[((224-192))+rbx] + + mov r9,rax + imul eax,ecx + and eax,0x1fffffff + vmovd xmm12,eax + + mov rdx,rax + imul rax,QWORD[((-128))+r13] + vpbroadcastq ymm12,xmm12 + add r9,rax + mov rax,rdx + imul rax,QWORD[((8-128))+r13] + shr r9,29 + add r10,rax + mov rax,rdx + imul rax,QWORD[((16-128))+r13] + add r10,r9 + add r11,rax + imul rdx,QWORD[((24-128))+r13] + add r12,rdx + + mov rax,r10 + imul eax,ecx + and eax,0x1fffffff + + mov r14d,9 + jmp NEAR $L$OOP_REDUCE_1024 + +ALIGN 32 +$L$OOP_REDUCE_1024: + vmovd xmm13,eax + vpbroadcastq ymm13,xmm13 + + vpmuludq ymm10,ymm12,YMMWORD[((32-128))+r13] + mov rdx,rax + imul rax,QWORD[((-128))+r13] + vpaddq ymm1,ymm1,ymm10 + add r10,rax + vpmuludq ymm14,ymm12,YMMWORD[((64-128))+r13] + mov rax,rdx + imul rax,QWORD[((8-128))+r13] + vpaddq ymm2,ymm2,ymm14 + vpmuludq ymm11,ymm12,YMMWORD[((96-128))+r13] +DB 0x67 + add r11,rax +DB 0x67 + mov rax,rdx + imul rax,QWORD[((16-128))+r13] + shr r10,29 + vpaddq ymm3,ymm3,ymm11 + vpmuludq ymm10,ymm12,YMMWORD[((128-128))+r13] + add r12,rax + add r11,r10 + vpaddq ymm4,ymm4,ymm10 + vpmuludq ymm14,ymm12,YMMWORD[((160-128))+r13] + mov rax,r11 + imul eax,ecx + vpaddq ymm5,ymm5,ymm14 + vpmuludq ymm11,ymm12,YMMWORD[((192-128))+r13] + and eax,0x1fffffff + vpaddq ymm6,ymm6,ymm11 + vpmuludq ymm10,ymm12,YMMWORD[((224-128))+r13] + vpaddq ymm7,ymm7,ymm10 + vpmuludq ymm14,ymm12,YMMWORD[((256-128))+r13] + vmovd xmm12,eax + + vpaddq ymm8,ymm8,ymm14 + + vpbroadcastq ymm12,xmm12 + + vpmuludq ymm11,ymm13,YMMWORD[((32-8-128))+r13] + vmovdqu ymm14,YMMWORD[((96-8-128))+r13] + mov rdx,rax + imul rax,QWORD[((-128))+r13] + vpaddq ymm1,ymm1,ymm11 + vpmuludq ymm10,ymm13,YMMWORD[((64-8-128))+r13] + vmovdqu ymm11,YMMWORD[((128-8-128))+r13] + add r11,rax + mov rax,rdx + imul rax,QWORD[((8-128))+r13] + vpaddq ymm2,ymm2,ymm10 + add rax,r12 + shr r11,29 + vpmuludq ymm14,ymm14,ymm13 + vmovdqu ymm10,YMMWORD[((160-8-128))+r13] + add rax,r11 + vpaddq ymm3,ymm3,ymm14 + vpmuludq ymm11,ymm11,ymm13 + vmovdqu ymm14,YMMWORD[((192-8-128))+r13] +DB 0x67 + mov r12,rax + imul eax,ecx + vpaddq ymm4,ymm4,ymm11 + vpmuludq 
ymm10,ymm10,ymm13 +DB 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 + and eax,0x1fffffff + vpaddq ymm5,ymm5,ymm10 + vpmuludq ymm14,ymm14,ymm13 + vmovdqu ymm10,YMMWORD[((256-8-128))+r13] + vpaddq ymm6,ymm6,ymm14 + vpmuludq ymm11,ymm11,ymm13 + vmovdqu ymm9,YMMWORD[((288-8-128))+r13] + vmovd xmm0,eax + imul rax,QWORD[((-128))+r13] + vpaddq ymm7,ymm7,ymm11 + vpmuludq ymm10,ymm10,ymm13 + vmovdqu ymm14,YMMWORD[((32-16-128))+r13] + vpbroadcastq ymm0,xmm0 + vpaddq ymm8,ymm8,ymm10 + vpmuludq ymm9,ymm9,ymm13 + vmovdqu ymm11,YMMWORD[((64-16-128))+r13] + add r12,rax + + vmovdqu ymm13,YMMWORD[((32-24-128))+r13] + vpmuludq ymm14,ymm14,ymm12 + vmovdqu ymm10,YMMWORD[((96-16-128))+r13] + vpaddq ymm1,ymm1,ymm14 + vpmuludq ymm13,ymm13,ymm0 + vpmuludq ymm11,ymm11,ymm12 +DB 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff + vpaddq ymm13,ymm13,ymm1 + vpaddq ymm2,ymm2,ymm11 + vpmuludq ymm10,ymm10,ymm12 + vmovdqu ymm11,YMMWORD[((160-16-128))+r13] +DB 0x67 + vmovq rax,xmm13 + vmovdqu YMMWORD[rsp],ymm13 + vpaddq ymm3,ymm3,ymm10 + vpmuludq ymm14,ymm14,ymm12 + vmovdqu ymm10,YMMWORD[((192-16-128))+r13] + vpaddq ymm4,ymm4,ymm14 + vpmuludq ymm11,ymm11,ymm12 + vmovdqu ymm14,YMMWORD[((224-16-128))+r13] + vpaddq ymm5,ymm5,ymm11 + vpmuludq ymm10,ymm10,ymm12 + vmovdqu ymm11,YMMWORD[((256-16-128))+r13] + vpaddq ymm6,ymm6,ymm10 + vpmuludq ymm14,ymm14,ymm12 + shr r12,29 + vmovdqu ymm10,YMMWORD[((288-16-128))+r13] + add rax,r12 + vpaddq ymm7,ymm7,ymm14 + vpmuludq ymm11,ymm11,ymm12 + + mov r9,rax + imul eax,ecx + vpaddq ymm8,ymm8,ymm11 + vpmuludq ymm10,ymm10,ymm12 + and eax,0x1fffffff + vmovd xmm12,eax + vmovdqu ymm11,YMMWORD[((96-24-128))+r13] +DB 0x67 + vpaddq ymm9,ymm9,ymm10 + vpbroadcastq ymm12,xmm12 + + vpmuludq ymm14,ymm0,YMMWORD[((64-24-128))+r13] + vmovdqu ymm10,YMMWORD[((128-24-128))+r13] + mov rdx,rax + imul rax,QWORD[((-128))+r13] + mov r10,QWORD[8+rsp] + vpaddq ymm1,ymm2,ymm14 + vpmuludq ymm11,ymm11,ymm0 + vmovdqu ymm14,YMMWORD[((160-24-128))+r13] + add r9,rax + mov rax,rdx + imul rax,QWORD[((8-128))+r13] +DB 0x67 + shr r9,29 + mov r11,QWORD[16+rsp] + vpaddq ymm2,ymm3,ymm11 + vpmuludq ymm10,ymm10,ymm0 + vmovdqu ymm11,YMMWORD[((192-24-128))+r13] + add r10,rax + mov rax,rdx + imul rax,QWORD[((16-128))+r13] + vpaddq ymm3,ymm4,ymm10 + vpmuludq ymm14,ymm14,ymm0 + vmovdqu ymm10,YMMWORD[((224-24-128))+r13] + imul rdx,QWORD[((24-128))+r13] + add r11,rax + lea rax,[r10*1+r9] + vpaddq ymm4,ymm5,ymm14 + vpmuludq ymm11,ymm11,ymm0 + vmovdqu ymm14,YMMWORD[((256-24-128))+r13] + mov r10,rax + imul eax,ecx + vpmuludq ymm10,ymm10,ymm0 + vpaddq ymm5,ymm6,ymm11 + vmovdqu ymm11,YMMWORD[((288-24-128))+r13] + and eax,0x1fffffff + vpaddq ymm6,ymm7,ymm10 + vpmuludq ymm14,ymm14,ymm0 + add rdx,QWORD[24+rsp] + vpaddq ymm7,ymm8,ymm14 + vpmuludq ymm11,ymm11,ymm0 + vpaddq ymm8,ymm9,ymm11 + vmovq xmm9,r12 + mov r12,rdx + + dec r14d + jnz NEAR $L$OOP_REDUCE_1024 + lea r12,[448+rsp] + vpaddq ymm0,ymm13,ymm9 + vpxor ymm9,ymm9,ymm9 + + vpaddq ymm0,ymm0,YMMWORD[((288-192))+rbx] + vpaddq ymm1,ymm1,YMMWORD[((320-448))+r12] + vpaddq ymm2,ymm2,YMMWORD[((352-448))+r12] + vpaddq ymm3,ymm3,YMMWORD[((384-448))+r12] + vpaddq ymm4,ymm4,YMMWORD[((416-448))+r12] + vpaddq ymm5,ymm5,YMMWORD[((448-448))+r12] + vpaddq ymm6,ymm6,YMMWORD[((480-448))+r12] + vpaddq ymm7,ymm7,YMMWORD[((512-448))+r12] + vpaddq ymm8,ymm8,YMMWORD[((544-448))+r12] + + vpsrlq ymm14,ymm0,29 + vpand ymm0,ymm0,ymm15 + vpsrlq ymm11,ymm1,29 + vpand ymm1,ymm1,ymm15 + vpsrlq ymm12,ymm2,29 + vpermq ymm14,ymm14,0x93 + vpand ymm2,ymm2,ymm15 + vpsrlq ymm13,ymm3,29 + vpermq ymm11,ymm11,0x93 + vpand 
ymm3,ymm3,ymm15 + vpermq ymm12,ymm12,0x93 + + vpblendd ymm10,ymm14,ymm9,3 + vpermq ymm13,ymm13,0x93 + vpblendd ymm14,ymm11,ymm14,3 + vpaddq ymm0,ymm0,ymm10 + vpblendd ymm11,ymm12,ymm11,3 + vpaddq ymm1,ymm1,ymm14 + vpblendd ymm12,ymm13,ymm12,3 + vpaddq ymm2,ymm2,ymm11 + vpblendd ymm13,ymm9,ymm13,3 + vpaddq ymm3,ymm3,ymm12 + vpaddq ymm4,ymm4,ymm13 + + vpsrlq ymm14,ymm0,29 + vpand ymm0,ymm0,ymm15 + vpsrlq ymm11,ymm1,29 + vpand ymm1,ymm1,ymm15 + vpsrlq ymm12,ymm2,29 + vpermq ymm14,ymm14,0x93 + vpand ymm2,ymm2,ymm15 + vpsrlq ymm13,ymm3,29 + vpermq ymm11,ymm11,0x93 + vpand ymm3,ymm3,ymm15 + vpermq ymm12,ymm12,0x93 + + vpblendd ymm10,ymm14,ymm9,3 + vpermq ymm13,ymm13,0x93 + vpblendd ymm14,ymm11,ymm14,3 + vpaddq ymm0,ymm0,ymm10 + vpblendd ymm11,ymm12,ymm11,3 + vpaddq ymm1,ymm1,ymm14 + vmovdqu YMMWORD[(0-128)+rdi],ymm0 + vpblendd ymm12,ymm13,ymm12,3 + vpaddq ymm2,ymm2,ymm11 + vmovdqu YMMWORD[(32-128)+rdi],ymm1 + vpblendd ymm13,ymm9,ymm13,3 + vpaddq ymm3,ymm3,ymm12 + vmovdqu YMMWORD[(64-128)+rdi],ymm2 + vpaddq ymm4,ymm4,ymm13 + vmovdqu YMMWORD[(96-128)+rdi],ymm3 + vpsrlq ymm14,ymm4,29 + vpand ymm4,ymm4,ymm15 + vpsrlq ymm11,ymm5,29 + vpand ymm5,ymm5,ymm15 + vpsrlq ymm12,ymm6,29 + vpermq ymm14,ymm14,0x93 + vpand ymm6,ymm6,ymm15 + vpsrlq ymm13,ymm7,29 + vpermq ymm11,ymm11,0x93 + vpand ymm7,ymm7,ymm15 + vpsrlq ymm0,ymm8,29 + vpermq ymm12,ymm12,0x93 + vpand ymm8,ymm8,ymm15 + vpermq ymm13,ymm13,0x93 + + vpblendd ymm10,ymm14,ymm9,3 + vpermq ymm0,ymm0,0x93 + vpblendd ymm14,ymm11,ymm14,3 + vpaddq ymm4,ymm4,ymm10 + vpblendd ymm11,ymm12,ymm11,3 + vpaddq ymm5,ymm5,ymm14 + vpblendd ymm12,ymm13,ymm12,3 + vpaddq ymm6,ymm6,ymm11 + vpblendd ymm13,ymm0,ymm13,3 + vpaddq ymm7,ymm7,ymm12 + vpaddq ymm8,ymm8,ymm13 + + vpsrlq ymm14,ymm4,29 + vpand ymm4,ymm4,ymm15 + vpsrlq ymm11,ymm5,29 + vpand ymm5,ymm5,ymm15 + vpsrlq ymm12,ymm6,29 + vpermq ymm14,ymm14,0x93 + vpand ymm6,ymm6,ymm15 + vpsrlq ymm13,ymm7,29 + vpermq ymm11,ymm11,0x93 + vpand ymm7,ymm7,ymm15 + vpsrlq ymm0,ymm8,29 + vpermq ymm12,ymm12,0x93 + vpand ymm8,ymm8,ymm15 + vpermq ymm13,ymm13,0x93 + + vpblendd ymm10,ymm14,ymm9,3 + vpermq ymm0,ymm0,0x93 + vpblendd ymm14,ymm11,ymm14,3 + vpaddq ymm4,ymm4,ymm10 + vpblendd ymm11,ymm12,ymm11,3 + vpaddq ymm5,ymm5,ymm14 + vmovdqu YMMWORD[(128-128)+rdi],ymm4 + vpblendd ymm12,ymm13,ymm12,3 + vpaddq ymm6,ymm6,ymm11 + vmovdqu YMMWORD[(160-128)+rdi],ymm5 + vpblendd ymm13,ymm0,ymm13,3 + vpaddq ymm7,ymm7,ymm12 + vmovdqu YMMWORD[(192-128)+rdi],ymm6 + vpaddq ymm8,ymm8,ymm13 + vmovdqu YMMWORD[(224-128)+rdi],ymm7 + vmovdqu YMMWORD[(256-128)+rdi],ymm8 + + mov rsi,rdi + dec r8d + jne NEAR $L$OOP_GRANDE_SQR_1024 + + vzeroall + mov rax,rbp + +$L$sqr_1024_in_tail: + movaps xmm6,XMMWORD[((-216))+rax] + movaps xmm7,XMMWORD[((-200))+rax] + movaps xmm8,XMMWORD[((-184))+rax] + movaps xmm9,XMMWORD[((-168))+rax] + movaps xmm10,XMMWORD[((-152))+rax] + movaps xmm11,XMMWORD[((-136))+rax] + movaps xmm12,XMMWORD[((-120))+rax] + movaps xmm13,XMMWORD[((-104))+rax] + movaps xmm14,XMMWORD[((-88))+rax] + movaps xmm15,XMMWORD[((-72))+rax] + mov r15,QWORD[((-48))+rax] + + mov r14,QWORD[((-40))+rax] + + mov r13,QWORD[((-32))+rax] + + mov r12,QWORD[((-24))+rax] + + mov rbp,QWORD[((-16))+rax] + + mov rbx,QWORD[((-8))+rax] + + lea rsp,[rax] + +$L$sqr_1024_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_rsaz_1024_sqr_avx2: +global rsaz_1024_mul_avx2 + +ALIGN 64 +rsaz_1024_mul_avx2: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_rsaz_1024_mul_avx2: + mov rdi,rcx + 
mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + + lea rax,[rsp] + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + vzeroupper + lea rsp,[((-168))+rsp] + vmovaps XMMWORD[(-216)+rax],xmm6 + vmovaps XMMWORD[(-200)+rax],xmm7 + vmovaps XMMWORD[(-184)+rax],xmm8 + vmovaps XMMWORD[(-168)+rax],xmm9 + vmovaps XMMWORD[(-152)+rax],xmm10 + vmovaps XMMWORD[(-136)+rax],xmm11 + vmovaps XMMWORD[(-120)+rax],xmm12 + vmovaps XMMWORD[(-104)+rax],xmm13 + vmovaps XMMWORD[(-88)+rax],xmm14 + vmovaps XMMWORD[(-72)+rax],xmm15 +$L$mul_1024_body: + mov rbp,rax + + vzeroall + mov r13,rdx + sub rsp,64 + + + + + + +DB 0x67,0x67 + mov r15,rsi + and r15,4095 + add r15,32*10 + shr r15,12 + mov r15,rsi + cmovnz rsi,r13 + cmovnz r13,r15 + + mov r15,rcx + sub rsi,-128 + sub rcx,-128 + sub rdi,-128 + + and r15,4095 + add r15,32*10 +DB 0x67,0x67 + shr r15,12 + jz NEAR $L$mul_1024_no_n_copy + + + + + + sub rsp,32*10 + vmovdqu ymm0,YMMWORD[((0-128))+rcx] + and rsp,-512 + vmovdqu ymm1,YMMWORD[((32-128))+rcx] + vmovdqu ymm2,YMMWORD[((64-128))+rcx] + vmovdqu ymm3,YMMWORD[((96-128))+rcx] + vmovdqu ymm4,YMMWORD[((128-128))+rcx] + vmovdqu ymm5,YMMWORD[((160-128))+rcx] + vmovdqu ymm6,YMMWORD[((192-128))+rcx] + vmovdqu ymm7,YMMWORD[((224-128))+rcx] + vmovdqu ymm8,YMMWORD[((256-128))+rcx] + lea rcx,[((64+128))+rsp] + vmovdqu YMMWORD[(0-128)+rcx],ymm0 + vpxor ymm0,ymm0,ymm0 + vmovdqu YMMWORD[(32-128)+rcx],ymm1 + vpxor ymm1,ymm1,ymm1 + vmovdqu YMMWORD[(64-128)+rcx],ymm2 + vpxor ymm2,ymm2,ymm2 + vmovdqu YMMWORD[(96-128)+rcx],ymm3 + vpxor ymm3,ymm3,ymm3 + vmovdqu YMMWORD[(128-128)+rcx],ymm4 + vpxor ymm4,ymm4,ymm4 + vmovdqu YMMWORD[(160-128)+rcx],ymm5 + vpxor ymm5,ymm5,ymm5 + vmovdqu YMMWORD[(192-128)+rcx],ymm6 + vpxor ymm6,ymm6,ymm6 + vmovdqu YMMWORD[(224-128)+rcx],ymm7 + vpxor ymm7,ymm7,ymm7 + vmovdqu YMMWORD[(256-128)+rcx],ymm8 + vmovdqa ymm8,ymm0 + vmovdqu YMMWORD[(288-128)+rcx],ymm9 +$L$mul_1024_no_n_copy: + and rsp,-64 + + mov rbx,QWORD[r13] + vpbroadcastq ymm10,QWORD[r13] + vmovdqu YMMWORD[rsp],ymm0 + xor r9,r9 +DB 0x67 + xor r10,r10 + xor r11,r11 + xor r12,r12 + + vmovdqu ymm15,YMMWORD[$L$and_mask] + mov r14d,9 + vmovdqu YMMWORD[(288-128)+rdi],ymm9 + jmp NEAR $L$oop_mul_1024 + +ALIGN 32 +$L$oop_mul_1024: + vpsrlq ymm9,ymm3,29 + mov rax,rbx + imul rax,QWORD[((-128))+rsi] + add rax,r9 + mov r10,rbx + imul r10,QWORD[((8-128))+rsi] + add r10,QWORD[8+rsp] + + mov r9,rax + imul eax,r8d + and eax,0x1fffffff + + mov r11,rbx + imul r11,QWORD[((16-128))+rsi] + add r11,QWORD[16+rsp] + + mov r12,rbx + imul r12,QWORD[((24-128))+rsi] + add r12,QWORD[24+rsp] + vpmuludq ymm0,ymm10,YMMWORD[((32-128))+rsi] + vmovd xmm11,eax + vpaddq ymm1,ymm1,ymm0 + vpmuludq ymm12,ymm10,YMMWORD[((64-128))+rsi] + vpbroadcastq ymm11,xmm11 + vpaddq ymm2,ymm2,ymm12 + vpmuludq ymm13,ymm10,YMMWORD[((96-128))+rsi] + vpand ymm3,ymm3,ymm15 + vpaddq ymm3,ymm3,ymm13 + vpmuludq ymm0,ymm10,YMMWORD[((128-128))+rsi] + vpaddq ymm4,ymm4,ymm0 + vpmuludq ymm12,ymm10,YMMWORD[((160-128))+rsi] + vpaddq ymm5,ymm5,ymm12 + vpmuludq ymm13,ymm10,YMMWORD[((192-128))+rsi] + vpaddq ymm6,ymm6,ymm13 + vpmuludq ymm0,ymm10,YMMWORD[((224-128))+rsi] + vpermq ymm9,ymm9,0x93 + vpaddq ymm7,ymm7,ymm0 + vpmuludq ymm12,ymm10,YMMWORD[((256-128))+rsi] + vpbroadcastq ymm10,QWORD[8+r13] + vpaddq ymm8,ymm8,ymm12 + + mov rdx,rax + imul rax,QWORD[((-128))+rcx] + add r9,rax + mov rax,rdx + imul rax,QWORD[((8-128))+rcx] + add r10,rax + mov rax,rdx + imul rax,QWORD[((16-128))+rcx] + add r11,rax + shr r9,29 + imul rdx,QWORD[((24-128))+rcx] + add r12,rdx + add r10,r9 + + 
vpmuludq ymm13,ymm11,YMMWORD[((32-128))+rcx] + vmovq rbx,xmm10 + vpaddq ymm1,ymm1,ymm13 + vpmuludq ymm0,ymm11,YMMWORD[((64-128))+rcx] + vpaddq ymm2,ymm2,ymm0 + vpmuludq ymm12,ymm11,YMMWORD[((96-128))+rcx] + vpaddq ymm3,ymm3,ymm12 + vpmuludq ymm13,ymm11,YMMWORD[((128-128))+rcx] + vpaddq ymm4,ymm4,ymm13 + vpmuludq ymm0,ymm11,YMMWORD[((160-128))+rcx] + vpaddq ymm5,ymm5,ymm0 + vpmuludq ymm12,ymm11,YMMWORD[((192-128))+rcx] + vpaddq ymm6,ymm6,ymm12 + vpmuludq ymm13,ymm11,YMMWORD[((224-128))+rcx] + vpblendd ymm9,ymm9,ymm14,3 + vpaddq ymm7,ymm7,ymm13 + vpmuludq ymm0,ymm11,YMMWORD[((256-128))+rcx] + vpaddq ymm3,ymm3,ymm9 + vpaddq ymm8,ymm8,ymm0 + + mov rax,rbx + imul rax,QWORD[((-128))+rsi] + add r10,rax + vmovdqu ymm12,YMMWORD[((-8+32-128))+rsi] + mov rax,rbx + imul rax,QWORD[((8-128))+rsi] + add r11,rax + vmovdqu ymm13,YMMWORD[((-8+64-128))+rsi] + + mov rax,r10 + imul eax,r8d + and eax,0x1fffffff + + imul rbx,QWORD[((16-128))+rsi] + add r12,rbx + vpmuludq ymm12,ymm12,ymm10 + vmovd xmm11,eax + vmovdqu ymm0,YMMWORD[((-8+96-128))+rsi] + vpaddq ymm1,ymm1,ymm12 + vpmuludq ymm13,ymm13,ymm10 + vpbroadcastq ymm11,xmm11 + vmovdqu ymm12,YMMWORD[((-8+128-128))+rsi] + vpaddq ymm2,ymm2,ymm13 + vpmuludq ymm0,ymm0,ymm10 + vmovdqu ymm13,YMMWORD[((-8+160-128))+rsi] + vpaddq ymm3,ymm3,ymm0 + vpmuludq ymm12,ymm12,ymm10 + vmovdqu ymm0,YMMWORD[((-8+192-128))+rsi] + vpaddq ymm4,ymm4,ymm12 + vpmuludq ymm13,ymm13,ymm10 + vmovdqu ymm12,YMMWORD[((-8+224-128))+rsi] + vpaddq ymm5,ymm5,ymm13 + vpmuludq ymm0,ymm0,ymm10 + vmovdqu ymm13,YMMWORD[((-8+256-128))+rsi] + vpaddq ymm6,ymm6,ymm0 + vpmuludq ymm12,ymm12,ymm10 + vmovdqu ymm9,YMMWORD[((-8+288-128))+rsi] + vpaddq ymm7,ymm7,ymm12 + vpmuludq ymm13,ymm13,ymm10 + vpaddq ymm8,ymm8,ymm13 + vpmuludq ymm9,ymm9,ymm10 + vpbroadcastq ymm10,QWORD[16+r13] + + mov rdx,rax + imul rax,QWORD[((-128))+rcx] + add r10,rax + vmovdqu ymm0,YMMWORD[((-8+32-128))+rcx] + mov rax,rdx + imul rax,QWORD[((8-128))+rcx] + add r11,rax + vmovdqu ymm12,YMMWORD[((-8+64-128))+rcx] + shr r10,29 + imul rdx,QWORD[((16-128))+rcx] + add r12,rdx + add r11,r10 + + vpmuludq ymm0,ymm0,ymm11 + vmovq rbx,xmm10 + vmovdqu ymm13,YMMWORD[((-8+96-128))+rcx] + vpaddq ymm1,ymm1,ymm0 + vpmuludq ymm12,ymm12,ymm11 + vmovdqu ymm0,YMMWORD[((-8+128-128))+rcx] + vpaddq ymm2,ymm2,ymm12 + vpmuludq ymm13,ymm13,ymm11 + vmovdqu ymm12,YMMWORD[((-8+160-128))+rcx] + vpaddq ymm3,ymm3,ymm13 + vpmuludq ymm0,ymm0,ymm11 + vmovdqu ymm13,YMMWORD[((-8+192-128))+rcx] + vpaddq ymm4,ymm4,ymm0 + vpmuludq ymm12,ymm12,ymm11 + vmovdqu ymm0,YMMWORD[((-8+224-128))+rcx] + vpaddq ymm5,ymm5,ymm12 + vpmuludq ymm13,ymm13,ymm11 + vmovdqu ymm12,YMMWORD[((-8+256-128))+rcx] + vpaddq ymm6,ymm6,ymm13 + vpmuludq ymm0,ymm0,ymm11 + vmovdqu ymm13,YMMWORD[((-8+288-128))+rcx] + vpaddq ymm7,ymm7,ymm0 + vpmuludq ymm12,ymm12,ymm11 + vpaddq ymm8,ymm8,ymm12 + vpmuludq ymm13,ymm13,ymm11 + vpaddq ymm9,ymm9,ymm13 + + vmovdqu ymm0,YMMWORD[((-16+32-128))+rsi] + mov rax,rbx + imul rax,QWORD[((-128))+rsi] + add rax,r11 + + vmovdqu ymm12,YMMWORD[((-16+64-128))+rsi] + mov r11,rax + imul eax,r8d + and eax,0x1fffffff + + imul rbx,QWORD[((8-128))+rsi] + add r12,rbx + vpmuludq ymm0,ymm0,ymm10 + vmovd xmm11,eax + vmovdqu ymm13,YMMWORD[((-16+96-128))+rsi] + vpaddq ymm1,ymm1,ymm0 + vpmuludq ymm12,ymm12,ymm10 + vpbroadcastq ymm11,xmm11 + vmovdqu ymm0,YMMWORD[((-16+128-128))+rsi] + vpaddq ymm2,ymm2,ymm12 + vpmuludq ymm13,ymm13,ymm10 + vmovdqu ymm12,YMMWORD[((-16+160-128))+rsi] + vpaddq ymm3,ymm3,ymm13 + vpmuludq ymm0,ymm0,ymm10 + vmovdqu ymm13,YMMWORD[((-16+192-128))+rsi] + vpaddq ymm4,ymm4,ymm0 + 
vpmuludq ymm12,ymm12,ymm10 + vmovdqu ymm0,YMMWORD[((-16+224-128))+rsi] + vpaddq ymm5,ymm5,ymm12 + vpmuludq ymm13,ymm13,ymm10 + vmovdqu ymm12,YMMWORD[((-16+256-128))+rsi] + vpaddq ymm6,ymm6,ymm13 + vpmuludq ymm0,ymm0,ymm10 + vmovdqu ymm13,YMMWORD[((-16+288-128))+rsi] + vpaddq ymm7,ymm7,ymm0 + vpmuludq ymm12,ymm12,ymm10 + vpaddq ymm8,ymm8,ymm12 + vpmuludq ymm13,ymm13,ymm10 + vpbroadcastq ymm10,QWORD[24+r13] + vpaddq ymm9,ymm9,ymm13 + + vmovdqu ymm0,YMMWORD[((-16+32-128))+rcx] + mov rdx,rax + imul rax,QWORD[((-128))+rcx] + add r11,rax + vmovdqu ymm12,YMMWORD[((-16+64-128))+rcx] + imul rdx,QWORD[((8-128))+rcx] + add r12,rdx + shr r11,29 + + vpmuludq ymm0,ymm0,ymm11 + vmovq rbx,xmm10 + vmovdqu ymm13,YMMWORD[((-16+96-128))+rcx] + vpaddq ymm1,ymm1,ymm0 + vpmuludq ymm12,ymm12,ymm11 + vmovdqu ymm0,YMMWORD[((-16+128-128))+rcx] + vpaddq ymm2,ymm2,ymm12 + vpmuludq ymm13,ymm13,ymm11 + vmovdqu ymm12,YMMWORD[((-16+160-128))+rcx] + vpaddq ymm3,ymm3,ymm13 + vpmuludq ymm0,ymm0,ymm11 + vmovdqu ymm13,YMMWORD[((-16+192-128))+rcx] + vpaddq ymm4,ymm4,ymm0 + vpmuludq ymm12,ymm12,ymm11 + vmovdqu ymm0,YMMWORD[((-16+224-128))+rcx] + vpaddq ymm5,ymm5,ymm12 + vpmuludq ymm13,ymm13,ymm11 + vmovdqu ymm12,YMMWORD[((-16+256-128))+rcx] + vpaddq ymm6,ymm6,ymm13 + vpmuludq ymm0,ymm0,ymm11 + vmovdqu ymm13,YMMWORD[((-16+288-128))+rcx] + vpaddq ymm7,ymm7,ymm0 + vpmuludq ymm12,ymm12,ymm11 + vmovdqu ymm0,YMMWORD[((-24+32-128))+rsi] + vpaddq ymm8,ymm8,ymm12 + vpmuludq ymm13,ymm13,ymm11 + vmovdqu ymm12,YMMWORD[((-24+64-128))+rsi] + vpaddq ymm9,ymm9,ymm13 + + add r12,r11 + imul rbx,QWORD[((-128))+rsi] + add r12,rbx + + mov rax,r12 + imul eax,r8d + and eax,0x1fffffff + + vpmuludq ymm0,ymm0,ymm10 + vmovd xmm11,eax + vmovdqu ymm13,YMMWORD[((-24+96-128))+rsi] + vpaddq ymm1,ymm1,ymm0 + vpmuludq ymm12,ymm12,ymm10 + vpbroadcastq ymm11,xmm11 + vmovdqu ymm0,YMMWORD[((-24+128-128))+rsi] + vpaddq ymm2,ymm2,ymm12 + vpmuludq ymm13,ymm13,ymm10 + vmovdqu ymm12,YMMWORD[((-24+160-128))+rsi] + vpaddq ymm3,ymm3,ymm13 + vpmuludq ymm0,ymm0,ymm10 + vmovdqu ymm13,YMMWORD[((-24+192-128))+rsi] + vpaddq ymm4,ymm4,ymm0 + vpmuludq ymm12,ymm12,ymm10 + vmovdqu ymm0,YMMWORD[((-24+224-128))+rsi] + vpaddq ymm5,ymm5,ymm12 + vpmuludq ymm13,ymm13,ymm10 + vmovdqu ymm12,YMMWORD[((-24+256-128))+rsi] + vpaddq ymm6,ymm6,ymm13 + vpmuludq ymm0,ymm0,ymm10 + vmovdqu ymm13,YMMWORD[((-24+288-128))+rsi] + vpaddq ymm7,ymm7,ymm0 + vpmuludq ymm12,ymm12,ymm10 + vpaddq ymm8,ymm8,ymm12 + vpmuludq ymm13,ymm13,ymm10 + vpbroadcastq ymm10,QWORD[32+r13] + vpaddq ymm9,ymm9,ymm13 + add r13,32 + + vmovdqu ymm0,YMMWORD[((-24+32-128))+rcx] + imul rax,QWORD[((-128))+rcx] + add r12,rax + shr r12,29 + + vmovdqu ymm12,YMMWORD[((-24+64-128))+rcx] + vpmuludq ymm0,ymm0,ymm11 + vmovq rbx,xmm10 + vmovdqu ymm13,YMMWORD[((-24+96-128))+rcx] + vpaddq ymm0,ymm1,ymm0 + vpmuludq ymm12,ymm12,ymm11 + vmovdqu YMMWORD[rsp],ymm0 + vpaddq ymm1,ymm2,ymm12 + vmovdqu ymm0,YMMWORD[((-24+128-128))+rcx] + vpmuludq ymm13,ymm13,ymm11 + vmovdqu ymm12,YMMWORD[((-24+160-128))+rcx] + vpaddq ymm2,ymm3,ymm13 + vpmuludq ymm0,ymm0,ymm11 + vmovdqu ymm13,YMMWORD[((-24+192-128))+rcx] + vpaddq ymm3,ymm4,ymm0 + vpmuludq ymm12,ymm12,ymm11 + vmovdqu ymm0,YMMWORD[((-24+224-128))+rcx] + vpaddq ymm4,ymm5,ymm12 + vpmuludq ymm13,ymm13,ymm11 + vmovdqu ymm12,YMMWORD[((-24+256-128))+rcx] + vpaddq ymm5,ymm6,ymm13 + vpmuludq ymm0,ymm0,ymm11 + vmovdqu ymm13,YMMWORD[((-24+288-128))+rcx] + mov r9,r12 + vpaddq ymm6,ymm7,ymm0 + vpmuludq ymm12,ymm12,ymm11 + add r9,QWORD[rsp] + vpaddq ymm7,ymm8,ymm12 + vpmuludq ymm13,ymm13,ymm11 + vmovq xmm12,r12 + vpaddq 
ymm8,ymm9,ymm13 + + dec r14d + jnz NEAR $L$oop_mul_1024 + vpermq ymm15,ymm15,0 + vpaddq ymm0,ymm12,YMMWORD[rsp] + + vpsrlq ymm12,ymm0,29 + vpand ymm0,ymm0,ymm15 + vpsrlq ymm13,ymm1,29 + vpand ymm1,ymm1,ymm15 + vpsrlq ymm10,ymm2,29 + vpermq ymm12,ymm12,0x93 + vpand ymm2,ymm2,ymm15 + vpsrlq ymm11,ymm3,29 + vpermq ymm13,ymm13,0x93 + vpand ymm3,ymm3,ymm15 + + vpblendd ymm9,ymm12,ymm14,3 + vpermq ymm10,ymm10,0x93 + vpblendd ymm12,ymm13,ymm12,3 + vpermq ymm11,ymm11,0x93 + vpaddq ymm0,ymm0,ymm9 + vpblendd ymm13,ymm10,ymm13,3 + vpaddq ymm1,ymm1,ymm12 + vpblendd ymm10,ymm11,ymm10,3 + vpaddq ymm2,ymm2,ymm13 + vpblendd ymm11,ymm14,ymm11,3 + vpaddq ymm3,ymm3,ymm10 + vpaddq ymm4,ymm4,ymm11 + + vpsrlq ymm12,ymm0,29 + vpand ymm0,ymm0,ymm15 + vpsrlq ymm13,ymm1,29 + vpand ymm1,ymm1,ymm15 + vpsrlq ymm10,ymm2,29 + vpermq ymm12,ymm12,0x93 + vpand ymm2,ymm2,ymm15 + vpsrlq ymm11,ymm3,29 + vpermq ymm13,ymm13,0x93 + vpand ymm3,ymm3,ymm15 + vpermq ymm10,ymm10,0x93 + + vpblendd ymm9,ymm12,ymm14,3 + vpermq ymm11,ymm11,0x93 + vpblendd ymm12,ymm13,ymm12,3 + vpaddq ymm0,ymm0,ymm9 + vpblendd ymm13,ymm10,ymm13,3 + vpaddq ymm1,ymm1,ymm12 + vpblendd ymm10,ymm11,ymm10,3 + vpaddq ymm2,ymm2,ymm13 + vpblendd ymm11,ymm14,ymm11,3 + vpaddq ymm3,ymm3,ymm10 + vpaddq ymm4,ymm4,ymm11 + + vmovdqu YMMWORD[(0-128)+rdi],ymm0 + vmovdqu YMMWORD[(32-128)+rdi],ymm1 + vmovdqu YMMWORD[(64-128)+rdi],ymm2 + vmovdqu YMMWORD[(96-128)+rdi],ymm3 + vpsrlq ymm12,ymm4,29 + vpand ymm4,ymm4,ymm15 + vpsrlq ymm13,ymm5,29 + vpand ymm5,ymm5,ymm15 + vpsrlq ymm10,ymm6,29 + vpermq ymm12,ymm12,0x93 + vpand ymm6,ymm6,ymm15 + vpsrlq ymm11,ymm7,29 + vpermq ymm13,ymm13,0x93 + vpand ymm7,ymm7,ymm15 + vpsrlq ymm0,ymm8,29 + vpermq ymm10,ymm10,0x93 + vpand ymm8,ymm8,ymm15 + vpermq ymm11,ymm11,0x93 + + vpblendd ymm9,ymm12,ymm14,3 + vpermq ymm0,ymm0,0x93 + vpblendd ymm12,ymm13,ymm12,3 + vpaddq ymm4,ymm4,ymm9 + vpblendd ymm13,ymm10,ymm13,3 + vpaddq ymm5,ymm5,ymm12 + vpblendd ymm10,ymm11,ymm10,3 + vpaddq ymm6,ymm6,ymm13 + vpblendd ymm11,ymm0,ymm11,3 + vpaddq ymm7,ymm7,ymm10 + vpaddq ymm8,ymm8,ymm11 + + vpsrlq ymm12,ymm4,29 + vpand ymm4,ymm4,ymm15 + vpsrlq ymm13,ymm5,29 + vpand ymm5,ymm5,ymm15 + vpsrlq ymm10,ymm6,29 + vpermq ymm12,ymm12,0x93 + vpand ymm6,ymm6,ymm15 + vpsrlq ymm11,ymm7,29 + vpermq ymm13,ymm13,0x93 + vpand ymm7,ymm7,ymm15 + vpsrlq ymm0,ymm8,29 + vpermq ymm10,ymm10,0x93 + vpand ymm8,ymm8,ymm15 + vpermq ymm11,ymm11,0x93 + + vpblendd ymm9,ymm12,ymm14,3 + vpermq ymm0,ymm0,0x93 + vpblendd ymm12,ymm13,ymm12,3 + vpaddq ymm4,ymm4,ymm9 + vpblendd ymm13,ymm10,ymm13,3 + vpaddq ymm5,ymm5,ymm12 + vpblendd ymm10,ymm11,ymm10,3 + vpaddq ymm6,ymm6,ymm13 + vpblendd ymm11,ymm0,ymm11,3 + vpaddq ymm7,ymm7,ymm10 + vpaddq ymm8,ymm8,ymm11 + + vmovdqu YMMWORD[(128-128)+rdi],ymm4 + vmovdqu YMMWORD[(160-128)+rdi],ymm5 + vmovdqu YMMWORD[(192-128)+rdi],ymm6 + vmovdqu YMMWORD[(224-128)+rdi],ymm7 + vmovdqu YMMWORD[(256-128)+rdi],ymm8 + vzeroupper + + mov rax,rbp + +$L$mul_1024_in_tail: + movaps xmm6,XMMWORD[((-216))+rax] + movaps xmm7,XMMWORD[((-200))+rax] + movaps xmm8,XMMWORD[((-184))+rax] + movaps xmm9,XMMWORD[((-168))+rax] + movaps xmm10,XMMWORD[((-152))+rax] + movaps xmm11,XMMWORD[((-136))+rax] + movaps xmm12,XMMWORD[((-120))+rax] + movaps xmm13,XMMWORD[((-104))+rax] + movaps xmm14,XMMWORD[((-88))+rax] + movaps xmm15,XMMWORD[((-72))+rax] + mov r15,QWORD[((-48))+rax] + + mov r14,QWORD[((-40))+rax] + + mov r13,QWORD[((-32))+rax] + + mov r12,QWORD[((-24))+rax] + + mov rbp,QWORD[((-16))+rax] + + mov rbx,QWORD[((-8))+rax] + + lea rsp,[rax] + +$L$mul_1024_epilogue: + mov rdi,QWORD[8+rsp] 
;WIN64 epilogue + mov rsi,QWORD[16+rsp] + DB 0F3h,0C3h ;repret + +$L$SEH_end_rsaz_1024_mul_avx2: +global rsaz_1024_red2norm_avx2 + +ALIGN 32 +rsaz_1024_red2norm_avx2: + sub rdx,-128 + xor rax,rax + mov r8,QWORD[((-128))+rdx] + mov r9,QWORD[((-120))+rdx] + mov r10,QWORD[((-112))+rdx] + shl r8,0 + shl r9,29 + mov r11,r10 + shl r10,58 + shr r11,6 + add rax,r8 + add rax,r9 + add rax,r10 + adc r11,0 + mov QWORD[rcx],rax + mov rax,r11 + mov r8,QWORD[((-104))+rdx] + mov r9,QWORD[((-96))+rdx] + shl r8,23 + mov r10,r9 + shl r9,52 + shr r10,12 + add rax,r8 + add rax,r9 + adc r10,0 + mov QWORD[8+rcx],rax + mov rax,r10 + mov r11,QWORD[((-88))+rdx] + mov r8,QWORD[((-80))+rdx] + shl r11,17 + mov r9,r8 + shl r8,46 + shr r9,18 + add rax,r11 + add rax,r8 + adc r9,0 + mov QWORD[16+rcx],rax + mov rax,r9 + mov r10,QWORD[((-72))+rdx] + mov r11,QWORD[((-64))+rdx] + shl r10,11 + mov r8,r11 + shl r11,40 + shr r8,24 + add rax,r10 + add rax,r11 + adc r8,0 + mov QWORD[24+rcx],rax + mov rax,r8 + mov r9,QWORD[((-56))+rdx] + mov r10,QWORD[((-48))+rdx] + mov r11,QWORD[((-40))+rdx] + shl r9,5 + shl r10,34 + mov r8,r11 + shl r11,63 + shr r8,1 + add rax,r9 + add rax,r10 + add rax,r11 + adc r8,0 + mov QWORD[32+rcx],rax + mov rax,r8 + mov r9,QWORD[((-32))+rdx] + mov r10,QWORD[((-24))+rdx] + shl r9,28 + mov r11,r10 + shl r10,57 + shr r11,7 + add rax,r9 + add rax,r10 + adc r11,0 + mov QWORD[40+rcx],rax + mov rax,r11 + mov r8,QWORD[((-16))+rdx] + mov r9,QWORD[((-8))+rdx] + shl r8,22 + mov r10,r9 + shl r9,51 + shr r10,13 + add rax,r8 + add rax,r9 + adc r10,0 + mov QWORD[48+rcx],rax + mov rax,r10 + mov r11,QWORD[rdx] + mov r8,QWORD[8+rdx] + shl r11,16 + mov r9,r8 + shl r8,45 + shr r9,19 + add rax,r11 + add rax,r8 + adc r9,0 + mov QWORD[56+rcx],rax + mov rax,r9 + mov r10,QWORD[16+rdx] + mov r11,QWORD[24+rdx] + shl r10,10 + mov r8,r11 + shl r11,39 + shr r8,25 + add rax,r10 + add rax,r11 + adc r8,0 + mov QWORD[64+rcx],rax + mov rax,r8 + mov r9,QWORD[32+rdx] + mov r10,QWORD[40+rdx] + mov r11,QWORD[48+rdx] + shl r9,4 + shl r10,33 + mov r8,r11 + shl r11,62 + shr r8,2 + add rax,r9 + add rax,r10 + add rax,r11 + adc r8,0 + mov QWORD[72+rcx],rax + mov rax,r8 + mov r9,QWORD[56+rdx] + mov r10,QWORD[64+rdx] + shl r9,27 + mov r11,r10 + shl r10,56 + shr r11,8 + add rax,r9 + add rax,r10 + adc r11,0 + mov QWORD[80+rcx],rax + mov rax,r11 + mov r8,QWORD[72+rdx] + mov r9,QWORD[80+rdx] + shl r8,21 + mov r10,r9 + shl r9,50 + shr r10,14 + add rax,r8 + add rax,r9 + adc r10,0 + mov QWORD[88+rcx],rax + mov rax,r10 + mov r11,QWORD[88+rdx] + mov r8,QWORD[96+rdx] + shl r11,15 + mov r9,r8 + shl r8,44 + shr r9,20 + add rax,r11 + add rax,r8 + adc r9,0 + mov QWORD[96+rcx],rax + mov rax,r9 + mov r10,QWORD[104+rdx] + mov r11,QWORD[112+rdx] + shl r10,9 + mov r8,r11 + shl r11,38 + shr r8,26 + add rax,r10 + add rax,r11 + adc r8,0 + mov QWORD[104+rcx],rax + mov rax,r8 + mov r9,QWORD[120+rdx] + mov r10,QWORD[128+rdx] + mov r11,QWORD[136+rdx] + shl r9,3 + shl r10,32 + mov r8,r11 + shl r11,61 + shr r8,3 + add rax,r9 + add rax,r10 + add rax,r11 + adc r8,0 + mov QWORD[112+rcx],rax + mov rax,r8 + mov r9,QWORD[144+rdx] + mov r10,QWORD[152+rdx] + shl r9,26 + mov r11,r10 + shl r10,55 + shr r11,9 + add rax,r9 + add rax,r10 + adc r11,0 + mov QWORD[120+rcx],rax + mov rax,r11 + DB 0F3h,0C3h ;repret + + +global rsaz_1024_norm2red_avx2 + +ALIGN 32 +rsaz_1024_norm2red_avx2: + sub rcx,-128 + mov r8,QWORD[rdx] + mov eax,0x1fffffff + mov r9,QWORD[8+rdx] + mov r11,r8 + shr r11,0 + and r11,rax + mov QWORD[((-128))+rcx],r11 + mov r10,r8 + shr r10,29 + and r10,rax + mov 
QWORD[((-120))+rcx],r10 + shrd r8,r9,58 + and r8,rax + mov QWORD[((-112))+rcx],r8 + mov r10,QWORD[16+rdx] + mov r8,r9 + shr r8,23 + and r8,rax + mov QWORD[((-104))+rcx],r8 + shrd r9,r10,52 + and r9,rax + mov QWORD[((-96))+rcx],r9 + mov r11,QWORD[24+rdx] + mov r9,r10 + shr r9,17 + and r9,rax + mov QWORD[((-88))+rcx],r9 + shrd r10,r11,46 + and r10,rax + mov QWORD[((-80))+rcx],r10 + mov r8,QWORD[32+rdx] + mov r10,r11 + shr r10,11 + and r10,rax + mov QWORD[((-72))+rcx],r10 + shrd r11,r8,40 + and r11,rax + mov QWORD[((-64))+rcx],r11 + mov r9,QWORD[40+rdx] + mov r11,r8 + shr r11,5 + and r11,rax + mov QWORD[((-56))+rcx],r11 + mov r10,r8 + shr r10,34 + and r10,rax + mov QWORD[((-48))+rcx],r10 + shrd r8,r9,63 + and r8,rax + mov QWORD[((-40))+rcx],r8 + mov r10,QWORD[48+rdx] + mov r8,r9 + shr r8,28 + and r8,rax + mov QWORD[((-32))+rcx],r8 + shrd r9,r10,57 + and r9,rax + mov QWORD[((-24))+rcx],r9 + mov r11,QWORD[56+rdx] + mov r9,r10 + shr r9,22 + and r9,rax + mov QWORD[((-16))+rcx],r9 + shrd r10,r11,51 + and r10,rax + mov QWORD[((-8))+rcx],r10 + mov r8,QWORD[64+rdx] + mov r10,r11 + shr r10,16 + and r10,rax + mov QWORD[rcx],r10 + shrd r11,r8,45 + and r11,rax + mov QWORD[8+rcx],r11 + mov r9,QWORD[72+rdx] + mov r11,r8 + shr r11,10 + and r11,rax + mov QWORD[16+rcx],r11 + shrd r8,r9,39 + and r8,rax + mov QWORD[24+rcx],r8 + mov r10,QWORD[80+rdx] + mov r8,r9 + shr r8,4 + and r8,rax + mov QWORD[32+rcx],r8 + mov r11,r9 + shr r11,33 + and r11,rax + mov QWORD[40+rcx],r11 + shrd r9,r10,62 + and r9,rax + mov QWORD[48+rcx],r9 + mov r11,QWORD[88+rdx] + mov r9,r10 + shr r9,27 + and r9,rax + mov QWORD[56+rcx],r9 + shrd r10,r11,56 + and r10,rax + mov QWORD[64+rcx],r10 + mov r8,QWORD[96+rdx] + mov r10,r11 + shr r10,21 + and r10,rax + mov QWORD[72+rcx],r10 + shrd r11,r8,50 + and r11,rax + mov QWORD[80+rcx],r11 + mov r9,QWORD[104+rdx] + mov r11,r8 + shr r11,15 + and r11,rax + mov QWORD[88+rcx],r11 + shrd r8,r9,44 + and r8,rax + mov QWORD[96+rcx],r8 + mov r10,QWORD[112+rdx] + mov r8,r9 + shr r8,9 + and r8,rax + mov QWORD[104+rcx],r8 + shrd r9,r10,38 + and r9,rax + mov QWORD[112+rcx],r9 + mov r11,QWORD[120+rdx] + mov r9,r10 + shr r9,3 + and r9,rax + mov QWORD[120+rcx],r9 + mov r8,r10 + shr r8,32 + and r8,rax + mov QWORD[128+rcx],r8 + shrd r10,r11,61 + and r10,rax + mov QWORD[136+rcx],r10 + xor r8,r8 + mov r10,r11 + shr r10,26 + and r10,rax + mov QWORD[144+rcx],r10 + shrd r11,r8,55 + and r11,rax + mov QWORD[152+rcx],r11 + mov QWORD[160+rcx],r8 + mov QWORD[168+rcx],r8 + mov QWORD[176+rcx],r8 + mov QWORD[184+rcx],r8 + DB 0F3h,0C3h ;repret + +global rsaz_1024_scatter5_avx2 + +ALIGN 32 +rsaz_1024_scatter5_avx2: + vzeroupper + vmovdqu ymm5,YMMWORD[$L$scatter_permd] + shl r8d,4 + lea rcx,[r8*1+rcx] + mov eax,9 + jmp NEAR $L$oop_scatter_1024 + +ALIGN 32 +$L$oop_scatter_1024: + vmovdqu ymm0,YMMWORD[rdx] + lea rdx,[32+rdx] + vpermd ymm0,ymm5,ymm0 + vmovdqu XMMWORD[rcx],xmm0 + lea rcx,[512+rcx] + dec eax + jnz NEAR $L$oop_scatter_1024 + + vzeroupper + DB 0F3h,0C3h ;repret + + +global rsaz_1024_gather5_avx2 + +ALIGN 32 +rsaz_1024_gather5_avx2: + + vzeroupper + mov r11,rsp + + lea rax,[((-136))+rsp] +$L$SEH_begin_rsaz_1024_gather5: + +DB 0x48,0x8d,0x60,0xe0 +DB 0xc5,0xf8,0x29,0x70,0xe0 +DB 0xc5,0xf8,0x29,0x78,0xf0 +DB 0xc5,0x78,0x29,0x40,0x00 +DB 0xc5,0x78,0x29,0x48,0x10 +DB 0xc5,0x78,0x29,0x50,0x20 +DB 0xc5,0x78,0x29,0x58,0x30 +DB 0xc5,0x78,0x29,0x60,0x40 +DB 0xc5,0x78,0x29,0x68,0x50 +DB 0xc5,0x78,0x29,0x70,0x60 +DB 0xc5,0x78,0x29,0x78,0x70 + lea rsp,[((-256))+rsp] + and rsp,-32 + lea r10,[$L$inc] + lea rax,[((-128))+rsp] + + vmovd 
xmm4,r8d + vmovdqa ymm0,YMMWORD[r10] + vmovdqa ymm1,YMMWORD[32+r10] + vmovdqa ymm5,YMMWORD[64+r10] + vpbroadcastd ymm4,xmm4 + + vpaddd ymm2,ymm0,ymm5 + vpcmpeqd ymm0,ymm0,ymm4 + vpaddd ymm3,ymm1,ymm5 + vpcmpeqd ymm1,ymm1,ymm4 + vmovdqa YMMWORD[(0+128)+rax],ymm0 + vpaddd ymm0,ymm2,ymm5 + vpcmpeqd ymm2,ymm2,ymm4 + vmovdqa YMMWORD[(32+128)+rax],ymm1 + vpaddd ymm1,ymm3,ymm5 + vpcmpeqd ymm3,ymm3,ymm4 + vmovdqa YMMWORD[(64+128)+rax],ymm2 + vpaddd ymm2,ymm0,ymm5 + vpcmpeqd ymm0,ymm0,ymm4 + vmovdqa YMMWORD[(96+128)+rax],ymm3 + vpaddd ymm3,ymm1,ymm5 + vpcmpeqd ymm1,ymm1,ymm4 + vmovdqa YMMWORD[(128+128)+rax],ymm0 + vpaddd ymm8,ymm2,ymm5 + vpcmpeqd ymm2,ymm2,ymm4 + vmovdqa YMMWORD[(160+128)+rax],ymm1 + vpaddd ymm9,ymm3,ymm5 + vpcmpeqd ymm3,ymm3,ymm4 + vmovdqa YMMWORD[(192+128)+rax],ymm2 + vpaddd ymm10,ymm8,ymm5 + vpcmpeqd ymm8,ymm8,ymm4 + vmovdqa YMMWORD[(224+128)+rax],ymm3 + vpaddd ymm11,ymm9,ymm5 + vpcmpeqd ymm9,ymm9,ymm4 + vpaddd ymm12,ymm10,ymm5 + vpcmpeqd ymm10,ymm10,ymm4 + vpaddd ymm13,ymm11,ymm5 + vpcmpeqd ymm11,ymm11,ymm4 + vpaddd ymm14,ymm12,ymm5 + vpcmpeqd ymm12,ymm12,ymm4 + vpaddd ymm15,ymm13,ymm5 + vpcmpeqd ymm13,ymm13,ymm4 + vpcmpeqd ymm14,ymm14,ymm4 + vpcmpeqd ymm15,ymm15,ymm4 + + vmovdqa ymm7,YMMWORD[((-32))+r10] + lea rdx,[128+rdx] + mov r8d,9 + +$L$oop_gather_1024: + vmovdqa ymm0,YMMWORD[((0-128))+rdx] + vmovdqa ymm1,YMMWORD[((32-128))+rdx] + vmovdqa ymm2,YMMWORD[((64-128))+rdx] + vmovdqa ymm3,YMMWORD[((96-128))+rdx] + vpand ymm0,ymm0,YMMWORD[((0+128))+rax] + vpand ymm1,ymm1,YMMWORD[((32+128))+rax] + vpand ymm2,ymm2,YMMWORD[((64+128))+rax] + vpor ymm4,ymm1,ymm0 + vpand ymm3,ymm3,YMMWORD[((96+128))+rax] + vmovdqa ymm0,YMMWORD[((128-128))+rdx] + vmovdqa ymm1,YMMWORD[((160-128))+rdx] + vpor ymm5,ymm3,ymm2 + vmovdqa ymm2,YMMWORD[((192-128))+rdx] + vmovdqa ymm3,YMMWORD[((224-128))+rdx] + vpand ymm0,ymm0,YMMWORD[((128+128))+rax] + vpand ymm1,ymm1,YMMWORD[((160+128))+rax] + vpand ymm2,ymm2,YMMWORD[((192+128))+rax] + vpor ymm4,ymm4,ymm0 + vpand ymm3,ymm3,YMMWORD[((224+128))+rax] + vpand ymm0,ymm8,YMMWORD[((256-128))+rdx] + vpor ymm5,ymm5,ymm1 + vpand ymm1,ymm9,YMMWORD[((288-128))+rdx] + vpor ymm4,ymm4,ymm2 + vpand ymm2,ymm10,YMMWORD[((320-128))+rdx] + vpor ymm5,ymm5,ymm3 + vpand ymm3,ymm11,YMMWORD[((352-128))+rdx] + vpor ymm4,ymm4,ymm0 + vpand ymm0,ymm12,YMMWORD[((384-128))+rdx] + vpor ymm5,ymm5,ymm1 + vpand ymm1,ymm13,YMMWORD[((416-128))+rdx] + vpor ymm4,ymm4,ymm2 + vpand ymm2,ymm14,YMMWORD[((448-128))+rdx] + vpor ymm5,ymm5,ymm3 + vpand ymm3,ymm15,YMMWORD[((480-128))+rdx] + lea rdx,[512+rdx] + vpor ymm4,ymm4,ymm0 + vpor ymm5,ymm5,ymm1 + vpor ymm4,ymm4,ymm2 + vpor ymm5,ymm5,ymm3 + + vpor ymm4,ymm4,ymm5 + vextracti128 xmm5,ymm4,1 + vpor xmm5,xmm5,xmm4 + vpermd ymm5,ymm7,ymm5 + vmovdqu YMMWORD[rcx],ymm5 + lea rcx,[32+rcx] + dec r8d + jnz NEAR $L$oop_gather_1024 + + vpxor ymm0,ymm0,ymm0 + vmovdqu YMMWORD[rcx],ymm0 + vzeroupper + movaps xmm6,XMMWORD[((-168))+r11] + movaps xmm7,XMMWORD[((-152))+r11] + movaps xmm8,XMMWORD[((-136))+r11] + movaps xmm9,XMMWORD[((-120))+r11] + movaps xmm10,XMMWORD[((-104))+r11] + movaps xmm11,XMMWORD[((-88))+r11] + movaps xmm12,XMMWORD[((-72))+r11] + movaps xmm13,XMMWORD[((-56))+r11] + movaps xmm14,XMMWORD[((-40))+r11] + movaps xmm15,XMMWORD[((-24))+r11] + lea rsp,[r11] + + DB 0F3h,0C3h ;repret + +$L$SEH_end_rsaz_1024_gather5: + +EXTERN OPENSSL_ia32cap_P +global rsaz_avx2_eligible + +ALIGN 32 +rsaz_avx2_eligible: + lea rax,[OPENSSL_ia32cap_P] + mov eax,DWORD[8+rax] + and eax,32 + shr eax,5 + DB 0F3h,0C3h ;repret + + +ALIGN 64 +$L$and_mask: + DQ 
0x1fffffff,0x1fffffff,0x1fffffff,-1 +$L$scatter_permd: + DD 0,2,4,6,7,7,7,7 +$L$gather_permd: + DD 0,7,1,7,2,7,3,7 +$L$inc: + DD 0,0,0,0,1,1,1,1 + DD 2,2,2,2,3,3,3,3 + DD 4,4,4,4,4,4,4,4 +ALIGN 64 +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +rsaz_se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + mov rbp,QWORD[160+r8] + + mov r10d,DWORD[8+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + cmovc rax,rbp + + mov r15,QWORD[((-48))+rax] + mov r14,QWORD[((-40))+rax] + mov r13,QWORD[((-32))+rax] + mov r12,QWORD[((-24))+rax] + mov rbp,QWORD[((-16))+rax] + mov rbx,QWORD[((-8))+rax] + mov QWORD[240+r8],r15 + mov QWORD[232+r8],r14 + mov QWORD[224+r8],r13 + mov QWORD[216+r8],r12 + mov QWORD[160+r8],rbp + mov QWORD[144+r8],rbx + + lea rsi,[((-216))+rax] + lea rdi,[512+r8] + mov ecx,20 + DD 0xa548f3fc + +$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + DB 0F3h,0C3h ;repret + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_rsaz_1024_sqr_avx2 wrt ..imagebase + DD $L$SEH_end_rsaz_1024_sqr_avx2 wrt ..imagebase + DD $L$SEH_info_rsaz_1024_sqr_avx2 wrt ..imagebase + + DD $L$SEH_begin_rsaz_1024_mul_avx2 wrt ..imagebase + DD $L$SEH_end_rsaz_1024_mul_avx2 wrt ..imagebase + DD $L$SEH_info_rsaz_1024_mul_avx2 wrt ..imagebase + + DD $L$SEH_begin_rsaz_1024_gather5 wrt ..imagebase + DD $L$SEH_end_rsaz_1024_gather5 wrt ..imagebase + DD $L$SEH_info_rsaz_1024_gather5 wrt ..imagebase +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_rsaz_1024_sqr_avx2: +DB 9,0,0,0 + DD rsaz_se_handler wrt ..imagebase + DD $L$sqr_1024_body wrt ..imagebase,$L$sqr_1024_epilogue wrt ..imagebase,$L$sqr_1024_in_tail wrt ..imagebase + DD 0 +$L$SEH_info_rsaz_1024_mul_avx2: +DB 9,0,0,0 + DD rsaz_se_handler wrt ..imagebase + DD $L$mul_1024_body wrt ..imagebase,$L$mul_1024_epilogue wrt ..imagebase,$L$mul_1024_in_tail wrt ..imagebase + DD 0 +$L$SEH_info_rsaz_1024_gather5: +DB 0x01,0x36,0x17,0x0b +DB 0x36,0xf8,0x09,0x00 +DB 0x31,0xe8,0x08,0x00 +DB 0x2c,0xd8,0x07,0x00 +DB 0x27,0xc8,0x06,0x00 +DB 0x22,0xb8,0x05,0x00 +DB 0x1d,0xa8,0x04,0x00 +DB 0x18,0x98,0x03,0x00 +DB 0x13,0x88,0x02,0x00 +DB 0x0e,0x78,0x01,0x00 +DB 0x09,0x68,0x00,0x00 +DB 0x04,0x01,0x15,0x00 +DB 0x00,0xb3,0x00,0x00 diff --git a/packager/third_party/boringssl/win-x86_64/crypto/sha/sha1-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/sha1-x86_64.asm similarity index 96% rename from packager/third_party/boringssl/win-x86_64/crypto/sha/sha1-x86_64.asm rename to packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/sha1-x86_64.asm index 168f78db3f..65b040fb43 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/sha/sha1-x86_64.asm +++ 
b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/sha1-x86_64.asm @@ -19,9 +19,10 @@ $L$SEH_begin_sha1_block_data_order: mov rdx,r8 - mov r9d,DWORD[((OPENSSL_ia32cap_P+0))] - mov r8d,DWORD[((OPENSSL_ia32cap_P+4))] - mov r10d,DWORD[((OPENSSL_ia32cap_P+8))] + lea r10,[OPENSSL_ia32cap_P] + mov r9d,DWORD[r10] + mov r8d,DWORD[4+r10] + mov r10d,DWORD[8+r10] test r8d,512 jz NEAR $L$ialu and r8d,268435456 @@ -1263,21 +1264,20 @@ $L$SEH_begin_sha1_block_data_order_ssse3: _ssse3_shortcut: - mov rax,rsp + mov r11,rsp push rbx push rbp push r12 push r13 push r14 lea rsp,[((-160))+rsp] - movaps XMMWORD[(-40-96)+rax],xmm6 - movaps XMMWORD[(-40-80)+rax],xmm7 - movaps XMMWORD[(-40-64)+rax],xmm8 - movaps XMMWORD[(-40-48)+rax],xmm9 - movaps XMMWORD[(-40-32)+rax],xmm10 - movaps XMMWORD[(-40-16)+rax],xmm11 + movaps XMMWORD[(-40-96)+r11],xmm6 + movaps XMMWORD[(-40-80)+r11],xmm7 + movaps XMMWORD[(-40-64)+r11],xmm8 + movaps XMMWORD[(-40-48)+r11],xmm9 + movaps XMMWORD[(-40-32)+r11],xmm10 + movaps XMMWORD[(-40-16)+r11],xmm11 $L$prologue_ssse3: - mov r14,rax and rsp,-64 mov r8,rdi mov r9,rsi @@ -1285,7 +1285,7 @@ $L$prologue_ssse3: shl r10,6 add r10,r9 - lea r11,[((K_XX_XX+64))] + lea r14,[((K_XX_XX+64))] mov eax,DWORD[r8] mov ebx,DWORD[4+r8] @@ -1297,8 +1297,8 @@ $L$prologue_ssse3: xor edi,edx and esi,edi - movdqa xmm6,XMMWORD[64+r11] - movdqa xmm9,XMMWORD[((-64))+r11] + movdqa xmm6,XMMWORD[64+r14] + movdqa xmm9,XMMWORD[((-64))+r14] movdqu xmm0,XMMWORD[r9] movdqu xmm1,XMMWORD[16+r9] movdqu xmm2,XMMWORD[32+r9] @@ -1374,7 +1374,7 @@ $L$oop_ssse3: pslld xmm9,2 pxor xmm4,xmm10 xor edx,ebp - movdqa xmm10,XMMWORD[((-64))+r11] + movdqa xmm10,XMMWORD[((-64))+r14] rol ecx,5 add ebx,edi and esi,edx @@ -1435,7 +1435,7 @@ $L$oop_ssse3: pslld xmm10,2 pxor xmm5,xmm8 xor ebp,eax - movdqa xmm8,XMMWORD[((-32))+r11] + movdqa xmm8,XMMWORD[((-32))+r14] rol edx,5 add ecx,edi and esi,ebp @@ -1496,7 +1496,7 @@ $L$oop_ssse3: pslld xmm8,2 pxor xmm6,xmm9 xor eax,ebx - movdqa xmm9,XMMWORD[((-32))+r11] + movdqa xmm9,XMMWORD[((-32))+r14] rol ebp,5 add edx,edi and esi,eax @@ -1557,7 +1557,7 @@ $L$oop_ssse3: pslld xmm9,2 pxor xmm7,xmm10 xor ebx,ecx - movdqa xmm10,XMMWORD[((-32))+r11] + movdqa xmm10,XMMWORD[((-32))+r14] rol eax,5 add ebp,edi and esi,ebx @@ -1668,7 +1668,7 @@ $L$oop_ssse3: pxor xmm2,xmm3 add eax,esi xor edi,edx - movdqa xmm10,XMMWORD[r11] + movdqa xmm10,XMMWORD[r14] ror ecx,7 paddd xmm9,xmm1 add eax,ebx @@ -1903,7 +1903,7 @@ $L$oop_ssse3: pxor xmm7,xmm0 rol ebx,5 add eax,esi - movdqa xmm9,XMMWORD[32+r11] + movdqa xmm9,XMMWORD[32+r14] xor edi,ecx paddd xmm8,xmm6 xor ecx,edx @@ -2194,8 +2194,8 @@ $L$oop_ssse3: add ecx,edx cmp r9,r10 je NEAR $L$done_ssse3 - movdqa xmm6,XMMWORD[64+r11] - movdqa xmm9,XMMWORD[((-64))+r11] + movdqa xmm6,XMMWORD[64+r14] + movdqa xmm9,XMMWORD[((-64))+r14] movdqu xmm0,XMMWORD[r9] movdqu xmm1,XMMWORD[16+r9] movdqu xmm2,XMMWORD[32+r9] @@ -2432,19 +2432,18 @@ $L$done_ssse3: mov DWORD[8+r8],ecx mov DWORD[12+r8],edx mov DWORD[16+r8],ebp - movaps xmm6,XMMWORD[((-40-96))+r14] - movaps xmm7,XMMWORD[((-40-80))+r14] - movaps xmm8,XMMWORD[((-40-64))+r14] - movaps xmm9,XMMWORD[((-40-48))+r14] - movaps xmm10,XMMWORD[((-40-32))+r14] - movaps xmm11,XMMWORD[((-40-16))+r14] - lea rsi,[r14] - mov r14,QWORD[((-40))+rsi] - mov r13,QWORD[((-32))+rsi] - mov r12,QWORD[((-24))+rsi] - mov rbp,QWORD[((-16))+rsi] - mov rbx,QWORD[((-8))+rsi] - lea rsp,[rsi] + movaps xmm6,XMMWORD[((-40-96))+r11] + movaps xmm7,XMMWORD[((-40-80))+r11] + movaps xmm8,XMMWORD[((-40-64))+r11] + movaps xmm9,XMMWORD[((-40-48))+r11] + movaps 
xmm10,XMMWORD[((-40-32))+r11] + movaps xmm11,XMMWORD[((-40-16))+r11] + mov r14,QWORD[((-40))+r11] + mov r13,QWORD[((-32))+r11] + mov r12,QWORD[((-24))+r11] + mov rbp,QWORD[((-16))+r11] + mov rbx,QWORD[((-8))+r11] + lea rsp,[r11] $L$epilogue_ssse3: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] @@ -2463,7 +2462,7 @@ $L$SEH_begin_sha1_block_data_order_avx: _avx_shortcut: - mov rax,rsp + mov r11,rsp push rbx push rbp push r12 @@ -2471,14 +2470,13 @@ _avx_shortcut: push r14 lea rsp,[((-160))+rsp] vzeroupper - vmovaps XMMWORD[(-40-96)+rax],xmm6 - vmovaps XMMWORD[(-40-80)+rax],xmm7 - vmovaps XMMWORD[(-40-64)+rax],xmm8 - vmovaps XMMWORD[(-40-48)+rax],xmm9 - vmovaps XMMWORD[(-40-32)+rax],xmm10 - vmovaps XMMWORD[(-40-16)+rax],xmm11 + vmovaps XMMWORD[(-40-96)+r11],xmm6 + vmovaps XMMWORD[(-40-80)+r11],xmm7 + vmovaps XMMWORD[(-40-64)+r11],xmm8 + vmovaps XMMWORD[(-40-48)+r11],xmm9 + vmovaps XMMWORD[(-40-32)+r11],xmm10 + vmovaps XMMWORD[(-40-16)+r11],xmm11 $L$prologue_avx: - mov r14,rax and rsp,-64 mov r8,rdi mov r9,rsi @@ -2486,7 +2484,7 @@ $L$prologue_avx: shl r10,6 add r10,r9 - lea r11,[((K_XX_XX+64))] + lea r14,[((K_XX_XX+64))] mov eax,DWORD[r8] mov ebx,DWORD[4+r8] @@ -2498,8 +2496,8 @@ $L$prologue_avx: xor edi,edx and esi,edi - vmovdqa xmm6,XMMWORD[64+r11] - vmovdqa xmm11,XMMWORD[((-64))+r11] + vmovdqa xmm6,XMMWORD[64+r14] + vmovdqa xmm11,XMMWORD[((-64))+r14] vmovdqu xmm0,XMMWORD[r9] vmovdqu xmm1,XMMWORD[16+r9] vmovdqu xmm2,XMMWORD[32+r9] @@ -2624,7 +2622,7 @@ $L$oop_avx: vpxor xmm5,xmm5,xmm10 xor ebp,eax shld edx,edx,5 - vmovdqa xmm11,XMMWORD[((-32))+r11] + vmovdqa xmm11,XMMWORD[((-32))+r14] add ecx,edi and esi,ebp xor ebp,eax @@ -2837,7 +2835,7 @@ $L$oop_avx: add eax,esi xor edi,edx vpaddd xmm9,xmm11,xmm1 - vmovdqa xmm11,XMMWORD[r11] + vmovdqa xmm11,XMMWORD[r14] shrd ecx,ecx,7 add eax,ebx vpxor xmm2,xmm2,xmm8 @@ -3056,7 +3054,7 @@ $L$oop_avx: mov edi,ebx xor esi,edx vpaddd xmm9,xmm11,xmm6 - vmovdqa xmm11,XMMWORD[32+r11] + vmovdqa xmm11,XMMWORD[32+r14] shld ebx,ebx,5 add eax,esi vpxor xmm7,xmm7,xmm8 @@ -3335,8 +3333,8 @@ $L$oop_avx: add ecx,edx cmp r9,r10 je NEAR $L$done_avx - vmovdqa xmm6,XMMWORD[64+r11] - vmovdqa xmm11,XMMWORD[((-64))+r11] + vmovdqa xmm6,XMMWORD[64+r14] + vmovdqa xmm11,XMMWORD[((-64))+r14] vmovdqu xmm0,XMMWORD[r9] vmovdqu xmm1,XMMWORD[16+r9] vmovdqu xmm2,XMMWORD[32+r9] @@ -3572,19 +3570,18 @@ $L$done_avx: mov DWORD[8+r8],ecx mov DWORD[12+r8],edx mov DWORD[16+r8],ebp - movaps xmm6,XMMWORD[((-40-96))+r14] - movaps xmm7,XMMWORD[((-40-80))+r14] - movaps xmm8,XMMWORD[((-40-64))+r14] - movaps xmm9,XMMWORD[((-40-48))+r14] - movaps xmm10,XMMWORD[((-40-32))+r14] - movaps xmm11,XMMWORD[((-40-16))+r14] - lea rsi,[r14] - mov r14,QWORD[((-40))+rsi] - mov r13,QWORD[((-32))+rsi] - mov r12,QWORD[((-24))+rsi] - mov rbp,QWORD[((-16))+rsi] - mov rbx,QWORD[((-8))+rsi] - lea rsp,[rsi] + movaps xmm6,XMMWORD[((-40-96))+r11] + movaps xmm7,XMMWORD[((-40-80))+r11] + movaps xmm8,XMMWORD[((-40-64))+r11] + movaps xmm9,XMMWORD[((-40-48))+r11] + movaps xmm10,XMMWORD[((-40-32))+r11] + movaps xmm11,XMMWORD[((-40-16))+r11] + mov r14,QWORD[((-40))+r11] + mov r13,QWORD[((-32))+r11] + mov r12,QWORD[((-24))+r11] + mov rbp,QWORD[((-16))+r11] + mov rbx,QWORD[((-8))+r11] + lea rsp,[r11] $L$epilogue_avx: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] @@ -3677,15 +3674,13 @@ ssse3_handler: cmp rbx,r10 jb NEAR $L$common_seh_tail - mov rax,QWORD[152+r8] + mov rax,QWORD[208+r8] mov r10d,DWORD[4+r11] lea r10,[r10*1+rsi] cmp rbx,r10 jae NEAR $L$common_seh_tail - mov rax,QWORD[232+r8] - lea 
rsi,[((-40-96))+rax] lea rdi,[512+r8] mov ecx,12 diff --git a/packager/third_party/boringssl/win-x86_64/crypto/sha/sha256-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/sha256-x86_64.asm similarity index 98% rename from packager/third_party/boringssl/win-x86_64/crypto/sha/sha256-x86_64.asm rename to packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/sha256-x86_64.asm index efaf9b55fc..6e3d1541ac 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/sha/sha256-x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/sha256-x86_64.asm @@ -30,13 +30,13 @@ $L$SEH_begin_sha256_block_data_order: je NEAR $L$avx_shortcut test r10d,512 jnz NEAR $L$ssse3_shortcut + mov rax,rsp push rbx push rbp push r12 push r13 push r14 push r15 - mov r11,rsp shl rdx,4 sub rsp,16*4+4*8 lea rdx,[rdx*4+rsi] @@ -44,7 +44,7 @@ $L$SEH_begin_sha256_block_data_order: mov QWORD[((64+0))+rsp],rdi mov QWORD[((64+8))+rsp],rsi mov QWORD[((64+16))+rsp],rdx - mov QWORD[((64+24))+rsp],r11 + mov QWORD[((64+24))+rsp],rax $L$prologue: mov eax,DWORD[rdi] @@ -1709,13 +1709,13 @@ $L$rounds_16_xx: jb NEAR $L$loop mov rsi,QWORD[((64+24))+rsp] - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] $L$epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] @@ -1781,13 +1781,13 @@ $L$SEH_begin_sha256_block_data_order_ssse3: $L$ssse3_shortcut: + mov rax,rsp push rbx push rbp push r12 push r13 push r14 push r15 - mov r11,rsp shl rdx,4 sub rsp,160 lea rdx,[rdx*4+rsi] @@ -1795,7 +1795,7 @@ $L$ssse3_shortcut: mov QWORD[((64+0))+rsp],rdi mov QWORD[((64+8))+rsp],rsi mov QWORD[((64+16))+rsp],rdx - mov QWORD[((64+24))+rsp],r11 + mov QWORD[((64+24))+rsp],rax movaps XMMWORD[(64+32)+rsp],xmm6 movaps XMMWORD[(64+48)+rsp],xmm7 movaps XMMWORD[(64+64)+rsp],xmm8 @@ -2870,13 +2870,13 @@ DB 102,15,58,15,249,4 movaps xmm7,XMMWORD[((64+48))+rsp] movaps xmm8,XMMWORD[((64+64))+rsp] movaps xmm9,XMMWORD[((64+80))+rsp] - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] $L$epilogue_ssse3: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] @@ -2895,13 +2895,13 @@ $L$SEH_begin_sha256_block_data_order_avx: $L$avx_shortcut: + mov rax,rsp push rbx push rbp push r12 push r13 push r14 push r15 - mov r11,rsp shl rdx,4 sub rsp,160 lea rdx,[rdx*4+rsi] @@ -2909,7 +2909,7 @@ $L$avx_shortcut: mov QWORD[((64+0))+rsp],rdi mov QWORD[((64+8))+rsp],rsi mov QWORD[((64+16))+rsp],rdx - mov QWORD[((64+24))+rsp],r11 + mov QWORD[((64+24))+rsp],rax movaps XMMWORD[(64+32)+rsp],xmm6 movaps XMMWORD[(64+48)+rsp],xmm7 movaps XMMWORD[(64+64)+rsp],xmm8 @@ -3946,13 +3946,13 @@ $L$avx_00_47: movaps xmm7,XMMWORD[((64+48))+rsp] movaps xmm8,XMMWORD[((64+64))+rsp] movaps xmm9,XMMWORD[((64+80))+rsp] - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + 
mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] $L$epilogue_avx: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] @@ -3992,7 +3992,6 @@ se_handler: jae NEAR $L$in_prologue mov rsi,rax mov rax,QWORD[((64+24))+rax] - lea rax,[48+rax] mov rbx,QWORD[((-8))+rax] mov rbp,QWORD[((-16))+rax] diff --git a/packager/third_party/boringssl/win-x86_64/crypto/sha/sha512-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/sha512-x86_64.asm similarity index 98% rename from packager/third_party/boringssl/win-x86_64/crypto/sha/sha512-x86_64.asm rename to packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/sha512-x86_64.asm index 71449cd24f..d0d7a43fbe 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/sha/sha512-x86_64.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/sha512-x86_64.asm @@ -30,13 +30,13 @@ $L$SEH_begin_sha512_block_data_order: or r10d,r9d cmp r10d,1342177792 je NEAR $L$avx_shortcut + mov rax,rsp push rbx push rbp push r12 push r13 push r14 push r15 - mov r11,rsp shl rdx,4 sub rsp,16*8+4*8 lea rdx,[rdx*8+rsi] @@ -44,7 +44,7 @@ $L$SEH_begin_sha512_block_data_order: mov QWORD[((128+0))+rsp],rdi mov QWORD[((128+8))+rsp],rsi mov QWORD[((128+16))+rsp],rdx - mov QWORD[((128+24))+rsp],r11 + mov QWORD[((128+24))+rsp],rax $L$prologue: mov rax,QWORD[rdi] @@ -1709,13 +1709,13 @@ $L$rounds_16_xx: jb NEAR $L$loop mov rsi,QWORD[((128+24))+rsp] - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] $L$epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] @@ -1825,13 +1825,13 @@ $L$SEH_begin_sha512_block_data_order_xop: $L$xop_shortcut: + mov rax,rsp push rbx push rbp push r12 push r13 push r14 push r15 - mov r11,rsp shl rdx,4 sub rsp,256 lea rdx,[rdx*8+rsi] @@ -1839,7 +1839,7 @@ $L$xop_shortcut: mov QWORD[((128+0))+rsp],rdi mov QWORD[((128+8))+rsp],rsi mov QWORD[((128+16))+rsp],rdx - mov QWORD[((128+24))+rsp],r11 + mov QWORD[((128+24))+rsp],rax movaps XMMWORD[(128+32)+rsp],xmm6 movaps XMMWORD[(128+48)+rsp],xmm7 movaps XMMWORD[(128+64)+rsp],xmm8 @@ -2906,13 +2906,13 @@ DB 143,72,120,195,203,42 movaps xmm9,XMMWORD[((128+80))+rsp] movaps xmm10,XMMWORD[((128+96))+rsp] movaps xmm11,XMMWORD[((128+112))+rsp] - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] $L$epilogue_xop: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] @@ -2931,13 +2931,13 @@ $L$SEH_begin_sha512_block_data_order_avx: $L$avx_shortcut: + mov rax,rsp push rbx push rbp push r12 push r13 push r14 push r15 - mov r11,rsp shl rdx,4 sub rsp,256 lea rdx,[rdx*8+rsi] @@ -2945,7 +2945,7 @@ $L$avx_shortcut: mov QWORD[((128+0))+rsp],rdi mov QWORD[((128+8))+rsp],rsi mov QWORD[((128+16))+rsp],rdx - mov QWORD[((128+24))+rsp],r11 + mov QWORD[((128+24))+rsp],rax movaps XMMWORD[(128+32)+rsp],xmm6 movaps XMMWORD[(128+48)+rsp],xmm7 movaps XMMWORD[(128+64)+rsp],xmm8 @@ -4076,13 +4076,13 @@ $L$avx_00_47: movaps xmm9,XMMWORD[((128+80))+rsp] movaps 
xmm10,XMMWORD[((128+96))+rsp] movaps xmm11,XMMWORD[((128+112))+rsp] - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] $L$epilogue_avx: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] @@ -4122,7 +4122,6 @@ se_handler: jae NEAR $L$in_prologue mov rsi,rax mov rax,QWORD[((128+24))+rax] - lea rax,[48+rax] mov rbx,QWORD[((-8))+rax] mov rbp,QWORD[((-16))+rax] diff --git a/packager/third_party/boringssl/win-x86_64/crypto/aes/vpaes-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/vpaes-x86_64.asm similarity index 100% rename from packager/third_party/boringssl/win-x86_64/crypto/aes/vpaes-x86_64.asm rename to packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/vpaes-x86_64.asm diff --git a/packager/third_party/boringssl/win-x86_64/crypto/bn/x86_64-mont.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/x86_64-mont.asm similarity index 84% rename from packager/third_party/boringssl/win-x86_64/crypto/bn/x86_64-mont.asm rename to packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/x86_64-mont.asm index 4d8e1cb72a..dd93341d8f 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/bn/x86_64-mont.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/x86_64-mont.asm @@ -23,6 +23,10 @@ $L$SEH_begin_bn_mul_mont: mov r9,QWORD[48+rsp] + + mov r9d,r9d + mov rax,rsp + test r9d,3 jnz NEAR $L$mul_enter cmp r9d,8 @@ -36,20 +40,50 @@ $L$SEH_begin_bn_mul_mont: ALIGN 16 $L$mul_enter: push rbx + push rbp + push r12 + push r13 + push r14 + push r15 - mov r9d,r9d - lea r10,[2+r9] - mov r11,rsp - neg r10 - lea rsp,[r10*8+rsp] - and rsp,-1024 - mov QWORD[8+r9*8+rsp],r11 + neg r9 + mov r11,rsp + lea r10,[((-16))+r9*8+rsp] + neg r9 + and r10,-1024 + + + + + + + + + + sub r11,r10 + and r11,-4096 + lea rsp,[r11*1+r10] + mov r11,QWORD[rsp] + cmp rsp,r10 + ja NEAR $L$mul_page_walk + jmp NEAR $L$mul_page_walk_done + +ALIGN 16 +$L$mul_page_walk: + lea rsp,[((-4096))+rsp] + mov r11,QWORD[rsp] + cmp rsp,r10 + ja NEAR $L$mul_page_walk +$L$mul_page_walk_done: + + mov QWORD[8+r9*8+rsp],rax + $L$mul_body: mov r12,rdx mov r8,QWORD[r8] @@ -192,7 +226,8 @@ $L$inner_enter: mov r15,r9 jmp NEAR $L$sub ALIGN 16 -$L$sub: sbb rax,QWORD[r14*8+rcx] +$L$sub: + sbb rax,QWORD[r14*8+rcx] mov QWORD[r14*8+rdi],rax mov rax,QWORD[8+r14*8+rsi] lea r14,[1+r14] @@ -201,33 +236,43 @@ $L$sub: sbb rax,QWORD[r14*8+rcx] sbb rax,0 xor r14,r14 + and rsi,rax + not rax + mov rcx,rdi + and rcx,rax mov r15,r9 + or rsi,rcx ALIGN 16 $L$copy: - mov rsi,QWORD[r14*8+rsp] - mov rcx,QWORD[r14*8+rdi] - xor rsi,rcx - and rsi,rax - xor rsi,rcx + mov rax,QWORD[r14*8+rsi] mov QWORD[r14*8+rsp],r14 - mov QWORD[r14*8+rdi],rsi + mov QWORD[r14*8+rdi],rax lea r14,[1+r14] sub r15,1 jnz NEAR $L$copy mov rsi,QWORD[8+r9*8+rsp] + mov rax,1 - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + $L$mul_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + 
$L$SEH_end_bn_mul_mont: ALIGN 16 @@ -244,22 +289,47 @@ $L$SEH_begin_bn_mul4x_mont: mov r9,QWORD[48+rsp] -$L$mul4x_enter: - push rbx - push rbp - push r12 - push r13 - push r14 - push r15 mov r9d,r9d - lea r10,[4+r9] - mov r11,rsp - neg r10 - lea rsp,[r10*8+rsp] - and rsp,-1024 + mov rax,rsp + +$L$mul4x_enter: + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + + neg r9 + mov r11,rsp + lea r10,[((-32))+r9*8+rsp] + neg r9 + and r10,-1024 + + sub r11,r10 + and r11,-4096 + lea rsp,[r11*1+r10] + mov r11,QWORD[rsp] + cmp rsp,r10 + ja NEAR $L$mul4x_page_walk + jmp NEAR $L$mul4x_page_walk_done + +$L$mul4x_page_walk: + lea rsp,[((-4096))+rsp] + mov r11,QWORD[rsp] + cmp rsp,r10 + ja NEAR $L$mul4x_page_walk +$L$mul4x_page_walk_done: + + mov QWORD[8+r9*8+rsp],rax - mov QWORD[8+r9*8+rsp],r11 $L$mul4x_body: mov QWORD[16+r9*8+rsp],rdi mov r12,rdx @@ -559,9 +629,11 @@ $L$inner4x: cmp r14,r9 jb NEAR $L$outer4x mov rdi,QWORD[16+r9*8+rsp] + lea r15,[((-4))+r9] mov rax,QWORD[rsp] + pxor xmm0,xmm0 mov rdx,QWORD[8+rsp] - shr r9,2 + shr r15,2 lea rsi,[rsp] xor r14,r14 @@ -569,7 +641,6 @@ $L$inner4x: mov rbx,QWORD[16+rsi] mov rbp,QWORD[24+rsi] sbb rdx,QWORD[8+rcx] - lea r15,[((-1))+r9] jmp NEAR $L$sub4x ALIGN 16 $L$sub4x: @@ -597,49 +668,57 @@ $L$sub4x: mov QWORD[16+r14*8+rdi],rbx sbb rax,0 -DB 66h, 48h, 0fh, 6eh, 0c0h - punpcklqdq xmm0,xmm0 mov QWORD[24+r14*8+rdi],rbp xor r14,r14 + and rsi,rax + not rax + mov rcx,rdi + and rcx,rax + lea r15,[((-4))+r9] + or rsi,rcx + shr r15,2 - mov r15,r9 - pxor xmm5,xmm5 + movdqu xmm1,XMMWORD[rsi] + movdqa XMMWORD[rsp],xmm0 + movdqu XMMWORD[rdi],xmm1 jmp NEAR $L$copy4x ALIGN 16 $L$copy4x: - movdqu xmm2,XMMWORD[r14*1+rsp] - movdqu xmm4,XMMWORD[16+r14*1+rsp] - movdqu xmm1,XMMWORD[r14*1+rdi] - movdqu xmm3,XMMWORD[16+r14*1+rdi] - pxor xmm2,xmm1 - pxor xmm4,xmm3 - pand xmm2,xmm0 - pand xmm4,xmm0 - pxor xmm2,xmm1 - pxor xmm4,xmm3 - movdqu XMMWORD[r14*1+rdi],xmm2 - movdqu XMMWORD[16+r14*1+rdi],xmm4 - movdqa XMMWORD[r14*1+rsp],xmm5 - movdqa XMMWORD[16+r14*1+rsp],xmm5 - + movdqu xmm2,XMMWORD[16+r14*1+rsi] + movdqu xmm1,XMMWORD[32+r14*1+rsi] + movdqa XMMWORD[16+r14*1+rsp],xmm0 + movdqu XMMWORD[16+r14*1+rdi],xmm2 + movdqa XMMWORD[32+r14*1+rsp],xmm0 + movdqu XMMWORD[32+r14*1+rdi],xmm1 lea r14,[32+r14] dec r15 jnz NEAR $L$copy4x - shl r9,2 + movdqu xmm2,XMMWORD[16+r14*1+rsi] + movdqa XMMWORD[16+r14*1+rsp],xmm0 + movdqu XMMWORD[16+r14*1+rdi],xmm2 mov rsi,QWORD[8+r9*8+rsp] + mov rax,1 - mov r15,QWORD[rsi] - mov r14,QWORD[8+rsi] - mov r13,QWORD[16+rsi] - mov r12,QWORD[24+rsi] - mov rbp,QWORD[32+rsi] - mov rbx,QWORD[40+rsi] - lea rsp,[48+rsi] + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + $L$mul4x_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_bn_mul4x_mont: EXTERN bn_sqr8x_internal @@ -658,15 +737,24 @@ $L$SEH_begin_bn_sqr8x_mont: mov r9,QWORD[48+rsp] -$L$sqr8x_enter: + mov rax,rsp + +$L$sqr8x_enter: push rbx + push rbp + push r12 + push r13 + push r14 + push r15 +$L$sqr8x_prologue: + mov r10d,r9d shl r9d,3 shl r10,3+2 @@ -678,30 +766,49 @@ $L$sqr8x_enter: lea r11,[((-64))+r9*2+rsp] + mov rbp,rsp mov r8,QWORD[r8] sub r11,rsi and r11,4095 cmp r10,r11 jb NEAR $L$sqr8x_sp_alt - sub rsp,r11 - lea rsp,[((-64))+r9*2+rsp] + sub rbp,r11 + lea rbp,[((-64))+r9*2+rbp] jmp NEAR $L$sqr8x_sp_done ALIGN 32 $L$sqr8x_sp_alt: lea r10,[((4096-64))+r9*2] - lea 
rsp,[((-64))+r9*2+rsp] + lea rbp,[((-64))+r9*2+rbp] sub r11,r10 mov r10,0 cmovc r11,r10 - sub rsp,r11 + sub rbp,r11 $L$sqr8x_sp_done: - and rsp,-64 + and rbp,-64 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$sqr8x_page_walk + jmp NEAR $L$sqr8x_page_walk_done + +ALIGN 16 +$L$sqr8x_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$sqr8x_page_walk +$L$sqr8x_page_walk_done: + mov r10,r9 neg r9 mov QWORD[32+rsp],r8 mov QWORD[40+rsp],rax + $L$sqr8x_body: DB 102,72,15,110,209 @@ -748,6 +855,7 @@ DB 102,72,15,110,200 pxor xmm0,xmm0 pshufd xmm1,xmm1,0 mov rsi,QWORD[40+rsp] + jmp NEAR $L$sqr8x_cond_copy ALIGN 32 @@ -777,16 +885,24 @@ $L$sqr8x_cond_copy: mov rax,1 mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$sqr8x_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_bn_sqr8x_mont: DB 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 DB 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56 @@ -829,22 +945,8 @@ mul_handler: mov r10,QWORD[192+r8] mov rax,QWORD[8+r10*8+rax] - lea rax,[48+rax] - mov rbx,QWORD[((-8))+rax] - mov rbp,QWORD[((-16))+rax] - mov r12,QWORD[((-24))+rax] - mov r13,QWORD[((-32))+rax] - mov r14,QWORD[((-40))+rax] - mov r15,QWORD[((-48))+rax] - mov QWORD[144+r8],rbx - mov QWORD[160+r8],rbp - mov QWORD[216+r8],r12 - mov QWORD[224+r8],r13 - mov QWORD[232+r8],r14 - mov QWORD[240+r8],r15 - - jmp NEAR $L$common_seh_tail + jmp NEAR $L$common_pop_regs @@ -872,15 +974,21 @@ sqr_handler: cmp rbx,r10 jb NEAR $L$common_seh_tail + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_pop_regs + mov rax,QWORD[152+r8] - mov r10d,DWORD[4+r11] + mov r10d,DWORD[8+r11] lea r10,[r10*1+rsi] cmp rbx,r10 jae NEAR $L$common_seh_tail mov rax,QWORD[40+rax] +$L$common_pop_regs: mov rbx,QWORD[((-8))+rax] mov rbp,QWORD[((-16))+rax] mov r12,QWORD[((-24))+rax] @@ -960,4 +1068,5 @@ DB 9,0,0,0 $L$SEH_info_bn_sqr8x_mont: DB 9,0,0,0 DD sqr_handler wrt ..imagebase - DD $L$sqr8x_body wrt ..imagebase,$L$sqr8x_epilogue wrt ..imagebase + DD $L$sqr8x_prologue wrt ..imagebase,$L$sqr8x_body wrt ..imagebase,$L$sqr8x_epilogue wrt ..imagebase +ALIGN 8 diff --git a/packager/third_party/boringssl/win-x86_64/crypto/bn/x86_64-mont5.asm b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/x86_64-mont5.asm similarity index 94% rename from packager/third_party/boringssl/win-x86_64/crypto/bn/x86_64-mont5.asm rename to packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/x86_64-mont5.asm index cd9a6e5d4e..1bcbc5d097 100644 --- a/packager/third_party/boringssl/win-x86_64/crypto/bn/x86_64-mont5.asm +++ b/packager/third_party/boringssl/win-x86_64/crypto/fipsmodule/x86_64-mont5.asm @@ -23,30 +23,64 @@ $L$SEH_begin_bn_mul_mont_gather5: mov r9,QWORD[48+rsp] + + mov r9d,r9d + mov rax,rsp + test r9d,7 jnz NEAR $L$mul_enter jmp NEAR $L$mul4x_enter ALIGN 16 $L$mul_enter: - mov r9d,r9d - mov rax,rsp movd xmm5,DWORD[56+rsp] - lea r10,[$L$inc] push rbx + push rbp + push r12 + push r13 + push r14 + push r15 - lea r11,[2+r9] - neg r11 - lea rsp,[((-264))+r11*8+rsp] - and rsp,-1024 + neg r9 + mov r11,rsp + lea r10,[((-280))+r9*8+rsp] + neg r9 + and r10,-1024 + + + + + + + + + + sub r11,r10 + and r11,-4096 + lea rsp,[r11*1+r10] + mov r11,QWORD[rsp] + cmp rsp,r10 + ja NEAR $L$mul_page_walk + jmp NEAR 
$L$mul_page_walk_done + +$L$mul_page_walk: + lea rsp,[((-4096))+rsp] + mov r11,QWORD[rsp] + cmp rsp,r10 + ja NEAR $L$mul_page_walk +$L$mul_page_walk_done: + + lea r10,[$L$inc] mov QWORD[8+r9*8+rsp],rax + $L$mul_body: + lea r12,[128+rdx] movdqa xmm0,XMMWORD[r10] movdqa xmm1,XMMWORD[16+r10] @@ -376,7 +410,8 @@ $L$inner_enter: mov r15,r9 jmp NEAR $L$sub ALIGN 16 -$L$sub: sbb rax,QWORD[r14*8+rcx] +$L$sub: + sbb rax,QWORD[r14*8+rcx] mov QWORD[r14*8+rdi],rax mov rax,QWORD[8+r14*8+rsi] lea r14,[1+r14] @@ -385,34 +420,44 @@ $L$sub: sbb rax,QWORD[r14*8+rcx] sbb rax,0 xor r14,r14 + and rsi,rax + not rax + mov rcx,rdi + and rcx,rax mov r15,r9 + or rsi,rcx ALIGN 16 $L$copy: - mov rsi,QWORD[r14*8+rsp] - mov rcx,QWORD[r14*8+rdi] - xor rsi,rcx - and rsi,rax - xor rsi,rcx + mov rax,QWORD[r14*8+rsi] mov QWORD[r14*8+rsp],r14 - mov QWORD[r14*8+rdi],rsi + mov QWORD[r14*8+rdi],rax lea r14,[1+r14] sub r15,1 jnz NEAR $L$copy mov rsi,QWORD[8+r9*8+rsp] + mov rax,1 mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$mul_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_bn_mul_mont_gather5: ALIGN 32 @@ -429,16 +474,25 @@ $L$SEH_begin_bn_mul4x_mont_gather5: mov r9,QWORD[48+rsp] -$L$mul4x_enter: + DB 0x67 mov rax,rsp + +$L$mul4x_enter: push rbx + push rbp + push r12 + push r13 + push r14 + push r15 +$L$mul4x_prologue: + DB 0x67 shl r9d,3 lea r10,[r9*2+r9] @@ -454,45 +508,72 @@ DB 0x67 lea r11,[((-320))+r9*2+rsp] + mov rbp,rsp sub r11,rdi and r11,4095 cmp r10,r11 jb NEAR $L$mul4xsp_alt - sub rsp,r11 - lea rsp,[((-320))+r9*2+rsp] + sub rbp,r11 + lea rbp,[((-320))+r9*2+rbp] jmp NEAR $L$mul4xsp_done ALIGN 32 $L$mul4xsp_alt: lea r10,[((4096-320))+r9*2] - lea rsp,[((-320))+r9*2+rsp] + lea rbp,[((-320))+r9*2+rbp] sub r11,r10 mov r10,0 cmovc r11,r10 - sub rsp,r11 + sub rbp,r11 $L$mul4xsp_done: - and rsp,-64 + and rbp,-64 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$mul4x_page_walk + jmp NEAR $L$mul4x_page_walk_done + +$L$mul4x_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$mul4x_page_walk +$L$mul4x_page_walk_done: + neg r9 mov QWORD[40+rsp],rax + $L$mul4x_body: call mul4x_internal mov rsi,QWORD[40+rsp] + mov rax,1 mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$mul4x_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_bn_mul4x_mont_gather5: @@ -1036,14 +1117,23 @@ $L$SEH_begin_bn_power5: mov r9,QWORD[48+rsp] + mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 +$L$power5_prologue: + shl r9d,3 lea r10d,[r9*2+r9] neg r9 @@ -1057,24 +1147,41 @@ $L$SEH_begin_bn_power5: lea r11,[((-320))+r9*2+rsp] + mov rbp,rsp sub r11,rdi and r11,4095 cmp r10,r11 jb NEAR $L$pwr_sp_alt - sub rsp,r11 - lea rsp,[((-320))+r9*2+rsp] + sub rbp,r11 + lea rbp,[((-320))+r9*2+rbp] jmp NEAR $L$pwr_sp_done ALIGN 32 $L$pwr_sp_alt: lea r10,[((4096-320))+r9*2] - lea rsp,[((-320))+r9*2+rsp] + lea rbp,[((-320))+r9*2+rbp] sub r11,r10 mov r10,0 cmovc r11,r10 - sub rsp,r11 + sub rbp,r11 $L$pwr_sp_done: - and rsp,-64 + and rbp,-64 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR 
$L$pwr_page_walk + jmp NEAR $L$pwr_page_walk_done + +$L$pwr_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$pwr_page_walk +$L$pwr_page_walk_done: + mov r10,r9 neg r9 @@ -1089,6 +1196,7 @@ $L$pwr_sp_done: mov QWORD[32+rsp],r8 mov QWORD[40+rsp],rax + $L$power5_body: DB 102,72,15,110,207 DB 102,72,15,110,209 @@ -1115,18 +1223,27 @@ DB 102,72,15,126,226 call mul4x_internal mov rsi,QWORD[40+rsp] + mov rax,1 mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$power5_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_bn_power5: global bn_sqr8x_internal @@ -1868,6 +1985,7 @@ $L$8x_tail: ALIGN 32 $L$8x_tail_done: + xor rax,rax add r8,QWORD[rdx] adc r9,0 adc r10,0 @@ -1876,9 +1994,7 @@ $L$8x_tail_done: adc r13,0 adc r14,0 adc r15,0 - - - xor rax,rax + adc rax,0 neg rsi $L$8x_no_tail: @@ -1990,15 +2106,24 @@ $L$SEH_begin_bn_from_mont8x: mov r9,QWORD[48+rsp] + DB 0x67 mov rax,rsp + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 +$L$from_prologue: + shl r9d,3 lea r10,[r9*2+r9] neg r9 @@ -2012,24 +2137,41 @@ DB 0x67 lea r11,[((-320))+r9*2+rsp] + mov rbp,rsp sub r11,rdi and r11,4095 cmp r10,r11 jb NEAR $L$from_sp_alt - sub rsp,r11 - lea rsp,[((-320))+r9*2+rsp] + sub rbp,r11 + lea rbp,[((-320))+r9*2+rbp] jmp NEAR $L$from_sp_done ALIGN 32 $L$from_sp_alt: lea r10,[((4096-320))+r9*2] - lea rsp,[((-320))+r9*2+rsp] + lea rbp,[((-320))+r9*2+rbp] sub r11,r10 mov r10,0 cmovc r11,r10 - sub rsp,r11 + sub rbp,r11 $L$from_sp_done: - and rsp,-64 + and rbp,-64 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$from_page_walk + jmp NEAR $L$from_page_walk_done + +$L$from_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$from_page_walk +$L$from_page_walk_done: + mov r10,r9 neg r9 @@ -2044,6 +2186,7 @@ $L$from_sp_done: mov QWORD[32+rsp],r8 mov QWORD[40+rsp],rax + $L$from_body: mov r11,r9 lea rax,[48+rsp] @@ -2079,11 +2222,12 @@ DB 102,73,15,110,218 pxor xmm0,xmm0 lea rax,[48+rsp] - mov rsi,QWORD[40+rsp] jmp NEAR $L$from_mont_zero ALIGN 32 $L$from_mont_zero: + mov rsi,QWORD[40+rsp] + movdqa XMMWORD[rax],xmm0 movdqa XMMWORD[16+rax],xmm0 movdqa XMMWORD[32+rax],xmm0 @@ -2094,16 +2238,24 @@ $L$from_mont_zero: mov rax,1 mov r15,QWORD[((-48))+rsi] + mov r14,QWORD[((-40))+rsi] + mov r13,QWORD[((-32))+rsi] + mov r12,QWORD[((-24))+rsi] + mov rbp,QWORD[((-16))+rsi] + mov rbx,QWORD[((-8))+rsi] + lea rsp,[rsi] + $L$from_epilogue: mov rdi,QWORD[8+rsp] ;WIN64 epilogue mov rsi,QWORD[16+rsp] DB 0F3h,0C3h ;repret + $L$SEH_end_bn_from_mont8x: global bn_scatter5 @@ -2322,9 +2474,14 @@ mul_handler: cmp rbx,r10 jb NEAR $L$common_seh_tail + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_pop_regs + mov rax,QWORD[152+r8] - mov r10d,DWORD[4+r11] + mov r10d,DWORD[8+r11] lea r10,[r10*1+rsi] cmp rbx,r10 jae NEAR $L$common_seh_tail @@ -2336,11 +2493,11 @@ mul_handler: mov r10,QWORD[192+r8] mov rax,QWORD[8+r10*8+rax] - jmp NEAR $L$body_proceed + jmp NEAR $L$common_pop_regs $L$body_40: mov rax,QWORD[40+rax] -$L$body_proceed: +$L$common_pop_regs: mov rbx,QWORD[((-8))+rax] mov rbp,QWORD[((-16))+rax] mov r12,QWORD[((-24))+rax] @@ -2420,22 +2577,22 @@ ALIGN 8 $L$SEH_info_bn_mul_mont_gather5: DB 9,0,0,0 DD mul_handler wrt ..imagebase - DD $L$mul_body wrt ..imagebase,$L$mul_epilogue wrt 
..imagebase + DD $L$mul_body wrt ..imagebase,$L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase ALIGN 8 $L$SEH_info_bn_mul4x_mont_gather5: DB 9,0,0,0 DD mul_handler wrt ..imagebase - DD $L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase + DD $L$mul4x_prologue wrt ..imagebase,$L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase ALIGN 8 $L$SEH_info_bn_power5: DB 9,0,0,0 DD mul_handler wrt ..imagebase - DD $L$power5_body wrt ..imagebase,$L$power5_epilogue wrt ..imagebase + DD $L$power5_prologue wrt ..imagebase,$L$power5_body wrt ..imagebase,$L$power5_epilogue wrt ..imagebase ALIGN 8 $L$SEH_info_bn_from_mont8x: DB 9,0,0,0 DD mul_handler wrt ..imagebase - DD $L$from_body wrt ..imagebase,$L$from_epilogue wrt ..imagebase + DD $L$from_prologue wrt ..imagebase,$L$from_body wrt ..imagebase,$L$from_epilogue wrt ..imagebase ALIGN 8 $L$SEH_info_bn_gather5: DB 0x01,0x0b,0x03,0x0a diff --git a/packager/third_party/boringssl/win-x86_64/crypto/modes/aesni-gcm-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/modes/aesni-gcm-x86_64.asm deleted file mode 100644 index d7fff6a974..0000000000 --- a/packager/third_party/boringssl/win-x86_64/crypto/modes/aesni-gcm-x86_64.asm +++ /dev/null @@ -1,20 +0,0 @@ -default rel -%define XMMWORD -%define YMMWORD -%define ZMMWORD -section .text code align=64 - - -global aesni_gcm_encrypt - -aesni_gcm_encrypt: - xor eax,eax - DB 0F3h,0C3h ;repret - - -global aesni_gcm_decrypt - -aesni_gcm_decrypt: - xor eax,eax - DB 0F3h,0C3h ;repret - diff --git a/packager/third_party/boringssl/win-x86_64/crypto/rc4/rc4-md5-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/rc4/rc4-md5-x86_64.asm deleted file mode 100644 index f1ea9652d9..0000000000 --- a/packager/third_party/boringssl/win-x86_64/crypto/rc4/rc4-md5-x86_64.asm +++ /dev/null @@ -1,1372 +0,0 @@ -default rel -%define XMMWORD -%define YMMWORD -%define ZMMWORD -section .text code align=64 - -ALIGN 16 - -global rc4_md5_enc - -rc4_md5_enc: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_rc4_md5_enc: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - mov r8,QWORD[40+rsp] - mov r9,QWORD[48+rsp] - - - cmp r9,0 - je NEAR $L$abort - push rbx - push rbp - push r12 - push r13 - push r14 - push r15 - sub rsp,40 -$L$body: - mov r11,rcx - mov r12,r9 - mov r13,rsi - mov r14,rdx - mov r15,r8 - xor rbp,rbp - xor rcx,rcx - - lea rdi,[8+rdi] - mov bpl,BYTE[((-8))+rdi] - mov cl,BYTE[((-4))+rdi] - - inc bpl - sub r14,r13 - mov eax,DWORD[rbp*4+rdi] - add cl,al - lea rsi,[rbp*4+rdi] - shl r12,6 - add r12,r15 - mov QWORD[16+rsp],r12 - - mov QWORD[24+rsp],r11 - mov r8d,DWORD[r11] - mov r9d,DWORD[4+r11] - mov r10d,DWORD[8+r11] - mov r11d,DWORD[12+r11] - jmp NEAR $L$oop - -ALIGN 16 -$L$oop: - mov DWORD[rsp],r8d - mov DWORD[4+rsp],r9d - mov DWORD[8+rsp],r10d - mov r12d,r11d - mov DWORD[12+rsp],r11d - pxor xmm0,xmm0 - mov edx,DWORD[rcx*4+rdi] - xor r12d,r10d - mov DWORD[rcx*4+rdi],eax - and r12d,r9d - add r8d,DWORD[r15] - add al,dl - mov ebx,DWORD[4+rsi] - add r8d,3614090360 - xor r12d,r11d - movzx eax,al - mov DWORD[rsi],edx - add r8d,r12d - add cl,bl - rol r8d,7 - mov r12d,r10d - movd xmm0,DWORD[rax*4+rdi] - - add r8d,r9d - pxor xmm1,xmm1 - mov edx,DWORD[rcx*4+rdi] - xor r12d,r9d - mov DWORD[rcx*4+rdi],ebx - and r12d,r8d - add r11d,DWORD[4+r15] - add bl,dl - mov eax,DWORD[8+rsi] - add r11d,3905402710 - xor r12d,r10d - movzx ebx,bl - mov DWORD[4+rsi],edx - add r11d,r12d - add cl,al - rol r11d,12 - mov r12d,r9d - movd xmm1,DWORD[rbx*4+rdi] - - add 
r11d,r8d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r8d - mov DWORD[rcx*4+rdi],eax - and r12d,r11d - add r10d,DWORD[8+r15] - add al,dl - mov ebx,DWORD[12+rsi] - add r10d,606105819 - xor r12d,r9d - movzx eax,al - mov DWORD[8+rsi],edx - add r10d,r12d - add cl,bl - rol r10d,17 - mov r12d,r8d - pinsrw xmm0,WORD[rax*4+rdi],1 - - add r10d,r11d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r11d - mov DWORD[rcx*4+rdi],ebx - and r12d,r10d - add r9d,DWORD[12+r15] - add bl,dl - mov eax,DWORD[16+rsi] - add r9d,3250441966 - xor r12d,r8d - movzx ebx,bl - mov DWORD[12+rsi],edx - add r9d,r12d - add cl,al - rol r9d,22 - mov r12d,r11d - pinsrw xmm1,WORD[rbx*4+rdi],1 - - add r9d,r10d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r10d - mov DWORD[rcx*4+rdi],eax - and r12d,r9d - add r8d,DWORD[16+r15] - add al,dl - mov ebx,DWORD[20+rsi] - add r8d,4118548399 - xor r12d,r11d - movzx eax,al - mov DWORD[16+rsi],edx - add r8d,r12d - add cl,bl - rol r8d,7 - mov r12d,r10d - pinsrw xmm0,WORD[rax*4+rdi],2 - - add r8d,r9d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r9d - mov DWORD[rcx*4+rdi],ebx - and r12d,r8d - add r11d,DWORD[20+r15] - add bl,dl - mov eax,DWORD[24+rsi] - add r11d,1200080426 - xor r12d,r10d - movzx ebx,bl - mov DWORD[20+rsi],edx - add r11d,r12d - add cl,al - rol r11d,12 - mov r12d,r9d - pinsrw xmm1,WORD[rbx*4+rdi],2 - - add r11d,r8d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r8d - mov DWORD[rcx*4+rdi],eax - and r12d,r11d - add r10d,DWORD[24+r15] - add al,dl - mov ebx,DWORD[28+rsi] - add r10d,2821735955 - xor r12d,r9d - movzx eax,al - mov DWORD[24+rsi],edx - add r10d,r12d - add cl,bl - rol r10d,17 - mov r12d,r8d - pinsrw xmm0,WORD[rax*4+rdi],3 - - add r10d,r11d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r11d - mov DWORD[rcx*4+rdi],ebx - and r12d,r10d - add r9d,DWORD[28+r15] - add bl,dl - mov eax,DWORD[32+rsi] - add r9d,4249261313 - xor r12d,r8d - movzx ebx,bl - mov DWORD[28+rsi],edx - add r9d,r12d - add cl,al - rol r9d,22 - mov r12d,r11d - pinsrw xmm1,WORD[rbx*4+rdi],3 - - add r9d,r10d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r10d - mov DWORD[rcx*4+rdi],eax - and r12d,r9d - add r8d,DWORD[32+r15] - add al,dl - mov ebx,DWORD[36+rsi] - add r8d,1770035416 - xor r12d,r11d - movzx eax,al - mov DWORD[32+rsi],edx - add r8d,r12d - add cl,bl - rol r8d,7 - mov r12d,r10d - pinsrw xmm0,WORD[rax*4+rdi],4 - - add r8d,r9d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r9d - mov DWORD[rcx*4+rdi],ebx - and r12d,r8d - add r11d,DWORD[36+r15] - add bl,dl - mov eax,DWORD[40+rsi] - add r11d,2336552879 - xor r12d,r10d - movzx ebx,bl - mov DWORD[36+rsi],edx - add r11d,r12d - add cl,al - rol r11d,12 - mov r12d,r9d - pinsrw xmm1,WORD[rbx*4+rdi],4 - - add r11d,r8d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r8d - mov DWORD[rcx*4+rdi],eax - and r12d,r11d - add r10d,DWORD[40+r15] - add al,dl - mov ebx,DWORD[44+rsi] - add r10d,4294925233 - xor r12d,r9d - movzx eax,al - mov DWORD[40+rsi],edx - add r10d,r12d - add cl,bl - rol r10d,17 - mov r12d,r8d - pinsrw xmm0,WORD[rax*4+rdi],5 - - add r10d,r11d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r11d - mov DWORD[rcx*4+rdi],ebx - and r12d,r10d - add r9d,DWORD[44+r15] - add bl,dl - mov eax,DWORD[48+rsi] - add r9d,2304563134 - xor r12d,r8d - movzx ebx,bl - mov DWORD[44+rsi],edx - add r9d,r12d - add cl,al - rol r9d,22 - mov r12d,r11d - pinsrw xmm1,WORD[rbx*4+rdi],5 - - add r9d,r10d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r10d - mov DWORD[rcx*4+rdi],eax - and r12d,r9d - add r8d,DWORD[48+r15] - add al,dl - mov ebx,DWORD[52+rsi] - add r8d,1804603682 - xor r12d,r11d - movzx eax,al - mov DWORD[48+rsi],edx - add r8d,r12d - add cl,bl - rol r8d,7 - mov r12d,r10d - 
pinsrw xmm0,WORD[rax*4+rdi],6 - - add r8d,r9d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r9d - mov DWORD[rcx*4+rdi],ebx - and r12d,r8d - add r11d,DWORD[52+r15] - add bl,dl - mov eax,DWORD[56+rsi] - add r11d,4254626195 - xor r12d,r10d - movzx ebx,bl - mov DWORD[52+rsi],edx - add r11d,r12d - add cl,al - rol r11d,12 - mov r12d,r9d - pinsrw xmm1,WORD[rbx*4+rdi],6 - - add r11d,r8d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r8d - mov DWORD[rcx*4+rdi],eax - and r12d,r11d - add r10d,DWORD[56+r15] - add al,dl - mov ebx,DWORD[60+rsi] - add r10d,2792965006 - xor r12d,r9d - movzx eax,al - mov DWORD[56+rsi],edx - add r10d,r12d - add cl,bl - rol r10d,17 - mov r12d,r8d - pinsrw xmm0,WORD[rax*4+rdi],7 - - add r10d,r11d - movdqu xmm2,XMMWORD[r13] - mov edx,DWORD[rcx*4+rdi] - xor r12d,r11d - mov DWORD[rcx*4+rdi],ebx - and r12d,r10d - add r9d,DWORD[60+r15] - add bl,dl - mov eax,DWORD[64+rsi] - add r9d,1236535329 - xor r12d,r8d - movzx ebx,bl - mov DWORD[60+rsi],edx - add r9d,r12d - add cl,al - rol r9d,22 - mov r12d,r10d - pinsrw xmm1,WORD[rbx*4+rdi],7 - - add r9d,r10d - psllq xmm1,8 - pxor xmm2,xmm0 - pxor xmm2,xmm1 - pxor xmm0,xmm0 - mov edx,DWORD[rcx*4+rdi] - xor r12d,r9d - mov DWORD[rcx*4+rdi],eax - and r12d,r11d - add r8d,DWORD[4+r15] - add al,dl - mov ebx,DWORD[68+rsi] - add r8d,4129170786 - xor r12d,r10d - movzx eax,al - mov DWORD[64+rsi],edx - add r8d,r12d - add cl,bl - rol r8d,5 - mov r12d,r9d - movd xmm0,DWORD[rax*4+rdi] - - add r8d,r9d - pxor xmm1,xmm1 - mov edx,DWORD[rcx*4+rdi] - xor r12d,r8d - mov DWORD[rcx*4+rdi],ebx - and r12d,r10d - add r11d,DWORD[24+r15] - add bl,dl - mov eax,DWORD[72+rsi] - add r11d,3225465664 - xor r12d,r9d - movzx ebx,bl - mov DWORD[68+rsi],edx - add r11d,r12d - add cl,al - rol r11d,9 - mov r12d,r8d - movd xmm1,DWORD[rbx*4+rdi] - - add r11d,r8d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r11d - mov DWORD[rcx*4+rdi],eax - and r12d,r9d - add r10d,DWORD[44+r15] - add al,dl - mov ebx,DWORD[76+rsi] - add r10d,643717713 - xor r12d,r8d - movzx eax,al - mov DWORD[72+rsi],edx - add r10d,r12d - add cl,bl - rol r10d,14 - mov r12d,r11d - pinsrw xmm0,WORD[rax*4+rdi],1 - - add r10d,r11d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r10d - mov DWORD[rcx*4+rdi],ebx - and r12d,r8d - add r9d,DWORD[r15] - add bl,dl - mov eax,DWORD[80+rsi] - add r9d,3921069994 - xor r12d,r11d - movzx ebx,bl - mov DWORD[76+rsi],edx - add r9d,r12d - add cl,al - rol r9d,20 - mov r12d,r10d - pinsrw xmm1,WORD[rbx*4+rdi],1 - - add r9d,r10d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r9d - mov DWORD[rcx*4+rdi],eax - and r12d,r11d - add r8d,DWORD[20+r15] - add al,dl - mov ebx,DWORD[84+rsi] - add r8d,3593408605 - xor r12d,r10d - movzx eax,al - mov DWORD[80+rsi],edx - add r8d,r12d - add cl,bl - rol r8d,5 - mov r12d,r9d - pinsrw xmm0,WORD[rax*4+rdi],2 - - add r8d,r9d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r8d - mov DWORD[rcx*4+rdi],ebx - and r12d,r10d - add r11d,DWORD[40+r15] - add bl,dl - mov eax,DWORD[88+rsi] - add r11d,38016083 - xor r12d,r9d - movzx ebx,bl - mov DWORD[84+rsi],edx - add r11d,r12d - add cl,al - rol r11d,9 - mov r12d,r8d - pinsrw xmm1,WORD[rbx*4+rdi],2 - - add r11d,r8d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r11d - mov DWORD[rcx*4+rdi],eax - and r12d,r9d - add r10d,DWORD[60+r15] - add al,dl - mov ebx,DWORD[92+rsi] - add r10d,3634488961 - xor r12d,r8d - movzx eax,al - mov DWORD[88+rsi],edx - add r10d,r12d - add cl,bl - rol r10d,14 - mov r12d,r11d - pinsrw xmm0,WORD[rax*4+rdi],3 - - add r10d,r11d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r10d - mov DWORD[rcx*4+rdi],ebx - and r12d,r8d - add r9d,DWORD[16+r15] - add bl,dl - mov 
eax,DWORD[96+rsi] - add r9d,3889429448 - xor r12d,r11d - movzx ebx,bl - mov DWORD[92+rsi],edx - add r9d,r12d - add cl,al - rol r9d,20 - mov r12d,r10d - pinsrw xmm1,WORD[rbx*4+rdi],3 - - add r9d,r10d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r9d - mov DWORD[rcx*4+rdi],eax - and r12d,r11d - add r8d,DWORD[36+r15] - add al,dl - mov ebx,DWORD[100+rsi] - add r8d,568446438 - xor r12d,r10d - movzx eax,al - mov DWORD[96+rsi],edx - add r8d,r12d - add cl,bl - rol r8d,5 - mov r12d,r9d - pinsrw xmm0,WORD[rax*4+rdi],4 - - add r8d,r9d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r8d - mov DWORD[rcx*4+rdi],ebx - and r12d,r10d - add r11d,DWORD[56+r15] - add bl,dl - mov eax,DWORD[104+rsi] - add r11d,3275163606 - xor r12d,r9d - movzx ebx,bl - mov DWORD[100+rsi],edx - add r11d,r12d - add cl,al - rol r11d,9 - mov r12d,r8d - pinsrw xmm1,WORD[rbx*4+rdi],4 - - add r11d,r8d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r11d - mov DWORD[rcx*4+rdi],eax - and r12d,r9d - add r10d,DWORD[12+r15] - add al,dl - mov ebx,DWORD[108+rsi] - add r10d,4107603335 - xor r12d,r8d - movzx eax,al - mov DWORD[104+rsi],edx - add r10d,r12d - add cl,bl - rol r10d,14 - mov r12d,r11d - pinsrw xmm0,WORD[rax*4+rdi],5 - - add r10d,r11d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r10d - mov DWORD[rcx*4+rdi],ebx - and r12d,r8d - add r9d,DWORD[32+r15] - add bl,dl - mov eax,DWORD[112+rsi] - add r9d,1163531501 - xor r12d,r11d - movzx ebx,bl - mov DWORD[108+rsi],edx - add r9d,r12d - add cl,al - rol r9d,20 - mov r12d,r10d - pinsrw xmm1,WORD[rbx*4+rdi],5 - - add r9d,r10d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r9d - mov DWORD[rcx*4+rdi],eax - and r12d,r11d - add r8d,DWORD[52+r15] - add al,dl - mov ebx,DWORD[116+rsi] - add r8d,2850285829 - xor r12d,r10d - movzx eax,al - mov DWORD[112+rsi],edx - add r8d,r12d - add cl,bl - rol r8d,5 - mov r12d,r9d - pinsrw xmm0,WORD[rax*4+rdi],6 - - add r8d,r9d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r8d - mov DWORD[rcx*4+rdi],ebx - and r12d,r10d - add r11d,DWORD[8+r15] - add bl,dl - mov eax,DWORD[120+rsi] - add r11d,4243563512 - xor r12d,r9d - movzx ebx,bl - mov DWORD[116+rsi],edx - add r11d,r12d - add cl,al - rol r11d,9 - mov r12d,r8d - pinsrw xmm1,WORD[rbx*4+rdi],6 - - add r11d,r8d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r11d - mov DWORD[rcx*4+rdi],eax - and r12d,r9d - add r10d,DWORD[28+r15] - add al,dl - mov ebx,DWORD[124+rsi] - add r10d,1735328473 - xor r12d,r8d - movzx eax,al - mov DWORD[120+rsi],edx - add r10d,r12d - add cl,bl - rol r10d,14 - mov r12d,r11d - pinsrw xmm0,WORD[rax*4+rdi],7 - - add r10d,r11d - movdqu xmm3,XMMWORD[16+r13] - add bpl,32 - mov edx,DWORD[rcx*4+rdi] - xor r12d,r10d - mov DWORD[rcx*4+rdi],ebx - and r12d,r8d - add r9d,DWORD[48+r15] - add bl,dl - mov eax,DWORD[rbp*4+rdi] - add r9d,2368359562 - xor r12d,r11d - movzx ebx,bl - mov DWORD[124+rsi],edx - add r9d,r12d - add cl,al - rol r9d,20 - mov r12d,r11d - pinsrw xmm1,WORD[rbx*4+rdi],7 - - add r9d,r10d - mov rsi,rcx - xor rcx,rcx - mov cl,sil - lea rsi,[rbp*4+rdi] - psllq xmm1,8 - pxor xmm3,xmm0 - pxor xmm3,xmm1 - pxor xmm0,xmm0 - mov edx,DWORD[rcx*4+rdi] - xor r12d,r10d - mov DWORD[rcx*4+rdi],eax - xor r12d,r9d - add r8d,DWORD[20+r15] - add al,dl - mov ebx,DWORD[4+rsi] - add r8d,4294588738 - movzx eax,al - add r8d,r12d - mov DWORD[rsi],edx - add cl,bl - rol r8d,4 - mov r12d,r10d - movd xmm0,DWORD[rax*4+rdi] - - add r8d,r9d - pxor xmm1,xmm1 - mov edx,DWORD[rcx*4+rdi] - xor r12d,r9d - mov DWORD[rcx*4+rdi],ebx - xor r12d,r8d - add r11d,DWORD[32+r15] - add bl,dl - mov eax,DWORD[8+rsi] - add r11d,2272392833 - movzx ebx,bl - add r11d,r12d - mov DWORD[4+rsi],edx - add cl,al - rol 
r11d,11 - mov r12d,r9d - movd xmm1,DWORD[rbx*4+rdi] - - add r11d,r8d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r8d - mov DWORD[rcx*4+rdi],eax - xor r12d,r11d - add r10d,DWORD[44+r15] - add al,dl - mov ebx,DWORD[12+rsi] - add r10d,1839030562 - movzx eax,al - add r10d,r12d - mov DWORD[8+rsi],edx - add cl,bl - rol r10d,16 - mov r12d,r8d - pinsrw xmm0,WORD[rax*4+rdi],1 - - add r10d,r11d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r11d - mov DWORD[rcx*4+rdi],ebx - xor r12d,r10d - add r9d,DWORD[56+r15] - add bl,dl - mov eax,DWORD[16+rsi] - add r9d,4259657740 - movzx ebx,bl - add r9d,r12d - mov DWORD[12+rsi],edx - add cl,al - rol r9d,23 - mov r12d,r11d - pinsrw xmm1,WORD[rbx*4+rdi],1 - - add r9d,r10d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r10d - mov DWORD[rcx*4+rdi],eax - xor r12d,r9d - add r8d,DWORD[4+r15] - add al,dl - mov ebx,DWORD[20+rsi] - add r8d,2763975236 - movzx eax,al - add r8d,r12d - mov DWORD[16+rsi],edx - add cl,bl - rol r8d,4 - mov r12d,r10d - pinsrw xmm0,WORD[rax*4+rdi],2 - - add r8d,r9d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r9d - mov DWORD[rcx*4+rdi],ebx - xor r12d,r8d - add r11d,DWORD[16+r15] - add bl,dl - mov eax,DWORD[24+rsi] - add r11d,1272893353 - movzx ebx,bl - add r11d,r12d - mov DWORD[20+rsi],edx - add cl,al - rol r11d,11 - mov r12d,r9d - pinsrw xmm1,WORD[rbx*4+rdi],2 - - add r11d,r8d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r8d - mov DWORD[rcx*4+rdi],eax - xor r12d,r11d - add r10d,DWORD[28+r15] - add al,dl - mov ebx,DWORD[28+rsi] - add r10d,4139469664 - movzx eax,al - add r10d,r12d - mov DWORD[24+rsi],edx - add cl,bl - rol r10d,16 - mov r12d,r8d - pinsrw xmm0,WORD[rax*4+rdi],3 - - add r10d,r11d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r11d - mov DWORD[rcx*4+rdi],ebx - xor r12d,r10d - add r9d,DWORD[40+r15] - add bl,dl - mov eax,DWORD[32+rsi] - add r9d,3200236656 - movzx ebx,bl - add r9d,r12d - mov DWORD[28+rsi],edx - add cl,al - rol r9d,23 - mov r12d,r11d - pinsrw xmm1,WORD[rbx*4+rdi],3 - - add r9d,r10d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r10d - mov DWORD[rcx*4+rdi],eax - xor r12d,r9d - add r8d,DWORD[52+r15] - add al,dl - mov ebx,DWORD[36+rsi] - add r8d,681279174 - movzx eax,al - add r8d,r12d - mov DWORD[32+rsi],edx - add cl,bl - rol r8d,4 - mov r12d,r10d - pinsrw xmm0,WORD[rax*4+rdi],4 - - add r8d,r9d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r9d - mov DWORD[rcx*4+rdi],ebx - xor r12d,r8d - add r11d,DWORD[r15] - add bl,dl - mov eax,DWORD[40+rsi] - add r11d,3936430074 - movzx ebx,bl - add r11d,r12d - mov DWORD[36+rsi],edx - add cl,al - rol r11d,11 - mov r12d,r9d - pinsrw xmm1,WORD[rbx*4+rdi],4 - - add r11d,r8d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r8d - mov DWORD[rcx*4+rdi],eax - xor r12d,r11d - add r10d,DWORD[12+r15] - add al,dl - mov ebx,DWORD[44+rsi] - add r10d,3572445317 - movzx eax,al - add r10d,r12d - mov DWORD[40+rsi],edx - add cl,bl - rol r10d,16 - mov r12d,r8d - pinsrw xmm0,WORD[rax*4+rdi],5 - - add r10d,r11d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r11d - mov DWORD[rcx*4+rdi],ebx - xor r12d,r10d - add r9d,DWORD[24+r15] - add bl,dl - mov eax,DWORD[48+rsi] - add r9d,76029189 - movzx ebx,bl - add r9d,r12d - mov DWORD[44+rsi],edx - add cl,al - rol r9d,23 - mov r12d,r11d - pinsrw xmm1,WORD[rbx*4+rdi],5 - - add r9d,r10d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r10d - mov DWORD[rcx*4+rdi],eax - xor r12d,r9d - add r8d,DWORD[36+r15] - add al,dl - mov ebx,DWORD[52+rsi] - add r8d,3654602809 - movzx eax,al - add r8d,r12d - mov DWORD[48+rsi],edx - add cl,bl - rol r8d,4 - mov r12d,r10d - pinsrw xmm0,WORD[rax*4+rdi],6 - - add r8d,r9d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r9d - mov DWORD[rcx*4+rdi],ebx - 
xor r12d,r8d - add r11d,DWORD[48+r15] - add bl,dl - mov eax,DWORD[56+rsi] - add r11d,3873151461 - movzx ebx,bl - add r11d,r12d - mov DWORD[52+rsi],edx - add cl,al - rol r11d,11 - mov r12d,r9d - pinsrw xmm1,WORD[rbx*4+rdi],6 - - add r11d,r8d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r8d - mov DWORD[rcx*4+rdi],eax - xor r12d,r11d - add r10d,DWORD[60+r15] - add al,dl - mov ebx,DWORD[60+rsi] - add r10d,530742520 - movzx eax,al - add r10d,r12d - mov DWORD[56+rsi],edx - add cl,bl - rol r10d,16 - mov r12d,r8d - pinsrw xmm0,WORD[rax*4+rdi],7 - - add r10d,r11d - movdqu xmm4,XMMWORD[32+r13] - mov edx,DWORD[rcx*4+rdi] - xor r12d,r11d - mov DWORD[rcx*4+rdi],ebx - xor r12d,r10d - add r9d,DWORD[8+r15] - add bl,dl - mov eax,DWORD[64+rsi] - add r9d,3299628645 - movzx ebx,bl - add r9d,r12d - mov DWORD[60+rsi],edx - add cl,al - rol r9d,23 - mov r12d,-1 - pinsrw xmm1,WORD[rbx*4+rdi],7 - - add r9d,r10d - psllq xmm1,8 - pxor xmm4,xmm0 - pxor xmm4,xmm1 - pxor xmm0,xmm0 - mov edx,DWORD[rcx*4+rdi] - xor r12d,r11d - mov DWORD[rcx*4+rdi],eax - or r12d,r9d - add r8d,DWORD[r15] - add al,dl - mov ebx,DWORD[68+rsi] - add r8d,4096336452 - movzx eax,al - xor r12d,r10d - mov DWORD[64+rsi],edx - add r8d,r12d - add cl,bl - rol r8d,6 - mov r12d,-1 - movd xmm0,DWORD[rax*4+rdi] - - add r8d,r9d - pxor xmm1,xmm1 - mov edx,DWORD[rcx*4+rdi] - xor r12d,r10d - mov DWORD[rcx*4+rdi],ebx - or r12d,r8d - add r11d,DWORD[28+r15] - add bl,dl - mov eax,DWORD[72+rsi] - add r11d,1126891415 - movzx ebx,bl - xor r12d,r9d - mov DWORD[68+rsi],edx - add r11d,r12d - add cl,al - rol r11d,10 - mov r12d,-1 - movd xmm1,DWORD[rbx*4+rdi] - - add r11d,r8d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r9d - mov DWORD[rcx*4+rdi],eax - or r12d,r11d - add r10d,DWORD[56+r15] - add al,dl - mov ebx,DWORD[76+rsi] - add r10d,2878612391 - movzx eax,al - xor r12d,r8d - mov DWORD[72+rsi],edx - add r10d,r12d - add cl,bl - rol r10d,15 - mov r12d,-1 - pinsrw xmm0,WORD[rax*4+rdi],1 - - add r10d,r11d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r8d - mov DWORD[rcx*4+rdi],ebx - or r12d,r10d - add r9d,DWORD[20+r15] - add bl,dl - mov eax,DWORD[80+rsi] - add r9d,4237533241 - movzx ebx,bl - xor r12d,r11d - mov DWORD[76+rsi],edx - add r9d,r12d - add cl,al - rol r9d,21 - mov r12d,-1 - pinsrw xmm1,WORD[rbx*4+rdi],1 - - add r9d,r10d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r11d - mov DWORD[rcx*4+rdi],eax - or r12d,r9d - add r8d,DWORD[48+r15] - add al,dl - mov ebx,DWORD[84+rsi] - add r8d,1700485571 - movzx eax,al - xor r12d,r10d - mov DWORD[80+rsi],edx - add r8d,r12d - add cl,bl - rol r8d,6 - mov r12d,-1 - pinsrw xmm0,WORD[rax*4+rdi],2 - - add r8d,r9d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r10d - mov DWORD[rcx*4+rdi],ebx - or r12d,r8d - add r11d,DWORD[12+r15] - add bl,dl - mov eax,DWORD[88+rsi] - add r11d,2399980690 - movzx ebx,bl - xor r12d,r9d - mov DWORD[84+rsi],edx - add r11d,r12d - add cl,al - rol r11d,10 - mov r12d,-1 - pinsrw xmm1,WORD[rbx*4+rdi],2 - - add r11d,r8d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r9d - mov DWORD[rcx*4+rdi],eax - or r12d,r11d - add r10d,DWORD[40+r15] - add al,dl - mov ebx,DWORD[92+rsi] - add r10d,4293915773 - movzx eax,al - xor r12d,r8d - mov DWORD[88+rsi],edx - add r10d,r12d - add cl,bl - rol r10d,15 - mov r12d,-1 - pinsrw xmm0,WORD[rax*4+rdi],3 - - add r10d,r11d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r8d - mov DWORD[rcx*4+rdi],ebx - or r12d,r10d - add r9d,DWORD[4+r15] - add bl,dl - mov eax,DWORD[96+rsi] - add r9d,2240044497 - movzx ebx,bl - xor r12d,r11d - mov DWORD[92+rsi],edx - add r9d,r12d - add cl,al - rol r9d,21 - mov r12d,-1 - pinsrw xmm1,WORD[rbx*4+rdi],3 - - add 
r9d,r10d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r11d - mov DWORD[rcx*4+rdi],eax - or r12d,r9d - add r8d,DWORD[32+r15] - add al,dl - mov ebx,DWORD[100+rsi] - add r8d,1873313359 - movzx eax,al - xor r12d,r10d - mov DWORD[96+rsi],edx - add r8d,r12d - add cl,bl - rol r8d,6 - mov r12d,-1 - pinsrw xmm0,WORD[rax*4+rdi],4 - - add r8d,r9d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r10d - mov DWORD[rcx*4+rdi],ebx - or r12d,r8d - add r11d,DWORD[60+r15] - add bl,dl - mov eax,DWORD[104+rsi] - add r11d,4264355552 - movzx ebx,bl - xor r12d,r9d - mov DWORD[100+rsi],edx - add r11d,r12d - add cl,al - rol r11d,10 - mov r12d,-1 - pinsrw xmm1,WORD[rbx*4+rdi],4 - - add r11d,r8d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r9d - mov DWORD[rcx*4+rdi],eax - or r12d,r11d - add r10d,DWORD[24+r15] - add al,dl - mov ebx,DWORD[108+rsi] - add r10d,2734768916 - movzx eax,al - xor r12d,r8d - mov DWORD[104+rsi],edx - add r10d,r12d - add cl,bl - rol r10d,15 - mov r12d,-1 - pinsrw xmm0,WORD[rax*4+rdi],5 - - add r10d,r11d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r8d - mov DWORD[rcx*4+rdi],ebx - or r12d,r10d - add r9d,DWORD[52+r15] - add bl,dl - mov eax,DWORD[112+rsi] - add r9d,1309151649 - movzx ebx,bl - xor r12d,r11d - mov DWORD[108+rsi],edx - add r9d,r12d - add cl,al - rol r9d,21 - mov r12d,-1 - pinsrw xmm1,WORD[rbx*4+rdi],5 - - add r9d,r10d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r11d - mov DWORD[rcx*4+rdi],eax - or r12d,r9d - add r8d,DWORD[16+r15] - add al,dl - mov ebx,DWORD[116+rsi] - add r8d,4149444226 - movzx eax,al - xor r12d,r10d - mov DWORD[112+rsi],edx - add r8d,r12d - add cl,bl - rol r8d,6 - mov r12d,-1 - pinsrw xmm0,WORD[rax*4+rdi],6 - - add r8d,r9d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r10d - mov DWORD[rcx*4+rdi],ebx - or r12d,r8d - add r11d,DWORD[44+r15] - add bl,dl - mov eax,DWORD[120+rsi] - add r11d,3174756917 - movzx ebx,bl - xor r12d,r9d - mov DWORD[116+rsi],edx - add r11d,r12d - add cl,al - rol r11d,10 - mov r12d,-1 - pinsrw xmm1,WORD[rbx*4+rdi],6 - - add r11d,r8d - mov edx,DWORD[rcx*4+rdi] - xor r12d,r9d - mov DWORD[rcx*4+rdi],eax - or r12d,r11d - add r10d,DWORD[8+r15] - add al,dl - mov ebx,DWORD[124+rsi] - add r10d,718787259 - movzx eax,al - xor r12d,r8d - mov DWORD[120+rsi],edx - add r10d,r12d - add cl,bl - rol r10d,15 - mov r12d,-1 - pinsrw xmm0,WORD[rax*4+rdi],7 - - add r10d,r11d - movdqu xmm5,XMMWORD[48+r13] - add bpl,32 - mov edx,DWORD[rcx*4+rdi] - xor r12d,r8d - mov DWORD[rcx*4+rdi],ebx - or r12d,r10d - add r9d,DWORD[36+r15] - add bl,dl - mov eax,DWORD[rbp*4+rdi] - add r9d,3951481745 - movzx ebx,bl - xor r12d,r11d - mov DWORD[124+rsi],edx - add r9d,r12d - add cl,al - rol r9d,21 - mov r12d,-1 - pinsrw xmm1,WORD[rbx*4+rdi],7 - - add r9d,r10d - mov rsi,rbp - xor rbp,rbp - mov bpl,sil - mov rsi,rcx - xor rcx,rcx - mov cl,sil - lea rsi,[rbp*4+rdi] - psllq xmm1,8 - pxor xmm5,xmm0 - pxor xmm5,xmm1 - add r8d,DWORD[rsp] - add r9d,DWORD[4+rsp] - add r10d,DWORD[8+rsp] - add r11d,DWORD[12+rsp] - - movdqu XMMWORD[r13*1+r14],xmm2 - movdqu XMMWORD[16+r13*1+r14],xmm3 - movdqu XMMWORD[32+r13*1+r14],xmm4 - movdqu XMMWORD[48+r13*1+r14],xmm5 - lea r15,[64+r15] - lea r13,[64+r13] - cmp r15,QWORD[16+rsp] - jb NEAR $L$oop - - mov r12,QWORD[24+rsp] - sub cl,al - mov DWORD[r12],r8d - mov DWORD[4+r12],r9d - mov DWORD[8+r12],r10d - mov DWORD[12+r12],r11d - sub bpl,1 - mov DWORD[((-8))+rdi],ebp - mov DWORD[((-4))+rdi],ecx - - mov r15,QWORD[40+rsp] - mov r14,QWORD[48+rsp] - mov r13,QWORD[56+rsp] - mov r12,QWORD[64+rsp] - mov rbp,QWORD[72+rsp] - mov rbx,QWORD[80+rsp] - lea rsp,[88+rsp] -$L$epilogue: -$L$abort: - mov rdi,QWORD[8+rsp] ;WIN64 
epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret -$L$SEH_end_rc4_md5_enc: -EXTERN __imp_RtlVirtualUnwind - -ALIGN 16 -se_handler: - push rsi - push rdi - push rbx - push rbp - push r12 - push r13 - push r14 - push r15 - pushfq - sub rsp,64 - - mov rax,QWORD[120+r8] - mov rbx,QWORD[248+r8] - - lea r10,[$L$body] - cmp rbx,r10 - jb NEAR $L$in_prologue - - mov rax,QWORD[152+r8] - - lea r10,[$L$epilogue] - cmp rbx,r10 - jae NEAR $L$in_prologue - - mov r15,QWORD[40+rax] - mov r14,QWORD[48+rax] - mov r13,QWORD[56+rax] - mov r12,QWORD[64+rax] - mov rbp,QWORD[72+rax] - mov rbx,QWORD[80+rax] - lea rax,[88+rax] - - mov QWORD[144+r8],rbx - mov QWORD[160+r8],rbp - mov QWORD[216+r8],r12 - mov QWORD[224+r8],r13 - mov QWORD[232+r8],r14 - mov QWORD[240+r8],r15 - -$L$in_prologue: - mov rdi,QWORD[8+rax] - mov rsi,QWORD[16+rax] - mov QWORD[152+r8],rax - mov QWORD[168+r8],rsi - mov QWORD[176+r8],rdi - - mov rdi,QWORD[40+r9] - mov rsi,r8 - mov ecx,154 - DD 0xa548f3fc - - mov rsi,r9 - xor rcx,rcx - mov rdx,QWORD[8+rsi] - mov r8,QWORD[rsi] - mov r9,QWORD[16+rsi] - mov r10,QWORD[40+rsi] - lea r11,[56+rsi] - lea r12,[24+rsi] - mov QWORD[32+rsp],r10 - mov QWORD[40+rsp],r11 - mov QWORD[48+rsp],r12 - mov QWORD[56+rsp],rcx - call QWORD[__imp_RtlVirtualUnwind] - - mov eax,1 - add rsp,64 - popfq - pop r15 - pop r14 - pop r13 - pop r12 - pop rbp - pop rbx - pop rdi - pop rsi - DB 0F3h,0C3h ;repret - - -section .pdata rdata align=4 -ALIGN 4 - DD $L$SEH_begin_rc4_md5_enc wrt ..imagebase - DD $L$SEH_end_rc4_md5_enc wrt ..imagebase - DD $L$SEH_info_rc4_md5_enc wrt ..imagebase - -section .xdata rdata align=8 -ALIGN 8 -$L$SEH_info_rc4_md5_enc: -DB 9,0,0,0 - DD se_handler wrt ..imagebase diff --git a/packager/third_party/boringssl/win-x86_64/crypto/rc4/rc4-x86_64.asm b/packager/third_party/boringssl/win-x86_64/crypto/rc4/rc4-x86_64.asm deleted file mode 100644 index c7c3b7b6c0..0000000000 --- a/packager/third_party/boringssl/win-x86_64/crypto/rc4/rc4-x86_64.asm +++ /dev/null @@ -1,741 +0,0 @@ -default rel -%define XMMWORD -%define YMMWORD -%define ZMMWORD -section .text code align=64 - -EXTERN OPENSSL_ia32cap_P - -global asm_RC4 - -ALIGN 16 -asm_RC4: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_asm_RC4: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - mov rcx,r9 - - - or rsi,rsi - jne NEAR $L$entry - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret -$L$entry: - push rbx - push r12 - push r13 -$L$prologue: - mov r11,rsi - mov r12,rdx - mov r13,rcx - xor r10,r10 - xor rcx,rcx - - lea rdi,[8+rdi] - mov r10b,BYTE[((-8))+rdi] - mov cl,BYTE[((-4))+rdi] - cmp DWORD[256+rdi],-1 - je NEAR $L$RC4_CHAR - mov r8d,DWORD[OPENSSL_ia32cap_P] - xor rbx,rbx - inc r10b - sub rbx,r10 - sub r13,r12 - mov eax,DWORD[r10*4+rdi] - test r11,-16 - jz NEAR $L$loop1 - bt r8d,30 - jc NEAR $L$intel - and rbx,7 - lea rsi,[1+r10] - jz NEAR $L$oop8 - sub r11,rbx -$L$oop8_warmup: - add cl,al - mov edx,DWORD[rcx*4+rdi] - mov DWORD[rcx*4+rdi],eax - mov DWORD[r10*4+rdi],edx - add al,dl - inc r10b - mov edx,DWORD[rax*4+rdi] - mov eax,DWORD[r10*4+rdi] - xor dl,BYTE[r12] - mov BYTE[r13*1+r12],dl - lea r12,[1+r12] - dec rbx - jnz NEAR $L$oop8_warmup - - lea rsi,[1+r10] - jmp NEAR $L$oop8 -ALIGN 16 -$L$oop8: - add cl,al - mov edx,DWORD[rcx*4+rdi] - mov DWORD[rcx*4+rdi],eax - mov ebx,DWORD[rsi*4+rdi] - ror r8,8 - mov DWORD[r10*4+rdi],edx - add dl,al - mov r8b,BYTE[rdx*4+rdi] - add cl,bl - mov edx,DWORD[rcx*4+rdi] - mov DWORD[rcx*4+rdi],ebx - mov eax,DWORD[4+rsi*4+rdi] - ror r8,8 - mov 
DWORD[4+r10*4+rdi],edx - add dl,bl - mov r8b,BYTE[rdx*4+rdi] - add cl,al - mov edx,DWORD[rcx*4+rdi] - mov DWORD[rcx*4+rdi],eax - mov ebx,DWORD[8+rsi*4+rdi] - ror r8,8 - mov DWORD[8+r10*4+rdi],edx - add dl,al - mov r8b,BYTE[rdx*4+rdi] - add cl,bl - mov edx,DWORD[rcx*4+rdi] - mov DWORD[rcx*4+rdi],ebx - mov eax,DWORD[12+rsi*4+rdi] - ror r8,8 - mov DWORD[12+r10*4+rdi],edx - add dl,bl - mov r8b,BYTE[rdx*4+rdi] - add cl,al - mov edx,DWORD[rcx*4+rdi] - mov DWORD[rcx*4+rdi],eax - mov ebx,DWORD[16+rsi*4+rdi] - ror r8,8 - mov DWORD[16+r10*4+rdi],edx - add dl,al - mov r8b,BYTE[rdx*4+rdi] - add cl,bl - mov edx,DWORD[rcx*4+rdi] - mov DWORD[rcx*4+rdi],ebx - mov eax,DWORD[20+rsi*4+rdi] - ror r8,8 - mov DWORD[20+r10*4+rdi],edx - add dl,bl - mov r8b,BYTE[rdx*4+rdi] - add cl,al - mov edx,DWORD[rcx*4+rdi] - mov DWORD[rcx*4+rdi],eax - mov ebx,DWORD[24+rsi*4+rdi] - ror r8,8 - mov DWORD[24+r10*4+rdi],edx - add dl,al - mov r8b,BYTE[rdx*4+rdi] - add sil,8 - add cl,bl - mov edx,DWORD[rcx*4+rdi] - mov DWORD[rcx*4+rdi],ebx - mov eax,DWORD[((-4))+rsi*4+rdi] - ror r8,8 - mov DWORD[28+r10*4+rdi],edx - add dl,bl - mov r8b,BYTE[rdx*4+rdi] - add r10b,8 - ror r8,8 - sub r11,8 - - xor r8,QWORD[r12] - mov QWORD[r13*1+r12],r8 - lea r12,[8+r12] - - test r11,-8 - jnz NEAR $L$oop8 - cmp r11,0 - jne NEAR $L$loop1 - jmp NEAR $L$exit - -ALIGN 16 -$L$intel: - test r11,-32 - jz NEAR $L$loop1 - and rbx,15 - jz NEAR $L$oop16_is_hot - sub r11,rbx -$L$oop16_warmup: - add cl,al - mov edx,DWORD[rcx*4+rdi] - mov DWORD[rcx*4+rdi],eax - mov DWORD[r10*4+rdi],edx - add al,dl - inc r10b - mov edx,DWORD[rax*4+rdi] - mov eax,DWORD[r10*4+rdi] - xor dl,BYTE[r12] - mov BYTE[r13*1+r12],dl - lea r12,[1+r12] - dec rbx - jnz NEAR $L$oop16_warmup - - mov rbx,rcx - xor rcx,rcx - mov cl,bl - -$L$oop16_is_hot: - lea rsi,[r10*4+rdi] - add cl,al - mov edx,DWORD[rcx*4+rdi] - pxor xmm0,xmm0 - mov DWORD[rcx*4+rdi],eax - add al,dl - mov ebx,DWORD[4+rsi] - movzx eax,al - mov DWORD[rsi],edx - add cl,bl - pinsrw xmm0,WORD[rax*4+rdi],0 - jmp NEAR $L$oop16_enter -ALIGN 16 -$L$oop16: - add cl,al - mov edx,DWORD[rcx*4+rdi] - pxor xmm2,xmm0 - psllq xmm1,8 - pxor xmm0,xmm0 - mov DWORD[rcx*4+rdi],eax - add al,dl - mov ebx,DWORD[4+rsi] - movzx eax,al - mov DWORD[rsi],edx - pxor xmm2,xmm1 - add cl,bl - pinsrw xmm0,WORD[rax*4+rdi],0 - movdqu XMMWORD[r13*1+r12],xmm2 - lea r12,[16+r12] -$L$oop16_enter: - mov edx,DWORD[rcx*4+rdi] - pxor xmm1,xmm1 - mov DWORD[rcx*4+rdi],ebx - add bl,dl - mov eax,DWORD[8+rsi] - movzx ebx,bl - mov DWORD[4+rsi],edx - add cl,al - pinsrw xmm1,WORD[rbx*4+rdi],0 - mov edx,DWORD[rcx*4+rdi] - mov DWORD[rcx*4+rdi],eax - add al,dl - mov ebx,DWORD[12+rsi] - movzx eax,al - mov DWORD[8+rsi],edx - add cl,bl - pinsrw xmm0,WORD[rax*4+rdi],1 - mov edx,DWORD[rcx*4+rdi] - mov DWORD[rcx*4+rdi],ebx - add bl,dl - mov eax,DWORD[16+rsi] - movzx ebx,bl - mov DWORD[12+rsi],edx - add cl,al - pinsrw xmm1,WORD[rbx*4+rdi],1 - mov edx,DWORD[rcx*4+rdi] - mov DWORD[rcx*4+rdi],eax - add al,dl - mov ebx,DWORD[20+rsi] - movzx eax,al - mov DWORD[16+rsi],edx - add cl,bl - pinsrw xmm0,WORD[rax*4+rdi],2 - mov edx,DWORD[rcx*4+rdi] - mov DWORD[rcx*4+rdi],ebx - add bl,dl - mov eax,DWORD[24+rsi] - movzx ebx,bl - mov DWORD[20+rsi],edx - add cl,al - pinsrw xmm1,WORD[rbx*4+rdi],2 - mov edx,DWORD[rcx*4+rdi] - mov DWORD[rcx*4+rdi],eax - add al,dl - mov ebx,DWORD[28+rsi] - movzx eax,al - mov DWORD[24+rsi],edx - add cl,bl - pinsrw xmm0,WORD[rax*4+rdi],3 - mov edx,DWORD[rcx*4+rdi] - mov DWORD[rcx*4+rdi],ebx - add bl,dl - mov eax,DWORD[32+rsi] - movzx ebx,bl - mov DWORD[28+rsi],edx - add cl,al - 
pinsrw xmm1,WORD[rbx*4+rdi],3 - mov edx,DWORD[rcx*4+rdi] - mov DWORD[rcx*4+rdi],eax - add al,dl - mov ebx,DWORD[36+rsi] - movzx eax,al - mov DWORD[32+rsi],edx - add cl,bl - pinsrw xmm0,WORD[rax*4+rdi],4 - mov edx,DWORD[rcx*4+rdi] - mov DWORD[rcx*4+rdi],ebx - add bl,dl - mov eax,DWORD[40+rsi] - movzx ebx,bl - mov DWORD[36+rsi],edx - add cl,al - pinsrw xmm1,WORD[rbx*4+rdi],4 - mov edx,DWORD[rcx*4+rdi] - mov DWORD[rcx*4+rdi],eax - add al,dl - mov ebx,DWORD[44+rsi] - movzx eax,al - mov DWORD[40+rsi],edx - add cl,bl - pinsrw xmm0,WORD[rax*4+rdi],5 - mov edx,DWORD[rcx*4+rdi] - mov DWORD[rcx*4+rdi],ebx - add bl,dl - mov eax,DWORD[48+rsi] - movzx ebx,bl - mov DWORD[44+rsi],edx - add cl,al - pinsrw xmm1,WORD[rbx*4+rdi],5 - mov edx,DWORD[rcx*4+rdi] - mov DWORD[rcx*4+rdi],eax - add al,dl - mov ebx,DWORD[52+rsi] - movzx eax,al - mov DWORD[48+rsi],edx - add cl,bl - pinsrw xmm0,WORD[rax*4+rdi],6 - mov edx,DWORD[rcx*4+rdi] - mov DWORD[rcx*4+rdi],ebx - add bl,dl - mov eax,DWORD[56+rsi] - movzx ebx,bl - mov DWORD[52+rsi],edx - add cl,al - pinsrw xmm1,WORD[rbx*4+rdi],6 - mov edx,DWORD[rcx*4+rdi] - mov DWORD[rcx*4+rdi],eax - add al,dl - mov ebx,DWORD[60+rsi] - movzx eax,al - mov DWORD[56+rsi],edx - add cl,bl - pinsrw xmm0,WORD[rax*4+rdi],7 - add r10b,16 - movdqu xmm2,XMMWORD[r12] - mov edx,DWORD[rcx*4+rdi] - mov DWORD[rcx*4+rdi],ebx - add bl,dl - movzx ebx,bl - mov DWORD[60+rsi],edx - lea rsi,[r10*4+rdi] - pinsrw xmm1,WORD[rbx*4+rdi],7 - mov eax,DWORD[rsi] - mov rbx,rcx - xor rcx,rcx - sub r11,16 - mov cl,bl - test r11,-16 - jnz NEAR $L$oop16 - - psllq xmm1,8 - pxor xmm2,xmm0 - pxor xmm2,xmm1 - movdqu XMMWORD[r13*1+r12],xmm2 - lea r12,[16+r12] - - cmp r11,0 - jne NEAR $L$loop1 - jmp NEAR $L$exit - -ALIGN 16 -$L$loop1: - add cl,al - mov edx,DWORD[rcx*4+rdi] - mov DWORD[rcx*4+rdi],eax - mov DWORD[r10*4+rdi],edx - add al,dl - inc r10b - mov edx,DWORD[rax*4+rdi] - mov eax,DWORD[r10*4+rdi] - xor dl,BYTE[r12] - mov BYTE[r13*1+r12],dl - lea r12,[1+r12] - dec r11 - jnz NEAR $L$loop1 - jmp NEAR $L$exit - -ALIGN 16 -$L$RC4_CHAR: - add r10b,1 - movzx eax,BYTE[r10*1+rdi] - test r11,-8 - jz NEAR $L$cloop1 - jmp NEAR $L$cloop8 -ALIGN 16 -$L$cloop8: - mov r8d,DWORD[r12] - mov r9d,DWORD[4+r12] - add cl,al - lea rsi,[1+r10] - movzx edx,BYTE[rcx*1+rdi] - movzx esi,sil - movzx ebx,BYTE[rsi*1+rdi] - mov BYTE[rcx*1+rdi],al - cmp rcx,rsi - mov BYTE[r10*1+rdi],dl - jne NEAR $L$cmov0 - mov rbx,rax -$L$cmov0: - add dl,al - xor r8b,BYTE[rdx*1+rdi] - ror r8d,8 - add cl,bl - lea r10,[1+rsi] - movzx edx,BYTE[rcx*1+rdi] - movzx r10d,r10b - movzx eax,BYTE[r10*1+rdi] - mov BYTE[rcx*1+rdi],bl - cmp rcx,r10 - mov BYTE[rsi*1+rdi],dl - jne NEAR $L$cmov1 - mov rax,rbx -$L$cmov1: - add dl,bl - xor r8b,BYTE[rdx*1+rdi] - ror r8d,8 - add cl,al - lea rsi,[1+r10] - movzx edx,BYTE[rcx*1+rdi] - movzx esi,sil - movzx ebx,BYTE[rsi*1+rdi] - mov BYTE[rcx*1+rdi],al - cmp rcx,rsi - mov BYTE[r10*1+rdi],dl - jne NEAR $L$cmov2 - mov rbx,rax -$L$cmov2: - add dl,al - xor r8b,BYTE[rdx*1+rdi] - ror r8d,8 - add cl,bl - lea r10,[1+rsi] - movzx edx,BYTE[rcx*1+rdi] - movzx r10d,r10b - movzx eax,BYTE[r10*1+rdi] - mov BYTE[rcx*1+rdi],bl - cmp rcx,r10 - mov BYTE[rsi*1+rdi],dl - jne NEAR $L$cmov3 - mov rax,rbx -$L$cmov3: - add dl,bl - xor r8b,BYTE[rdx*1+rdi] - ror r8d,8 - add cl,al - lea rsi,[1+r10] - movzx edx,BYTE[rcx*1+rdi] - movzx esi,sil - movzx ebx,BYTE[rsi*1+rdi] - mov BYTE[rcx*1+rdi],al - cmp rcx,rsi - mov BYTE[r10*1+rdi],dl - jne NEAR $L$cmov4 - mov rbx,rax -$L$cmov4: - add dl,al - xor r9b,BYTE[rdx*1+rdi] - ror r9d,8 - add cl,bl - lea r10,[1+rsi] - movzx 
edx,BYTE[rcx*1+rdi] - movzx r10d,r10b - movzx eax,BYTE[r10*1+rdi] - mov BYTE[rcx*1+rdi],bl - cmp rcx,r10 - mov BYTE[rsi*1+rdi],dl - jne NEAR $L$cmov5 - mov rax,rbx -$L$cmov5: - add dl,bl - xor r9b,BYTE[rdx*1+rdi] - ror r9d,8 - add cl,al - lea rsi,[1+r10] - movzx edx,BYTE[rcx*1+rdi] - movzx esi,sil - movzx ebx,BYTE[rsi*1+rdi] - mov BYTE[rcx*1+rdi],al - cmp rcx,rsi - mov BYTE[r10*1+rdi],dl - jne NEAR $L$cmov6 - mov rbx,rax -$L$cmov6: - add dl,al - xor r9b,BYTE[rdx*1+rdi] - ror r9d,8 - add cl,bl - lea r10,[1+rsi] - movzx edx,BYTE[rcx*1+rdi] - movzx r10d,r10b - movzx eax,BYTE[r10*1+rdi] - mov BYTE[rcx*1+rdi],bl - cmp rcx,r10 - mov BYTE[rsi*1+rdi],dl - jne NEAR $L$cmov7 - mov rax,rbx -$L$cmov7: - add dl,bl - xor r9b,BYTE[rdx*1+rdi] - ror r9d,8 - lea r11,[((-8))+r11] - mov DWORD[r13],r8d - lea r12,[8+r12] - mov DWORD[4+r13],r9d - lea r13,[8+r13] - - test r11,-8 - jnz NEAR $L$cloop8 - cmp r11,0 - jne NEAR $L$cloop1 - jmp NEAR $L$exit -ALIGN 16 -$L$cloop1: - add cl,al - movzx ecx,cl - movzx edx,BYTE[rcx*1+rdi] - mov BYTE[rcx*1+rdi],al - mov BYTE[r10*1+rdi],dl - add dl,al - add r10b,1 - movzx edx,dl - movzx r10d,r10b - movzx edx,BYTE[rdx*1+rdi] - movzx eax,BYTE[r10*1+rdi] - xor dl,BYTE[r12] - lea r12,[1+r12] - mov BYTE[r13],dl - lea r13,[1+r13] - sub r11,1 - jnz NEAR $L$cloop1 - jmp NEAR $L$exit - -ALIGN 16 -$L$exit: - sub r10b,1 - mov DWORD[((-8))+rdi],r10d - mov DWORD[((-4))+rdi],ecx - - mov r13,QWORD[rsp] - mov r12,QWORD[8+rsp] - mov rbx,QWORD[16+rsp] - add rsp,24 -$L$epilogue: - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret -$L$SEH_end_asm_RC4: -global asm_RC4_set_key - -ALIGN 16 -asm_RC4_set_key: - mov QWORD[8+rsp],rdi ;WIN64 prologue - mov QWORD[16+rsp],rsi - mov rax,rsp -$L$SEH_begin_asm_RC4_set_key: - mov rdi,rcx - mov rsi,rdx - mov rdx,r8 - - - lea rdi,[8+rdi] - lea rdx,[rsi*1+rdx] - neg rsi - mov rcx,rsi - xor eax,eax - xor r9,r9 - xor r10,r10 - xor r11,r11 - - mov r8d,DWORD[OPENSSL_ia32cap_P] - bt r8d,20 - jc NEAR $L$c1stloop - jmp NEAR $L$w1stloop - -ALIGN 16 -$L$w1stloop: - mov DWORD[rax*4+rdi],eax - add al,1 - jnc NEAR $L$w1stloop - - xor r9,r9 - xor r8,r8 -ALIGN 16 -$L$w2ndloop: - mov r10d,DWORD[r9*4+rdi] - add r8b,BYTE[rsi*1+rdx] - add r8b,r10b - add rsi,1 - mov r11d,DWORD[r8*4+rdi] - cmovz rsi,rcx - mov DWORD[r8*4+rdi],r10d - mov DWORD[r9*4+rdi],r11d - add r9b,1 - jnc NEAR $L$w2ndloop - jmp NEAR $L$exit_key - -ALIGN 16 -$L$c1stloop: - mov BYTE[rax*1+rdi],al - add al,1 - jnc NEAR $L$c1stloop - - xor r9,r9 - xor r8,r8 -ALIGN 16 -$L$c2ndloop: - mov r10b,BYTE[r9*1+rdi] - add r8b,BYTE[rsi*1+rdx] - add r8b,r10b - add rsi,1 - mov r11b,BYTE[r8*1+rdi] - jnz NEAR $L$cnowrap - mov rsi,rcx -$L$cnowrap: - mov BYTE[r8*1+rdi],r10b - mov BYTE[r9*1+rdi],r11b - add r9b,1 - jnc NEAR $L$c2ndloop - mov DWORD[256+rdi],-1 - -ALIGN 16 -$L$exit_key: - xor eax,eax - mov DWORD[((-8))+rdi],eax - mov DWORD[((-4))+rdi],eax - mov rdi,QWORD[8+rsp] ;WIN64 epilogue - mov rsi,QWORD[16+rsp] - DB 0F3h,0C3h ;repret -$L$SEH_end_asm_RC4_set_key: -EXTERN __imp_RtlVirtualUnwind - -ALIGN 16 -stream_se_handler: - push rsi - push rdi - push rbx - push rbp - push r12 - push r13 - push r14 - push r15 - pushfq - sub rsp,64 - - mov rax,QWORD[120+r8] - mov rbx,QWORD[248+r8] - - lea r10,[$L$prologue] - cmp rbx,r10 - jb NEAR $L$in_prologue - - mov rax,QWORD[152+r8] - - lea r10,[$L$epilogue] - cmp rbx,r10 - jae NEAR $L$in_prologue - - lea rax,[24+rax] - - mov rbx,QWORD[((-8))+rax] - mov r12,QWORD[((-16))+rax] - mov r13,QWORD[((-24))+rax] - mov QWORD[144+r8],rbx - mov QWORD[216+r8],r12 - mov 
QWORD[224+r8],r13 - -$L$in_prologue: - mov rdi,QWORD[8+rax] - mov rsi,QWORD[16+rax] - mov QWORD[152+r8],rax - mov QWORD[168+r8],rsi - mov QWORD[176+r8],rdi - - jmp NEAR $L$common_seh_exit - - - -ALIGN 16 -key_se_handler: - push rsi - push rdi - push rbx - push rbp - push r12 - push r13 - push r14 - push r15 - pushfq - sub rsp,64 - - mov rax,QWORD[152+r8] - mov rdi,QWORD[8+rax] - mov rsi,QWORD[16+rax] - mov QWORD[168+r8],rsi - mov QWORD[176+r8],rdi - -$L$common_seh_exit: - - mov rdi,QWORD[40+r9] - mov rsi,r8 - mov ecx,154 - DD 0xa548f3fc - - mov rsi,r9 - xor rcx,rcx - mov rdx,QWORD[8+rsi] - mov r8,QWORD[rsi] - mov r9,QWORD[16+rsi] - mov r10,QWORD[40+rsi] - lea r11,[56+rsi] - lea r12,[24+rsi] - mov QWORD[32+rsp],r10 - mov QWORD[40+rsp],r11 - mov QWORD[48+rsp],r12 - mov QWORD[56+rsp],rcx - call QWORD[__imp_RtlVirtualUnwind] - - mov eax,1 - add rsp,64 - popfq - pop r15 - pop r14 - pop r13 - pop r12 - pop rbp - pop rbx - pop rdi - pop rsi - DB 0F3h,0C3h ;repret - - -section .pdata rdata align=4 -ALIGN 4 - DD $L$SEH_begin_asm_RC4 wrt ..imagebase - DD $L$SEH_end_asm_RC4 wrt ..imagebase - DD $L$SEH_info_asm_RC4 wrt ..imagebase - - DD $L$SEH_begin_asm_RC4_set_key wrt ..imagebase - DD $L$SEH_end_asm_RC4_set_key wrt ..imagebase - DD $L$SEH_info_asm_RC4_set_key wrt ..imagebase - -section .xdata rdata align=8 -ALIGN 8 -$L$SEH_info_asm_RC4: -DB 9,0,0,0 - DD stream_se_handler wrt ..imagebase -$L$SEH_info_asm_RC4_set_key: -DB 9,0,0,0 - DD key_se_handler wrt ..imagebase
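; Note on the recurring "+" hunks above ($L$mul_page_walk, $L$mul4x_page_walk,
; $L$pwr_page_walk, $L$from_page_walk, $L$sqr8x_page_walk): instead of dropping
; rsp to the newly computed frame bottom in one step, the patched prologues walk
; rsp down one 4 KiB page at a time and touch each page, so the thread's stack
; guard page is always hit and the stack is grown safely (these are the
; win-x86_64 files, where that matters). A minimal sketch of the pattern follows,
; assuming rbp already holds the 64-byte-aligned frame bottom computed earlier;
; the labels page_walk / page_walk_done are illustrative names, not labels taken
; from the patch.
	mov	r11,rsp
	sub	r11,rbp
	and	r11,-4096		; whole 4 KiB pages between old rsp and the new bottom
	lea	rsp,[rbp*1+r11]		; first probe point, at most one page below old rsp
	mov	r10,QWORD[rsp]		; touch the page (a read is enough to trip the guard page)
	cmp	rsp,rbp
	ja	NEAR page_walk
	jmp	NEAR page_walk_done
page_walk:
	lea	rsp,[((-4096))+rsp]	; step down one page
	mov	r10,QWORD[rsp]		; touch it
	cmp	rsp,rbp
	ja	NEAR page_walk
page_walk_done: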